author    Daniel Baumann <daniel.baumann@progress-linux.org>    2024-04-21 11:54:28 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>    2024-04-21 11:54:28 +0000
commit    e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree      64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/module
parent    Initial commit. (diff)
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/module')
-rw-r--r--  src/spdk/module/Makefile | 55
-rw-r--r--  src/spdk/module/accel/Makefile | 46
-rw-r--r--  src/spdk/module/accel/idxd/Makefile | 45
-rw-r--r--  src/spdk/module/accel/idxd/accel_engine_idxd.c | 847
-rw-r--r--  src/spdk/module/accel/idxd/accel_engine_idxd.h | 43
-rw-r--r--  src/spdk/module/accel/idxd/accel_engine_idxd_rpc.c | 75
-rw-r--r--  src/spdk/module/accel/ioat/Makefile | 45
-rw-r--r--  src/spdk/module/accel/ioat/accel_engine_ioat.c | 764
-rw-r--r--  src/spdk/module/accel/ioat/accel_engine_ioat.h | 44
-rw-r--r--  src/spdk/module/accel/ioat/accel_engine_ioat_rpc.c | 116
-rw-r--r--  src/spdk/module/bdev/Makefile | 61
-rw-r--r--  src/spdk/module/bdev/aio/Makefile | 46
-rw-r--r--  src/spdk/module/bdev/aio/bdev_aio.c | 827
-rw-r--r--  src/spdk/module/bdev/aio/bdev_aio.h | 46
-rw-r--r--  src/spdk/module/bdev/aio/bdev_aio_rpc.c | 148
-rw-r--r--  src/spdk/module/bdev/compress/Makefile | 48
-rw-r--r--  src/spdk/module/bdev/compress/vbdev_compress.c | 1865
-rw-r--r--  src/spdk/module/bdev/compress/vbdev_compress.h | 106
-rw-r--r--  src/spdk/module/bdev/compress/vbdev_compress_rpc.c | 252
-rw-r--r--  src/spdk/module/bdev/crypto/Makefile | 47
-rw-r--r--  src/spdk/module/bdev/crypto/vbdev_crypto.c | 2040
-rw-r--r--  src/spdk/module/bdev/crypto/vbdev_crypto.h | 78
-rw-r--r--  src/spdk/module/bdev/crypto/vbdev_crypto_rpc.c | 195
-rw-r--r--  src/spdk/module/bdev/delay/Makefile | 47
-rw-r--r--  src/spdk/module/bdev/delay/vbdev_delay.c | 851
-rw-r--r--  src/spdk/module/bdev/delay/vbdev_delay.h | 85
-rw-r--r--  src/spdk/module/bdev/delay/vbdev_delay_rpc.c | 225
-rw-r--r--  src/spdk/module/bdev/error/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/error/vbdev_error.c | 508
-rw-r--r--  src/spdk/module/bdev/error/vbdev_error.h | 76
-rw-r--r--  src/spdk/module/bdev/error/vbdev_error_rpc.c | 245
-rw-r--r--  src/spdk/module/bdev/ftl/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/ftl/bdev_ftl.c | 517
-rw-r--r--  src/spdk/module/bdev/ftl/bdev_ftl.h | 70
-rw-r--r--  src/spdk/module/bdev/ftl/bdev_ftl_rpc.c | 258
-rw-r--r--  src/spdk/module/bdev/gpt/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/gpt/gpt.c | 320
-rw-r--r--  src/spdk/module/bdev/gpt/gpt.h | 70
-rw-r--r--  src/spdk/module/bdev/gpt/vbdev_gpt.c | 565
-rw-r--r--  src/spdk/module/bdev/iscsi/Makefile | 51
-rw-r--r--  src/spdk/module/bdev/iscsi/bdev_iscsi.c | 936
-rw-r--r--  src/spdk/module/bdev/iscsi/bdev_iscsi.h | 75
-rw-r--r--  src/spdk/module/bdev/iscsi/bdev_iscsi_rpc.c | 158
-rw-r--r--  src/spdk/module/bdev/lvol/Makefile | 46
-rw-r--r--  src/spdk/module/bdev/lvol/vbdev_lvol.c | 1354
-rw-r--r--  src/spdk/module/bdev/lvol/vbdev_lvol.h | 130
-rw-r--r--  src/spdk/module/bdev/lvol/vbdev_lvol_rpc.c | 1098
-rw-r--r--  src/spdk/module/bdev/malloc/Makefile | 46
-rw-r--r--  src/spdk/module/bdev/malloc/bdev_malloc.c | 532
-rw-r--r--  src/spdk/module/bdev/malloc/bdev_malloc.h | 48
-rw-r--r--  src/spdk/module/bdev/malloc/bdev_malloc_rpc.c | 173
-rw-r--r--  src/spdk/module/bdev/null/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/null/bdev_null.c | 550
-rw-r--r--  src/spdk/module/bdev/null/bdev_null.h | 67
-rw-r--r--  src/spdk/module/bdev/null/bdev_null_rpc.c | 204
-rw-r--r--  src/spdk/module/bdev/nvme/Makefile | 50
-rw-r--r--  src/spdk/module/bdev/nvme/bdev_nvme.c | 2924
-rw-r--r--  src/spdk/module/bdev/nvme/bdev_nvme.h | 90
-rw-r--r--  src/spdk/module/bdev/nvme/bdev_nvme_cuse_rpc.c | 152
-rw-r--r--  src/spdk/module/bdev/nvme/bdev_nvme_rpc.c | 842
-rw-r--r--  src/spdk/module/bdev/nvme/bdev_ocssd.c | 1498
-rw-r--r--  src/spdk/module/bdev/nvme/bdev_ocssd.h | 67
-rw-r--r--  src/spdk/module/bdev/nvme/bdev_ocssd_rpc.c | 197
-rw-r--r--  src/spdk/module/bdev/nvme/common.c | 204
-rw-r--r--  src/spdk/module/bdev/nvme/common.h | 163
-rw-r--r--  src/spdk/module/bdev/nvme/nvme_rpc.c | 492
-rw-r--r--  src/spdk/module/bdev/nvme/vbdev_opal.c | 630
-rw-r--r--  src/spdk/module/bdev/nvme/vbdev_opal.h | 54
-rw-r--r--  src/spdk/module/bdev/nvme/vbdev_opal_rpc.c | 453
-rw-r--r--  src/spdk/module/bdev/ocf/Makefile | 52
-rw-r--r--  src/spdk/module/bdev/ocf/ctx.c | 565
-rw-r--r--  src/spdk/module/bdev/ocf/ctx.h | 65
-rw-r--r--  src/spdk/module/bdev/ocf/data.c | 122
-rw-r--r--  src/spdk/module/bdev/ocf/data.h | 57
-rw-r--r--  src/spdk/module/bdev/ocf/stats.c | 109
-rw-r--r--  src/spdk/module/bdev/ocf/stats.h | 51
-rw-r--r--  src/spdk/module/bdev/ocf/utils.c | 136
-rw-r--r--  src/spdk/module/bdev/ocf/utils.h | 67
-rw-r--r--  src/spdk/module/bdev/ocf/vbdev_ocf.c | 1775
-rw-r--r--  src/spdk/module/bdev/ocf/vbdev_ocf.h | 210
-rw-r--r--  src/spdk/module/bdev/ocf/vbdev_ocf_rpc.c | 362
-rw-r--r--  src/spdk/module/bdev/ocf/volume.c | 441
-rw-r--r--  src/spdk/module/bdev/ocf/volume.h | 63
-rw-r--r--  src/spdk/module/bdev/passthru/Makefile | 47
-rw-r--r--  src/spdk/module/bdev/passthru/vbdev_passthru.c | 809
-rw-r--r--  src/spdk/module/bdev/passthru/vbdev_passthru.h | 61
-rw-r--r--  src/spdk/module/bdev/passthru/vbdev_passthru_rpc.c | 148
-rw-r--r--  src/spdk/module/bdev/pmem/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/pmem/bdev_pmem.c | 473
-rw-r--r--  src/spdk/module/bdev/pmem/bdev_pmem.h | 64
-rw-r--r--  src/spdk/module/bdev/pmem/bdev_pmem_rpc.c | 337
-rw-r--r--  src/spdk/module/bdev/raid/Makefile | 51
-rw-r--r--  src/spdk/module/bdev/raid/bdev_raid.c | 1719
-rw-r--r--  src/spdk/module/bdev/raid/bdev_raid.h | 319
-rw-r--r--  src/spdk/module/bdev/raid/bdev_raid_rpc.c | 452
-rw-r--r--  src/spdk/module/bdev/raid/raid0.c | 398
-rw-r--r--  src/spdk/module/bdev/raid/raid5.c | 114
-rw-r--r--  src/spdk/module/bdev/rbd/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/rbd/bdev_rbd.c | 898
-rw-r--r--  src/spdk/module/bdev/rbd/bdev_rbd.h | 68
-rw-r--r--  src/spdk/module/bdev/rbd/bdev_rbd_rpc.c | 252
-rw-r--r--  src/spdk/module/bdev/rpc/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/rpc/bdev_rpc.c | 676
-rw-r--r--  src/spdk/module/bdev/split/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/split/vbdev_split.c | 582
-rw-r--r--  src/spdk/module/bdev/split/vbdev_split.h | 68
-rw-r--r--  src/spdk/module/bdev/split/vbdev_split_rpc.c | 145
-rw-r--r--  src/spdk/module/bdev/uring/Makefile | 51
-rw-r--r--  src/spdk/module/bdev/uring/bdev_uring.c | 676
-rw-r--r--  src/spdk/module/bdev/uring/bdev_uring.h | 50
-rw-r--r--  src/spdk/module/bdev/uring/bdev_uring_rpc.c | 150
-rw-r--r--  src/spdk/module/bdev/virtio/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/virtio/bdev_virtio.h | 164
-rw-r--r--  src/spdk/module/bdev/virtio/bdev_virtio_blk.c | 756
-rw-r--r--  src/spdk/module/bdev/virtio/bdev_virtio_rpc.c | 264
-rw-r--r--  src/spdk/module/bdev/virtio/bdev_virtio_scsi.c | 2036
-rw-r--r--  src/spdk/module/bdev/zone_block/Makefile | 45
-rw-r--r--  src/spdk/module/bdev/zone_block/vbdev_zone_block.c | 916
-rw-r--r--  src/spdk/module/bdev/zone_block/vbdev_zone_block.h | 47
-rw-r--r--  src/spdk/module/bdev/zone_block/vbdev_zone_block_rpc.c | 146
-rw-r--r--  src/spdk/module/blob/Makefile | 44
-rw-r--r--  src/spdk/module/blob/bdev/Makefile | 45
-rw-r--r--  src/spdk/module/blob/bdev/blob_bdev.c | 390
-rw-r--r--  src/spdk/module/blob/bdev/spdk_blob_bdev.map | 10
-rw-r--r--  src/spdk/module/blobfs/Makefile | 44
-rw-r--r--  src/spdk/module/blobfs/bdev/Makefile | 51
-rw-r--r--  src/spdk/module/blobfs/bdev/blobfs_bdev.c | 361
-rw-r--r--  src/spdk/module/blobfs/bdev/blobfs_bdev_rpc.c | 344
-rw-r--r--  src/spdk/module/blobfs/bdev/blobfs_fuse.c | 358
-rw-r--r--  src/spdk/module/blobfs/bdev/blobfs_fuse.h | 52
-rw-r--r--  src/spdk/module/blobfs/bdev/spdk_blobfs_bdev.map | 8
-rw-r--r--  src/spdk/module/env_dpdk/Makefile | 45
-rw-r--r--  src/spdk/module/env_dpdk/env_dpdk_rpc.c | 68
-rw-r--r--  src/spdk/module/event/Makefile | 44
-rw-r--r--  src/spdk/module/event/rpc/Makefile | 45
-rw-r--r--  src/spdk/module/event/rpc/app_rpc.c | 543
-rw-r--r--  src/spdk/module/event/rpc/subsystem_rpc.c | 118
-rw-r--r--  src/spdk/module/event/subsystems/Makefile | 61
-rw-r--r--  src/spdk/module/event/subsystems/accel/Makefile | 45
-rw-r--r--  src/spdk/module/event/subsystems/accel/accel.c | 71
-rw-r--r--  src/spdk/module/event/subsystems/bdev/Makefile | 45
-rw-r--r--  src/spdk/module/event/subsystems/bdev/bdev.c | 84
-rw-r--r--  src/spdk/module/event/subsystems/iscsi/Makefile | 46
-rw-r--r--  src/spdk/module/event/subsystems/iscsi/iscsi.c | 80
-rw-r--r--  src/spdk/module/event/subsystems/nbd/Makefile | 45
-rw-r--r--  src/spdk/module/event/subsystems/nbd/nbd.c | 72
-rw-r--r--  src/spdk/module/event/subsystems/net/Makefile | 45
-rw-r--r--  src/spdk/module/event/subsystems/net/net.c | 98
-rw-r--r--  src/spdk/module/event/subsystems/nvmf/Makefile | 45
-rw-r--r--  src/spdk/module/event/subsystems/nvmf/conf.c | 709
-rw-r--r--  src/spdk/module/event/subsystems/nvmf/event_nvmf.h | 67
-rw-r--r--  src/spdk/module/event/subsystems/nvmf/nvmf_rpc.c | 153
-rw-r--r--  src/spdk/module/event/subsystems/nvmf/nvmf_tgt.c | 476
-rw-r--r--  src/spdk/module/event/subsystems/scsi/Makefile | 45
-rw-r--r--  src/spdk/module/event/subsystems/scsi/scsi.c | 65
-rw-r--r--  src/spdk/module/event/subsystems/sock/Makefile | 44
-rw-r--r--  src/spdk/module/event/subsystems/sock/sock.c | 62
-rw-r--r--  src/spdk/module/event/subsystems/vhost/Makefile | 45
-rw-r--r--  src/spdk/module/event/subsystems/vhost/vhost.c | 73
-rw-r--r--  src/spdk/module/event/subsystems/vmd/Makefile | 45
-rw-r--r--  src/spdk/module/event/subsystems/vmd/event_vmd.h | 39
-rw-r--r--  src/spdk/module/event/subsystems/vmd/vmd.c | 132
-rw-r--r--  src/spdk/module/event/subsystems/vmd/vmd_rpc.c | 55
-rw-r--r--  src/spdk/module/sock/Makefile | 48
-rw-r--r--  src/spdk/module/sock/posix/Makefile | 45
-rw-r--r--  src/spdk/module/sock/posix/posix.c | 1405
-rw-r--r--  src/spdk/module/sock/uring/Makefile | 45
-rw-r--r--  src/spdk/module/sock/uring/uring.c | 1328
-rw-r--r--  src/spdk/module/sock/vpp/Makefile | 55
-rw-r--r--  src/spdk/module/sock/vpp/vpp.c | 1633
170 files changed, 52298 insertions, 0 deletions
diff --git a/src/spdk/module/Makefile b/src/spdk/module/Makefile
new file mode 100644
index 000000000..5662634b7
--- /dev/null
+++ b/src/spdk/module/Makefile
@@ -0,0 +1,55 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y = bdev blob blobfs accel event sock
+ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV))
+DIRS-y += env_dpdk
+endif
+
+DEPDIRS-blob :=
+DEPDIRS-accel :=
+DEPDIRS-env_dpdk :=
+DEPDIRS-sock :=
+DEPDIRS-bdev := blob
+DEPDIRS-blobfs := blob
+DEPDIRS-event := bdev blob
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/accel/Makefile b/src/spdk/module/accel/Makefile
new file mode 100644
index 000000000..bafa7b9ca
--- /dev/null
+++ b/src/spdk/module/accel/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y = ioat
+
+DIRS-$(CONFIG_IDXD) += idxd
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/accel/idxd/Makefile b/src/spdk/module/accel/idxd/Makefile
new file mode 100644
index 000000000..f2540f900
--- /dev/null
+++ b/src/spdk/module/accel/idxd/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 1
+SO_MINOR := 0
+
+LIBNAME = accel_idxd
+C_SRCS = accel_engine_idxd.c accel_engine_idxd_rpc.c
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/accel/idxd/accel_engine_idxd.c b/src/spdk/module/accel/idxd/accel_engine_idxd.c
new file mode 100644
index 000000000..e5af0181f
--- /dev/null
+++ b/src/spdk/module/accel/idxd/accel_engine_idxd.c
@@ -0,0 +1,847 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accel_engine_idxd.h"
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/accel_engine.h"
+#include "spdk_internal/log.h"
+#include "spdk_internal/idxd.h"
+
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/event.h"
+#include "spdk/thread.h"
+#include "spdk/idxd.h"
+#include "spdk/util.h"
+#include "spdk/json.h"
+
+#define ALIGN_4K 0x1000
+
+static bool g_idxd_enable = false;
+uint32_t g_config_number;
+
+enum channel_state {
+ IDXD_CHANNEL_ACTIVE,
+ IDXD_CHANNEL_PAUSED,
+ IDXD_CHANNEL_ERROR,
+};
+
+static bool g_idxd_initialized = false;
+
+struct pci_device {
+ struct spdk_pci_device *pci_dev;
+ TAILQ_ENTRY(pci_device) tailq;
+};
+static TAILQ_HEAD(, pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
+
+struct idxd_device {
+ struct spdk_idxd_device *idxd;
+ int num_channels;
+ TAILQ_ENTRY(idxd_device) tailq;
+};
+static TAILQ_HEAD(, idxd_device) g_idxd_devices = TAILQ_HEAD_INITIALIZER(g_idxd_devices);
+static struct idxd_device *g_next_dev = NULL;
+
+struct idxd_op {
+ struct spdk_idxd_io_channel *chan;
+ void *cb_arg;
+ spdk_idxd_req_cb cb_fn;
+ void *src;
+ union {
+ void *dst;
+ void *src2;
+ };
+ void *dst2;
+ uint32_t seed;
+ uint64_t fill_pattern;
+ uint32_t op_code;
+ uint64_t nbytes;
+ struct idxd_batch *batch;
+ TAILQ_ENTRY(idxd_op) link;
+};
+
+struct idxd_io_channel {
+ struct spdk_idxd_io_channel *chan;
+ struct spdk_idxd_device *idxd;
+ struct idxd_device *dev;
+ enum channel_state state;
+ struct spdk_poller *poller;
+ TAILQ_HEAD(, idxd_op) queued_ops;
+};
+
+struct idxd_task {
+ spdk_accel_completion_cb cb;
+};
+
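+/* Serializes channel-count changes and spdk_idxd_(re)configure_chan() calls. */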
+pthread_mutex_t g_configuration_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static struct spdk_io_channel *idxd_get_io_channel(void);
+
+static struct idxd_device *
+idxd_select_device(void)
+{
+ /*
+ * We allow channels to share underlying devices,
+ * selection is round-robin based.
+ */
+
+ g_next_dev = TAILQ_NEXT(g_next_dev, tailq);
+ if (g_next_dev == NULL) {
+ g_next_dev = TAILQ_FIRST(&g_idxd_devices);
+ }
+ return g_next_dev;
+}
+
+static int
+idxd_poll(void *arg)
+{
+ struct idxd_io_channel *chan = arg;
+ struct idxd_op *op = NULL;
+ int rc;
+
+ spdk_idxd_process_events(chan->chan);
+
+	/* Drain any queued ops, but only while the channel is active. */
+ if (chan->state != IDXD_CHANNEL_ACTIVE) {
+ return -1;
+ }
+
+ while (!TAILQ_EMPTY(&chan->queued_ops)) {
+ op = TAILQ_FIRST(&chan->queued_ops);
+
+ switch (op->op_code) {
+ case IDXD_OPCODE_MEMMOVE:
+ rc = spdk_idxd_submit_copy(op->chan, op->dst, op->src, op->nbytes,
+ op->cb_fn, op->cb_arg);
+ break;
+ case IDXD_OPCODE_DUALCAST:
+ rc = spdk_idxd_submit_dualcast(op->chan, op->dst, op->dst2, op->src, op->nbytes,
+ op->cb_fn, op->cb_arg);
+ break;
+ case IDXD_OPCODE_COMPARE:
+ rc = spdk_idxd_submit_compare(op->chan, op->src, op->src2, op->nbytes,
+ op->cb_fn, op->cb_arg);
+ break;
+ case IDXD_OPCODE_MEMFILL:
+ rc = spdk_idxd_submit_fill(op->chan, op->dst, op->fill_pattern, op->nbytes,
+ op->cb_fn, op->cb_arg);
+ break;
+ case IDXD_OPCODE_CRC32C_GEN:
+ rc = spdk_idxd_submit_crc32c(op->chan, op->dst, op->src, op->seed, op->nbytes,
+ op->cb_fn, op->cb_arg);
+ break;
+ case IDXD_OPCODE_BATCH:
+ rc = spdk_idxd_batch_submit(op->chan, op->batch, op->cb_fn, op->cb_arg);
+ break;
+ default:
+ /* Should never get here */
+ assert(false);
+ break;
+ }
+ if (rc == 0) {
+ TAILQ_REMOVE(&chan->queued_ops, op, link);
+ free(op);
+ } else {
+			/* Device busy; leave the op queued and retry on the next poll. */
+ break;
+ }
+ }
+
+ return -1;
+}
+
+static size_t
+accel_engine_idxd_get_ctx_size(void)
+{
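+	/* Context size covers the framework's spdk_accel_task plus our idxd_task. */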
+ return sizeof(struct idxd_task) + sizeof(struct spdk_accel_task);
+}
+
+static void
+idxd_done(void *cb_arg, int status)
+{
+ struct spdk_accel_task *accel_req;
+ struct idxd_task *idxd_task = cb_arg;
+
+ accel_req = SPDK_CONTAINEROF(idxd_task, struct spdk_accel_task,
+ offload_ctx);
+
+ idxd_task->cb(accel_req, status);
+}
+
+static struct idxd_op *
+_prep_queue_command(struct idxd_io_channel *chan, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_op *op_to_queue;
+
+ op_to_queue = calloc(1, sizeof(struct idxd_op));
+ if (op_to_queue == NULL) {
+ SPDK_ERRLOG("Failed to allocate operation for queueing\n");
+ return NULL;
+ }
+
+ op_to_queue->chan = chan->chan;
+ op_to_queue->cb_fn = cb_fn;
+ op_to_queue->cb_arg = cb_arg;
+
+ return op_to_queue;
+}
+
+static int
+idxd_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ int rc = 0;
+
+ idxd_task->cb = cb_fn;
+
+ if (chan->state == IDXD_CHANNEL_ACTIVE) {
+ rc = spdk_idxd_submit_copy(chan->chan, dst, src, nbytes, idxd_done, idxd_task);
+ }
+
+ if (chan->state == IDXD_CHANNEL_PAUSED || rc == -EBUSY) {
+ struct idxd_op *op_to_queue;
+
+		/* Common prep. */
+ op_to_queue = _prep_queue_command(chan, idxd_done, idxd_task);
+ if (op_to_queue == NULL) {
+ return -ENOMEM;
+ }
+
+ /* Command specific. */
+ op_to_queue->dst = dst;
+ op_to_queue->src = src;
+ op_to_queue->nbytes = nbytes;
+ op_to_queue->op_code = IDXD_OPCODE_MEMMOVE;
+
+ /* Queue the operation. */
+ TAILQ_INSERT_TAIL(&chan->queued_ops, op_to_queue, link);
+ return 0;
+
+ } else if (chan->state == IDXD_CHANNEL_ERROR) {
+ return -EINVAL;
+ }
+
+ return rc;
+}
+
+static int
+idxd_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ int rc = 0;
+
+ idxd_task->cb = cb_fn;
+
+ if (chan->state == IDXD_CHANNEL_ACTIVE) {
+ rc = spdk_idxd_submit_dualcast(chan->chan, dst1, dst2, src, nbytes, idxd_done, idxd_task);
+ }
+
+ if (chan->state == IDXD_CHANNEL_PAUSED || rc == -EBUSY) {
+ struct idxd_op *op_to_queue;
+
+		/* Common prep. */
+ op_to_queue = _prep_queue_command(chan, idxd_done, idxd_task);
+ if (op_to_queue == NULL) {
+ return -ENOMEM;
+ }
+
+ /* Command specific. */
+ op_to_queue->dst = dst1;
+ op_to_queue->dst2 = dst2;
+ op_to_queue->src = src;
+ op_to_queue->nbytes = nbytes;
+ op_to_queue->op_code = IDXD_OPCODE_DUALCAST;
+
+ /* Queue the operation. */
+ TAILQ_INSERT_TAIL(&chan->queued_ops, op_to_queue, link);
+ return 0;
+
+ } else if (chan->state == IDXD_CHANNEL_ERROR) {
+ return -EINVAL;
+ }
+
+ return rc;
+}
+
+static int
+idxd_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ int rc = 0;
+
+ idxd_task->cb = cb_fn;
+
+ if (chan->state == IDXD_CHANNEL_ACTIVE) {
+ rc = spdk_idxd_submit_compare(chan->chan, src1, src2, nbytes, idxd_done, idxd_task);
+ }
+
+ if (chan->state == IDXD_CHANNEL_PAUSED || rc == -EBUSY) {
+ struct idxd_op *op_to_queue;
+
+		/* Common prep. */
+ op_to_queue = _prep_queue_command(chan, idxd_done, idxd_task);
+ if (op_to_queue == NULL) {
+ return -ENOMEM;
+ }
+
+ /* Command specific. */
+ op_to_queue->src = src1;
+ op_to_queue->src2 = src2;
+ op_to_queue->nbytes = nbytes;
+ op_to_queue->op_code = IDXD_OPCODE_COMPARE;
+
+ /* Queue the operation. */
+ TAILQ_INSERT_TAIL(&chan->queued_ops, op_to_queue, link);
+ return 0;
+
+ } else if (chan->state == IDXD_CHANNEL_ERROR) {
+ return -EINVAL;
+ }
+
+ return rc;
+}
+
+static int
+idxd_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ int rc = 0;
+ uint64_t fill_pattern;
+
+ idxd_task->cb = cb_fn;
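+	/* Replicate the single fill byte across the 64-bit pattern the device consumes. */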
+ memset(&fill_pattern, fill, sizeof(uint64_t));
+
+ if (chan->state == IDXD_CHANNEL_ACTIVE) {
+ rc = spdk_idxd_submit_fill(chan->chan, dst, fill_pattern, nbytes, idxd_done, idxd_task);
+ }
+
+ if (chan->state == IDXD_CHANNEL_PAUSED || rc == -EBUSY) {
+ struct idxd_op *op_to_queue;
+
+		/* Common prep. */
+ op_to_queue = _prep_queue_command(chan, idxd_done, idxd_task);
+ if (op_to_queue == NULL) {
+ return -ENOMEM;
+ }
+
+ /* Command specific. */
+ op_to_queue->dst = dst;
+ op_to_queue->fill_pattern = fill_pattern;
+ op_to_queue->nbytes = nbytes;
+ op_to_queue->op_code = IDXD_OPCODE_MEMFILL;
+
+ /* Queue the operation. */
+ TAILQ_INSERT_TAIL(&chan->queued_ops, op_to_queue, link);
+ return 0;
+
+ } else if (chan->state == IDXD_CHANNEL_ERROR) {
+ return -EINVAL;
+ }
+
+ return rc;
+}
+
+static int
+idxd_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src,
+ uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ int rc = 0;
+
+ idxd_task->cb = cb_fn;
+
+ if (chan->state == IDXD_CHANNEL_ACTIVE) {
+ rc = spdk_idxd_submit_crc32c(chan->chan, dst, src, seed, nbytes, idxd_done, idxd_task);
+ }
+
+ if (chan->state == IDXD_CHANNEL_PAUSED || rc == -EBUSY) {
+ struct idxd_op *op_to_queue;
+
+		/* Common prep. */
+ op_to_queue = _prep_queue_command(chan, idxd_done, idxd_task);
+ if (op_to_queue == NULL) {
+ return -ENOMEM;
+ }
+
+ /* Command specific. */
+ op_to_queue->dst = dst;
+ op_to_queue->src = src;
+ op_to_queue->seed = seed;
+ op_to_queue->nbytes = nbytes;
+ op_to_queue->op_code = IDXD_OPCODE_CRC32C_GEN;
+
+ /* Queue the operation. */
+ TAILQ_INSERT_TAIL(&chan->queued_ops, op_to_queue, link);
+ return 0;
+
+ } else if (chan->state == IDXD_CHANNEL_ERROR) {
+ return -EINVAL;
+ }
+
+ return rc;
+}
+
+static uint64_t
+idxd_get_capabilities(void)
+{
+ return ACCEL_COPY | ACCEL_FILL | ACCEL_CRC32C | ACCEL_COMPARE |
+ ACCEL_DUALCAST | ACCEL_BATCH;
+}
+
+static uint32_t
+idxd_batch_get_max(void)
+{
+ return spdk_idxd_batch_get_max();
+}
+
+static struct spdk_accel_batch *
+idxd_batch_start(struct spdk_io_channel *ch)
+{
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+
+ return (struct spdk_accel_batch *)spdk_idxd_batch_create(chan->chan);
+}
+
+static int
+idxd_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch)
+{
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ struct idxd_batch *batch = (struct idxd_batch *)_batch;
+
+ return spdk_idxd_batch_cancel(chan->chan, batch);
+}
+
+static int
+idxd_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ struct idxd_batch *batch = (struct idxd_batch *)_batch;
+ int rc = 0;
+
+ idxd_task->cb = cb_fn;
+
+ if (chan->state == IDXD_CHANNEL_ACTIVE) {
+ rc = spdk_idxd_batch_submit(chan->chan, batch, idxd_done, idxd_task);
+ }
+
+ if (chan->state == IDXD_CHANNEL_PAUSED || rc == -EBUSY) {
+ struct idxd_op *op_to_queue;
+
+		/* Common prep. */
+ op_to_queue = _prep_queue_command(chan, idxd_done, idxd_task);
+ if (op_to_queue == NULL) {
+ return -ENOMEM;
+ }
+
+ /* Command specific. */
+ op_to_queue->batch = batch;
+ op_to_queue->op_code = IDXD_OPCODE_BATCH;
+
+ /* Queue the operation. */
+ TAILQ_INSERT_TAIL(&chan->queued_ops, op_to_queue, link);
+ return 0;
+
+ } else if (chan->state == IDXD_CHANNEL_ERROR) {
+ return -EINVAL;
+ }
+
+ return rc;
+}
+
+static int
+idxd_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch,
+ void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ struct idxd_batch *batch = (struct idxd_batch *)_batch;
+
+ idxd_task->cb = cb_fn;
+
+ return spdk_idxd_batch_prep_copy(chan->chan, batch, dst, src, nbytes,
+ idxd_done, idxd_task);
+}
+
+static int
+idxd_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch,
+ void *dst, uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ uint64_t fill_pattern;
+ struct idxd_batch *batch = (struct idxd_batch *)_batch;
+
+ idxd_task->cb = cb_fn;
+ memset(&fill_pattern, fill, sizeof(uint64_t));
+
+ return spdk_idxd_batch_prep_fill(chan->chan, batch, dst, fill_pattern, nbytes, idxd_done,
+ idxd_task);
+}
+
+static int
+idxd_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch,
+ void *dst1, void *dst2, void *src, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ struct idxd_batch *batch = (struct idxd_batch *)_batch;
+
+ idxd_task->cb = cb_fn;
+
+ return spdk_idxd_batch_prep_dualcast(chan->chan, batch, dst1, dst2, src, nbytes, idxd_done,
+ idxd_task);
+}
+
+static int
+idxd_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch,
+ uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ struct idxd_batch *batch = (struct idxd_batch *)_batch;
+
+ idxd_task->cb = cb_fn;
+
+ return spdk_idxd_batch_prep_crc32c(chan->chan, batch, dst, src, seed, nbytes, idxd_done,
+ idxd_task);
+}
+
+static int
+idxd_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *_batch,
+ void *src1, void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct idxd_task *idxd_task = (struct idxd_task *)cb_arg;
+ struct idxd_io_channel *chan = spdk_io_channel_get_ctx(ch);
+ struct idxd_batch *batch = (struct idxd_batch *)_batch;
+
+ idxd_task->cb = cb_fn;
+
+ return spdk_idxd_batch_prep_compare(chan->chan, batch, src1, src2, nbytes, idxd_done,
+ idxd_task);
+}
+
+static struct spdk_accel_engine idxd_accel_engine = {
+ .get_capabilities = idxd_get_capabilities,
+ .copy = idxd_submit_copy,
+ .batch_get_max = idxd_batch_get_max,
+ .batch_create = idxd_batch_start,
+ .batch_cancel = idxd_batch_cancel,
+ .batch_prep_copy = idxd_batch_prep_copy,
+ .batch_prep_fill = idxd_batch_prep_fill,
+ .batch_prep_dualcast = idxd_batch_prep_dualcast,
+ .batch_prep_crc32c = idxd_batch_prep_crc32c,
+ .batch_prep_compare = idxd_batch_prep_compare,
+ .batch_submit = idxd_batch_submit,
+ .dualcast = idxd_submit_dualcast,
+ .compare = idxd_submit_compare,
+ .fill = idxd_submit_fill,
+ .crc32c = idxd_submit_crc32c,
+ .get_io_channel = idxd_get_io_channel,
+};
+
+/*
+ * Configure the max number of descriptors that a channel is
+ * allowed to use based on the total number of current channels.
+ * This is to allow for dynamic load balancing for hw flow control.
+ */
+static void
+_config_max_desc(struct spdk_io_channel_iter *i)
+{
+ struct idxd_io_channel *chan;
+ struct spdk_io_channel *ch;
+ int rc;
+
+ ch = spdk_io_channel_iter_get_channel(i);
+ chan = spdk_io_channel_get_ctx(ch);
+
+ pthread_mutex_lock(&g_configuration_lock);
+ rc = spdk_idxd_reconfigure_chan(chan->chan, chan->dev->num_channels);
+ pthread_mutex_unlock(&g_configuration_lock);
+ if (rc == 0) {
+ chan->state = IDXD_CHANNEL_ACTIVE;
+ } else {
+ chan->state = IDXD_CHANNEL_ERROR;
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+/* Pauses a channel so that it can be re-configured. */
+static void
+_pause_chan(struct spdk_io_channel_iter *i)
+{
+ struct idxd_io_channel *chan;
+ struct spdk_io_channel *ch;
+
+ ch = spdk_io_channel_iter_get_channel(i);
+ chan = spdk_io_channel_get_ctx(ch);
+
+	/* New requests will now be queued rather than submitted. */
+ chan->state = IDXD_CHANNEL_PAUSED;
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+_pause_chan_done(struct spdk_io_channel_iter *i, int status)
+{
+ spdk_for_each_channel(&idxd_accel_engine, _config_max_desc, NULL, NULL);
+}
+
+static int
+idxd_create_cb(void *io_device, void *ctx_buf)
+{
+ struct idxd_io_channel *chan = ctx_buf;
+ struct idxd_device *dev;
+ int rc;
+
+ dev = idxd_select_device();
+ if (dev == NULL) {
+		SPDK_ERRLOG("No idxd device available\n");
+ return -EINVAL;
+ }
+
+ chan->chan = spdk_idxd_get_channel(dev->idxd);
+ if (chan->chan == NULL) {
+ return -ENOMEM;
+ }
+
+ chan->dev = dev;
+ chan->poller = spdk_poller_register(idxd_poll, chan, 0);
+ TAILQ_INIT(&chan->queued_ops);
+
+ /*
+ * Configure the channel but leave paused until all others
+ * are paused and re-configured based on the new number of
+ * channels. This enables dynamic load balancing for HW
+ * flow control.
+ */
+ pthread_mutex_lock(&g_configuration_lock);
+ rc = spdk_idxd_configure_chan(chan->chan);
+ if (rc) {
+ SPDK_ERRLOG("Failed to configure new channel rc = %d\n", rc);
+ chan->state = IDXD_CHANNEL_ERROR;
+ spdk_poller_unregister(&chan->poller);
+ pthread_mutex_unlock(&g_configuration_lock);
+ return rc;
+ }
+
+ chan->state = IDXD_CHANNEL_PAUSED;
+ chan->dev->num_channels++;
+ pthread_mutex_unlock(&g_configuration_lock);
+
+ /*
+ * Pause all channels so that we can set proper flow control
+ * per channel. When all are paused, we'll update the max
+ * number of descriptors allowed per channel.
+ */
+ spdk_for_each_channel(&idxd_accel_engine, _pause_chan, NULL,
+ _pause_chan_done);
+
+ return 0;
+}
+
+static void
+_pause_chan_destroy_done(struct spdk_io_channel_iter *i, int status)
+{
+ /* Rebalance the rings with the smaller number of remaining channels. */
+ spdk_for_each_channel(&idxd_accel_engine, _config_max_desc, NULL, NULL);
+}
+
+static void
+idxd_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct idxd_io_channel *chan = ctx_buf;
+
+ pthread_mutex_lock(&g_configuration_lock);
+ assert(chan->dev->num_channels > 0);
+ chan->dev->num_channels--;
+ spdk_idxd_reconfigure_chan(chan->chan, 0);
+ pthread_mutex_unlock(&g_configuration_lock);
+
+ spdk_poller_unregister(&chan->poller);
+ spdk_idxd_put_channel(chan->chan);
+
+ /* Pause each channel then rebalance the max number of ring slots. */
+ spdk_for_each_channel(&idxd_accel_engine, _pause_chan, NULL,
+ _pause_chan_destroy_done);
+}
+
+static struct spdk_io_channel *
+idxd_get_io_channel(void)
+{
+ return spdk_get_io_channel(&idxd_accel_engine);
+}
+
+static bool
+probe_cb(void *cb_ctx, struct spdk_pci_device *pci_dev)
+{
+ struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(pci_dev);
+ struct pci_device *pdev;
+
+ SPDK_NOTICELOG(
+ " Found matching device at %04x:%02x:%02x.%x vendor:0x%04x device:0x%04x\n",
+ pci_addr.domain,
+ pci_addr.bus,
+ pci_addr.dev,
+ pci_addr.func,
+ spdk_pci_device_get_vendor_id(pci_dev),
+ spdk_pci_device_get_device_id(pci_dev));
+
+ pdev = calloc(1, sizeof(*pdev));
+ if (pdev == NULL) {
+ return false;
+ }
+ pdev->pci_dev = pci_dev;
+ TAILQ_INSERT_TAIL(&g_pci_devices, pdev, tailq);
+
+	/* Claim the device to avoid conflicts with other processes. */
+ if (spdk_pci_device_claim(pci_dev) < 0) {
+ return false;
+ }
+
+ return true;
+}
+
+static void
+attach_cb(void *cb_ctx, struct spdk_pci_device *pci_dev, struct spdk_idxd_device *idxd)
+{
+ struct idxd_device *dev;
+
+ dev = calloc(1, sizeof(*dev));
+ if (dev == NULL) {
+ SPDK_ERRLOG("Failed to allocate device struct\n");
+ return;
+ }
+
+ dev->idxd = idxd;
+ if (g_next_dev == NULL) {
+ g_next_dev = dev;
+ }
+
+ TAILQ_INSERT_TAIL(&g_idxd_devices, dev, tailq);
+}
+
+void
+accel_engine_idxd_enable_probe(uint32_t config_number)
+{
+ if (config_number > IDXD_MAX_CONFIG_NUM) {
+ SPDK_ERRLOG("Invalid config number, using default of 0\n");
+ config_number = 0;
+ }
+
+ g_config_number = config_number;
+ g_idxd_enable = true;
+ spdk_idxd_set_config(g_config_number);
+}
+
+static int
+accel_engine_idxd_init(void)
+{
+ if (!g_idxd_enable) {
+ return -EINVAL;
+ }
+
+ if (spdk_idxd_probe(NULL, probe_cb, attach_cb) != 0) {
+ SPDK_ERRLOG("spdk_idxd_probe() failed\n");
+ return -EINVAL;
+ }
+
+ g_idxd_initialized = true;
+ SPDK_NOTICELOG("Accel engine updated to use IDXD DSA engine.\n");
+ spdk_accel_hw_engine_register(&idxd_accel_engine);
+ spdk_io_device_register(&idxd_accel_engine, idxd_create_cb, idxd_destroy_cb,
+ sizeof(struct idxd_io_channel), "idxd_accel_engine");
+ return 0;
+}
+
+static void
+accel_engine_idxd_exit(void *ctx)
+{
+ struct idxd_device *dev;
+ struct pci_device *pci_dev;
+
+ if (g_idxd_initialized) {
+ spdk_io_device_unregister(&idxd_accel_engine, NULL);
+ }
+
+ while (!TAILQ_EMPTY(&g_idxd_devices)) {
+ dev = TAILQ_FIRST(&g_idxd_devices);
+ TAILQ_REMOVE(&g_idxd_devices, dev, tailq);
+ spdk_idxd_detach(dev->idxd);
+ free(dev);
+ }
+
+ while (!TAILQ_EMPTY(&g_pci_devices)) {
+ pci_dev = TAILQ_FIRST(&g_pci_devices);
+ TAILQ_REMOVE(&g_pci_devices, pci_dev, tailq);
+ spdk_pci_device_detach(pci_dev->pci_dev);
+ free(pci_dev);
+ }
+
+ spdk_accel_engine_module_finish();
+}
+
+static void
+accel_engine_idxd_write_config_json(struct spdk_json_write_ctx *w)
+{
+ if (g_idxd_enable) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "idxd_scan_accel_engine");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_uint32(w, "config_number", g_config_number);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ }
+}
+
+SPDK_ACCEL_MODULE_REGISTER(accel_engine_idxd_init, accel_engine_idxd_exit,
+ NULL, accel_engine_idxd_write_config_json,
+ accel_engine_idxd_get_ctx_size)
+
+SPDK_LOG_REGISTER_COMPONENT("accel_idxd", SPDK_LOG_ACCEL_IDXD)
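Every idxd_submit_*() above follows the same submit-or-queue pattern: try the hardware while the channel is active, and park the op on queued_ops for idxd_poll() to retry when the channel is paused or the device returns -EBUSY. A minimal sketch of that pattern, with a hypothetical hw_submit() standing in for the matching spdk_idxd_submit_*() call:

	/* Sketch only; hw_submit() is a stand-in, not part of this commit. */
	static int
	submit_or_queue(struct idxd_io_channel *chan, struct idxd_op *op)
	{
		int rc = 0;

		if (chan->state == IDXD_CHANNEL_ACTIVE) {
			rc = hw_submit(chan->chan, op);
		}
		if (chan->state == IDXD_CHANNEL_PAUSED || rc == -EBUSY) {
			/* idxd_poll() drains queued_ops once the channel is active again. */
			TAILQ_INSERT_TAIL(&chan->queued_ops, op, link);
			return 0;
		}
		if (chan->state == IDXD_CHANNEL_ERROR) {
			return -EINVAL;
		}
		return rc;
	}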
diff --git a/src/spdk/module/accel/idxd/accel_engine_idxd.h b/src/spdk/module/accel/idxd/accel_engine_idxd.h
new file mode 100644
index 000000000..dac6569a2
--- /dev/null
+++ b/src/spdk/module/accel/idxd/accel_engine_idxd.h
@@ -0,0 +1,43 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ACCEL_ENGINE_IDXD_H
+#define SPDK_ACCEL_ENGINE_IDXD_H
+
+#include "spdk/stdinc.h"
+
+#define IDXD_MAX_DEVICES 16
+
+void accel_engine_idxd_enable_probe(uint32_t config_number);
+
+#endif /* SPDK_ACCEL_ENGINE_IDXD_H */
diff --git a/src/spdk/module/accel/idxd/accel_engine_idxd_rpc.c b/src/spdk/module/accel/idxd/accel_engine_idxd_rpc.c
new file mode 100644
index 000000000..c3406c510
--- /dev/null
+++ b/src/spdk/module/accel/idxd/accel_engine_idxd_rpc.c
@@ -0,0 +1,75 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accel_engine_idxd.h"
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/event.h"
+#include "spdk/stdinc.h"
+#include "spdk/env.h"
+
+struct rpc_idxd_scan_accel_engine {
+ uint32_t config_number;
+};
+
+static const struct spdk_json_object_decoder rpc_idxd_scan_accel_engine_decoder[] = {
+ {"config_number", offsetof(struct rpc_idxd_scan_accel_engine, config_number), spdk_json_decode_uint32},
+};
+
+static void
+rpc_idxd_scan_accel_engine(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_idxd_scan_accel_engine req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_idxd_scan_accel_engine_decoder,
+ SPDK_COUNTOF(rpc_idxd_scan_accel_engine_decoder),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+ }
+
+ SPDK_NOTICELOG("Enabling IDXD with config #%u\n", req.config_number);
+ accel_engine_idxd_enable_probe(req.config_number);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("idxd_scan_accel_engine", rpc_idxd_scan_accel_engine, SPDK_RPC_STARTUP)
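For reference, the request this handler accepts and the reply it writes look like the following (illustrative payloads; the method and parameter names come from the registration and decoder above):

	{"jsonrpc": "2.0", "method": "idxd_scan_accel_engine", "params": {"config_number": 0}, "id": 1}
	{"jsonrpc": "2.0", "result": true, "id": 1}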
diff --git a/src/spdk/module/accel/ioat/Makefile b/src/spdk/module/accel/ioat/Makefile
new file mode 100644
index 000000000..0e43adbb1
--- /dev/null
+++ b/src/spdk/module/accel/ioat/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+LIBNAME = accel_ioat
+C_SRCS = accel_engine_ioat.c accel_engine_ioat_rpc.c
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/accel/ioat/accel_engine_ioat.c b/src/spdk/module/accel/ioat/accel_engine_ioat.c
new file mode 100644
index 000000000..0fff3a7c5
--- /dev/null
+++ b/src/spdk/module/accel/ioat/accel_engine_ioat.c
@@ -0,0 +1,764 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accel_engine_ioat.h"
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/accel_engine.h"
+#include "spdk_internal/log.h"
+
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/event.h"
+#include "spdk/thread.h"
+#include "spdk/ioat.h"
+#include "spdk/crc32.h"
+
+#define ALIGN_4K 0x1000
+
+enum ioat_accel_opcode {
+ IOAT_ACCEL_OPCODE_MEMMOVE = 0,
+ IOAT_ACCEL_OPCODE_MEMFILL = 1,
+ IOAT_ACCEL_OPCODE_COMPARE = 2,
+ IOAT_ACCEL_OPCODE_CRC32C = 3,
+ IOAT_ACCEL_OPCODE_DUALCAST = 4,
+};
+
+struct ioat_accel_op {
+ struct ioat_io_channel *ioat_ch;
+ void *cb_arg;
+ spdk_accel_completion_cb cb_fn;
+ void *src;
+ union {
+ void *dst;
+ void *src2;
+ };
+ void *dst2;
+ uint32_t seed;
+ uint64_t fill_pattern;
+ enum ioat_accel_opcode op_code;
+ uint64_t nbytes;
+ TAILQ_ENTRY(ioat_accel_op) link;
+};
+
+static int g_batch_size;
+static bool g_ioat_enable = false;
+static bool g_ioat_initialized = false;
+
+struct ioat_probe_ctx {
+ int num_whitelist_devices;
+ struct spdk_pci_addr whitelist[IOAT_MAX_CHANNELS];
+};
+
+static struct ioat_probe_ctx g_probe_ctx;
+
+struct ioat_device {
+ struct spdk_ioat_chan *ioat;
+ bool is_allocated;
+ /** linked list pointer for device list */
+ TAILQ_ENTRY(ioat_device) tailq;
+};
+
+struct pci_device {
+ struct spdk_pci_device *pci_dev;
+ TAILQ_ENTRY(pci_device) tailq;
+};
+
+static TAILQ_HEAD(, ioat_device) g_devices = TAILQ_HEAD_INITIALIZER(g_devices);
+static pthread_mutex_t g_ioat_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static TAILQ_HEAD(, pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
+
+struct ioat_io_channel {
+ struct spdk_ioat_chan *ioat_ch;
+ struct ioat_device *ioat_dev;
+ struct spdk_poller *poller;
+ TAILQ_HEAD(, ioat_accel_op) op_pool;
+ TAILQ_HEAD(, ioat_accel_op) sw_batch; /* for operations not hw accelerated */
+ bool hw_batch; /* for operations that are hw accelerated */
+};
+
+static int
+ioat_find_dev_by_whitelist_bdf(const struct spdk_pci_addr *pci_addr,
+ const struct spdk_pci_addr *whitelist,
+ int num_whitelist_devices)
+{
+ int i;
+
+ for (i = 0; i < num_whitelist_devices; i++) {
+ if (spdk_pci_addr_compare(pci_addr, &whitelist[i]) == 0) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static struct ioat_device *
+ioat_allocate_device(void)
+{
+ struct ioat_device *dev;
+
+ pthread_mutex_lock(&g_ioat_mutex);
+ TAILQ_FOREACH(dev, &g_devices, tailq) {
+ if (!dev->is_allocated) {
+ dev->is_allocated = true;
+ pthread_mutex_unlock(&g_ioat_mutex);
+ return dev;
+ }
+ }
+ pthread_mutex_unlock(&g_ioat_mutex);
+
+ return NULL;
+}
+
+static void
+ioat_free_device(struct ioat_device *dev)
+{
+ pthread_mutex_lock(&g_ioat_mutex);
+ dev->is_allocated = false;
+ pthread_mutex_unlock(&g_ioat_mutex);
+}
+
+struct ioat_task {
+ spdk_accel_completion_cb cb;
+};
+
+static int accel_engine_ioat_init(void);
+static void accel_engine_ioat_exit(void *ctx);
+static void accel_engine_ioat_config_text(FILE *fp);
+
+static size_t
+accel_engine_ioat_get_ctx_size(void)
+{
+ return sizeof(struct ioat_task) + sizeof(struct spdk_accel_task);
+}
+
+SPDK_ACCEL_MODULE_REGISTER(accel_engine_ioat_init, accel_engine_ioat_exit,
+ accel_engine_ioat_config_text, NULL,
+ accel_engine_ioat_get_ctx_size)
+
+static void
+ioat_done(void *cb_arg)
+{
+ struct spdk_accel_task *accel_req;
+ struct ioat_task *ioat_task = cb_arg;
+
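+	/* Manual container_of: recover the spdk_accel_task that embeds our ctx. */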
+ accel_req = (struct spdk_accel_task *)
+ ((uintptr_t)ioat_task -
+ offsetof(struct spdk_accel_task, offload_ctx));
+
+ ioat_task->cb(accel_req, 0);
+}
+
+static int
+ioat_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_task *ioat_task = (struct ioat_task *)cb_arg;
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+
+ assert(ioat_ch->ioat_ch != NULL);
+
+ ioat_task->cb = cb_fn;
+
+ return spdk_ioat_submit_copy(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, src, nbytes);
+}
+
+static int
+ioat_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_task *ioat_task = (struct ioat_task *)cb_arg;
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
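+	/* Multiplying by 0x0101010101010101 replicates the byte into all 8 lanes. */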
+ uint64_t fill64 = 0x0101010101010101ULL * fill;
+
+ assert(ioat_ch->ioat_ch != NULL);
+
+ ioat_task->cb = cb_fn;
+
+ return spdk_ioat_submit_fill(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, fill64, nbytes);
+}
+
+static int
+ioat_poll(void *arg)
+{
+ struct spdk_ioat_chan *chan = arg;
+
+ return spdk_ioat_process_events(chan) != 0 ? SPDK_POLLER_BUSY :
+ SPDK_POLLER_IDLE;
+}
+
+static struct spdk_io_channel *ioat_get_io_channel(void);
+
+/*
+ * The IOAT engine only supports these capabilities in hardware.
+ * The accel framework handles any other operation by falling
+ * back to its software implementation.
+ */
+static uint64_t
+ioat_get_capabilities(void)
+{
+ return ACCEL_COPY | ACCEL_FILL | ACCEL_BATCH;
+}
+
+/* The batch functions exposed by the accel framework do not map 1:1 onto
+ * the IOAT library, which only builds hardware-accelerated operations
+ * through its native interface. The framework's batch capabilities are
+ * therefore implemented here in the plug-in, using the IOAT library for
+ * accelerated commands and software fallbacks for everything else.
+ */
+static uint32_t
+ioat_batch_get_max(void)
+{
+ return g_batch_size;
+}
+
+static struct spdk_accel_batch *
+ioat_batch_create(struct spdk_io_channel *ch)
+{
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+
+ if (!TAILQ_EMPTY(&ioat_ch->sw_batch) || (ioat_ch->hw_batch == true)) {
+ SPDK_ERRLOG("IOAT accel engine only supports one batch at a time.\n");
+ return NULL;
+ }
+
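+	/* One batch at a time, so the channel itself doubles as the batch handle. */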
+ return (struct spdk_accel_batch *)&ioat_ch->hw_batch;
+}
+
+static struct ioat_accel_op *
+_prep_op(struct ioat_io_channel *ioat_ch, struct spdk_accel_batch *batch,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_accel_op *op;
+
+ if ((struct spdk_accel_batch *)&ioat_ch->hw_batch != batch) {
+ SPDK_ERRLOG("Invalid batch\n");
+ return NULL;
+ }
+
+ if (!TAILQ_EMPTY(&ioat_ch->op_pool)) {
+ op = TAILQ_FIRST(&ioat_ch->op_pool);
+ TAILQ_REMOVE(&ioat_ch->op_pool, op, link);
+ } else {
+ SPDK_ERRLOG("Ran out of operations for batch\n");
+ return NULL;
+ }
+
+ op->cb_arg = cb_arg;
+ op->cb_fn = cb_fn;
+ op->ioat_ch = ioat_ch;
+
+ return op;
+}
+
+static int
+ioat_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+ struct ioat_task *ioat_task = (struct ioat_task *)cb_arg;
+
+ ioat_task->cb = cb_fn;
+ ioat_ch->hw_batch = true;
+
+ /* Call the IOAT library prep function. */
+ return spdk_ioat_build_copy(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, src, nbytes);
+}
+
+static int
+ioat_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst,
+ uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+ struct ioat_task *ioat_task = (struct ioat_task *)cb_arg;
+ uint64_t fill_pattern;
+
+ ioat_task->cb = cb_fn;
+ ioat_ch->hw_batch = true;
+ memset(&fill_pattern, fill, sizeof(uint64_t));
+
+ /* Call the IOAT library prep function. */
+ return spdk_ioat_build_fill(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, fill_pattern, nbytes);
+}
+
+static int
+ioat_batch_prep_dualcast(struct spdk_io_channel *ch,
+ struct spdk_accel_batch *batch, void *dst1, void *dst2,
+ void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_accel_op *op;
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+
+ if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) {
+ SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n");
+ return -EINVAL;
+ }
+
+ op = _prep_op(ioat_ch, batch, cb_fn, cb_arg);
+ if (op == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ op->src = src;
+ op->dst = dst1;
+ op->dst2 = dst2;
+ op->nbytes = nbytes;
+ op->op_code = IOAT_ACCEL_OPCODE_DUALCAST;
+ TAILQ_INSERT_TAIL(&ioat_ch->sw_batch, op, link);
+
+ return 0;
+}
+
+static int
+ioat_batch_prep_compare(struct spdk_io_channel *ch,
+ struct spdk_accel_batch *batch, void *src1,
+ void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_accel_op *op;
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+
+ op = _prep_op(ioat_ch, batch, cb_fn, cb_arg);
+ if (op == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ op->src = src1;
+ op->src2 = src2;
+ op->nbytes = nbytes;
+ op->op_code = IOAT_ACCEL_OPCODE_COMPARE;
+ TAILQ_INSERT_TAIL(&ioat_ch->sw_batch, op, link);
+
+ return 0;
+}
+
+static int
+ioat_batch_prep_crc32c(struct spdk_io_channel *ch,
+ struct spdk_accel_batch *batch, uint32_t *dst, void *src,
+ uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_accel_op *op;
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+
+ op = _prep_op(ioat_ch, batch, cb_fn, cb_arg);
+ if (op == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ op->dst = (void *)dst;
+ op->src = src;
+ op->seed = seed;
+ op->nbytes = nbytes;
+ op->op_code = IOAT_ACCEL_OPCODE_CRC32C;
+ TAILQ_INSERT_TAIL(&ioat_ch->sw_batch, op, link);
+
+ return 0;
+}
+
+static int
+ioat_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch)
+{
+ struct ioat_accel_op *op;
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+
+ if ((struct spdk_accel_batch *)&ioat_ch->hw_batch != batch) {
+ SPDK_ERRLOG("Invalid batch\n");
+ return -EINVAL;
+ }
+
+ /* Flush the batched HW items, there's no way to cancel these without resetting. */
+ spdk_ioat_flush(ioat_ch->ioat_ch);
+ ioat_ch->hw_batch = false;
+
+ /* Return batched software items to the pool. */
+ while ((op = TAILQ_FIRST(&ioat_ch->sw_batch))) {
+ TAILQ_REMOVE(&ioat_ch->sw_batch, op, link);
+ TAILQ_INSERT_TAIL(&ioat_ch->op_pool, op, link);
+ }
+
+ return 0;
+}
+
+static int
+ioat_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct ioat_accel_op *op;
+ struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req;
+ int batch_status = 0, cmd_status = 0;
+
+ if ((struct spdk_accel_batch *)&ioat_ch->hw_batch != batch) {
+ SPDK_ERRLOG("Invalid batch\n");
+ return -EINVAL;
+ }
+
+ /* Flush the batched HW items first. */
+ spdk_ioat_flush(ioat_ch->ioat_ch);
+ ioat_ch->hw_batch = false;
+
+ /* Complete the batched software items. */
+ while ((op = TAILQ_FIRST(&ioat_ch->sw_batch))) {
+ TAILQ_REMOVE(&ioat_ch->sw_batch, op, link);
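+ /* cb_arg points at the offload_ctx member embedded in the caller's
+ * spdk_accel_task, so step back to recover the task itself. */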
+ accel_req = (struct spdk_accel_task *)((uintptr_t)op->cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+
+ switch (op->op_code) {
+ case IOAT_ACCEL_OPCODE_DUALCAST:
+ memcpy(op->dst, op->src, op->nbytes);
+ memcpy(op->dst2, op->src, op->nbytes);
+ break;
+ case IOAT_ACCEL_OPCODE_COMPARE:
+ cmd_status = memcmp(op->src, op->src2, op->nbytes);
+ break;
+ case IOAT_ACCEL_OPCODE_CRC32C:
+ *(uint32_t *)op->dst = spdk_crc32c_update(op->src, op->nbytes, ~op->seed);
+ break;
+ default:
+ assert(false);
+ break;
+ }
+
+ batch_status |= cmd_status;
+ op->cb_fn(accel_req, cmd_status);
+ TAILQ_INSERT_TAIL(&ioat_ch->op_pool, op, link);
+ }
+
+ /* Now complete the batch request itself. */
+ accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+ cb_fn(accel_req, batch_status);
+
+ return 0;
+}
+
+static struct spdk_accel_engine ioat_accel_engine = {
+ .get_capabilities = ioat_get_capabilities,
+ .copy = ioat_submit_copy,
+ .fill = ioat_submit_fill,
+ .batch_get_max = ioat_batch_get_max,
+ .batch_create = ioat_batch_create,
+ .batch_cancel = ioat_batch_cancel,
+ .batch_prep_copy = ioat_batch_prep_copy,
+ .batch_prep_dualcast = ioat_batch_prep_dualcast,
+ .batch_prep_compare = ioat_batch_prep_compare,
+ .batch_prep_fill = ioat_batch_prep_fill,
+ .batch_prep_crc32c = ioat_batch_prep_crc32c,
+ .batch_submit = ioat_batch_submit,
+ .get_io_channel = ioat_get_io_channel,
+};
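+
+/* Sketch of the intended caller flow through the accel framework (names
+ * abbreviated to the engine ops above; see include/spdk/accel_engine.h for
+ * the public wrappers and exact signatures):
+ *   1. batch_create()           - reserve the single outstanding batch
+ *   2. batch_prep_copy()/fill() - build HW descriptors via the IOAT library
+ *   3. batch_prep_compare()/dualcast()/crc32c() - queue SW ops on sw_batch
+ *   4. batch_submit()           - flush the HW ring, run the SW ops, then
+ *      complete the batch itself
+ */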
+
+static int
+ioat_create_cb(void *io_device, void *ctx_buf)
+{
+ struct ioat_io_channel *ch = ctx_buf;
+ struct ioat_device *ioat_dev;
+ struct ioat_accel_op *op;
+ int i;
+
+ ioat_dev = ioat_allocate_device();
+ if (ioat_dev == NULL) {
+ return -1;
+ }
+
+ TAILQ_INIT(&ch->sw_batch);
+ ch->hw_batch = false;
+ TAILQ_INIT(&ch->op_pool);
+
+ g_batch_size = spdk_ioat_get_max_descriptors(ioat_dev->ioat);
+ for (i = 0; i < g_batch_size; i++) {
+ op = calloc(1, sizeof(struct ioat_accel_op));
+ if (op == NULL) {
+ SPDK_ERRLOG("Failed to allocate operation for batch.\n");
+ while ((op = TAILQ_FIRST(&ch->op_pool))) {
+ TAILQ_REMOVE(&ch->op_pool, op, link);
+ free(op);
+ }
+ return -ENOMEM;
+ }
+ TAILQ_INSERT_TAIL(&ch->op_pool, op, link);
+ }
+
+ ch->ioat_dev = ioat_dev;
+ ch->ioat_ch = ioat_dev->ioat;
+ ch->poller = SPDK_POLLER_REGISTER(ioat_poll, ch->ioat_ch, 0);
+ return 0;
+}
+
+static void
+ioat_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct ioat_io_channel *ch = ctx_buf;
+ struct ioat_accel_op *op;
+
+ while ((op = TAILQ_FIRST(&ch->op_pool))) {
+ TAILQ_REMOVE(&ch->op_pool, op, link);
+ free(op);
+ }
+
+ ioat_free_device(ch->ioat_dev);
+ spdk_poller_unregister(&ch->poller);
+}
+
+static struct spdk_io_channel *
+ioat_get_io_channel(void)
+{
+ return spdk_get_io_channel(&ioat_accel_engine);
+}
+
+static bool
+probe_cb(void *cb_ctx, struct spdk_pci_device *pci_dev)
+{
+ struct ioat_probe_ctx *ctx = cb_ctx;
+ struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(pci_dev);
+ struct pci_device *pdev;
+
+ SPDK_INFOLOG(SPDK_LOG_ACCEL_IOAT,
+ " Found matching device at %04x:%02x:%02x.%x vendor:0x%04x device:0x%04x\n",
+ pci_addr.domain,
+ pci_addr.bus,
+ pci_addr.dev,
+ pci_addr.func,
+ spdk_pci_device_get_vendor_id(pci_dev),
+ spdk_pci_device_get_device_id(pci_dev));
+
+ pdev = calloc(1, sizeof(*pdev));
+ if (pdev == NULL) {
+ return false;
+ }
+ pdev->pci_dev = pci_dev;
+ TAILQ_INSERT_TAIL(&g_pci_devices, pdev, tailq);
+
+ if (ctx->num_whitelist_devices > 0 &&
+ !ioat_find_dev_by_whitelist_bdf(&pci_addr, ctx->whitelist, ctx->num_whitelist_devices)) {
+ return false;
+ }
+
+ /* Claim the device to guard against conflicts with other processes. */
+ if (spdk_pci_device_claim(pci_dev) < 0) {
+ return false;
+ }
+
+ return true;
+}
+
+static void
+attach_cb(void *cb_ctx, struct spdk_pci_device *pci_dev, struct spdk_ioat_chan *ioat)
+{
+ struct ioat_device *dev;
+
+ dev = calloc(1, sizeof(*dev));
+ if (dev == NULL) {
+ SPDK_ERRLOG("Failed to allocate device struct\n");
+ return;
+ }
+
+ dev->ioat = ioat;
+ TAILQ_INSERT_TAIL(&g_devices, dev, tailq);
+}
+
+void
+accel_engine_ioat_enable_probe(void)
+{
+ g_ioat_enable = true;
+}
+
+static int
+accel_engine_ioat_add_whitelist_device(const char *pci_bdf)
+{
+ if (pci_bdf == NULL) {
+ return -1;
+ }
+
+ if (g_probe_ctx.num_whitelist_devices >= IOAT_MAX_CHANNELS) {
+ SPDK_ERRLOG("Ioat whitelist is full (max size is %d)\n",
+ IOAT_MAX_CHANNELS);
+ return -1;
+ }
+
+ if (spdk_pci_addr_parse(&g_probe_ctx.whitelist[g_probe_ctx.num_whitelist_devices],
+ pci_bdf) < 0) {
+ SPDK_ERRLOG("Invalid address %s\n", pci_bdf);
+ return -1;
+ }
+
+ g_probe_ctx.num_whitelist_devices++;
+
+ return 0;
+}
+
+int
+accel_engine_ioat_add_whitelist_devices(const char *pci_bdfs[], size_t num_pci_bdfs)
+{
+ size_t i;
+
+ for (i = 0; i < num_pci_bdfs; i++) {
+ if (accel_engine_ioat_add_whitelist_device(pci_bdfs[i]) < 0) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+accel_engine_ioat_read_config_file_params(struct spdk_conf_section *sp)
+{
+ int i;
+ char *val, *pci_bdf;
+
+ if (spdk_conf_section_get_boolval(sp, "Enable", false)) {
+ /* "Enable Yes" in the [Ioat] section turns the engine on. */
+ g_ioat_enable = true;
+ }
+
+ val = spdk_conf_section_get_val(sp, "Disable");
+ if (val != NULL) {
+ SPDK_WARNLOG("\"Disable\" option is deprecated and will be removed in a future release.\n");
+ SPDK_WARNLOG("IOAT is now disabled by default. It may be enabled by \"Enable Yes\"\n");
+
+ if (g_ioat_enable && (strcasecmp(val, "Yes") == 0)) {
+ SPDK_ERRLOG("\"Enable Yes\" and \"Disable Yes\" cannot be set at the same time\n");
+ return -1;
+ }
+ }
+
+ /* Init the whitelist */
+ for (i = 0; ; i++) {
+ pci_bdf = spdk_conf_section_get_nmval(sp, "Whitelist", i, 0);
+ if (!pci_bdf) {
+ break;
+ }
+
+ if (accel_engine_ioat_add_whitelist_device(pci_bdf) < 0) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+accel_engine_ioat_init(void)
+{
+ struct spdk_conf_section *sp;
+ int rc;
+
+ sp = spdk_conf_find_section(NULL, "Ioat");
+ if (sp != NULL) {
+ rc = accel_engine_ioat_read_config_file_params(sp);
+ if (rc != 0) {
+ SPDK_ERRLOG("accel_engine_ioat_read_config_file_params() failed\n");
+ return rc;
+ }
+ }
+
+ if (!g_ioat_enable) {
+ return 0;
+ }
+
+ if (spdk_ioat_probe(&g_probe_ctx, probe_cb, attach_cb) != 0) {
+ SPDK_ERRLOG("spdk_ioat_probe() failed\n");
+ return -1;
+ }
+
+ g_ioat_initialized = true;
+ SPDK_NOTICELOG("Accel engine updated to use IOAT engine.\n");
+ spdk_accel_hw_engine_register(&ioat_accel_engine);
+ spdk_io_device_register(&ioat_accel_engine, ioat_create_cb, ioat_destroy_cb,
+ sizeof(struct ioat_io_channel), "ioat_accel_engine");
+ return 0;
+}
+
+static void
+accel_engine_ioat_exit(void *ctx)
+{
+ struct ioat_device *dev;
+ struct pci_device *pci_dev;
+
+ if (g_ioat_initialized) {
+ spdk_io_device_unregister(&ioat_accel_engine, NULL);
+ }
+
+ while (!TAILQ_EMPTY(&g_devices)) {
+ dev = TAILQ_FIRST(&g_devices);
+ TAILQ_REMOVE(&g_devices, dev, tailq);
+ spdk_ioat_detach(dev->ioat);
+ ioat_free_device(dev);
+ free(dev);
+ }
+
+ while (!TAILQ_EMPTY(&g_pci_devices)) {
+ pci_dev = TAILQ_FIRST(&g_pci_devices);
+ TAILQ_REMOVE(&g_pci_devices, pci_dev, tailq);
+ spdk_pci_device_detach(pci_dev->pci_dev);
+ free(pci_dev);
+ }
+
+ spdk_accel_engine_module_finish();
+}
+
+#define ACCEL_ENGINE_IOAT_HEADER_TMPL \
+"[Ioat]\n" \
+" # Users may not want to use offload even it is available.\n" \
+" # Users may use the whitelist to initialize specified devices, IDS\n" \
+" # uses BUS:DEVICE.FUNCTION to identify each Ioat channel.\n"
+
+#define ACCEL_ENGINE_IOAT_ENABLE_TMPL \
+" Enable %s\n"
+
+#define ACCEL_ENGINE_IOAT_WHITELIST_TMPL \
+" Whitelist %.4" PRIx16 ":%.2" PRIx8 ":%.2" PRIx8 ".%" PRIx8 "\n"
+
+static void
+accel_engine_ioat_config_text(FILE *fp)
+{
+ int i;
+ struct spdk_pci_addr *dev;
+
+ fprintf(fp, ACCEL_ENGINE_IOAT_HEADER_TMPL);
+ fprintf(fp, ACCEL_ENGINE_IOAT_ENABLE_TMPL, g_ioat_enable ? "Yes" : "No");
+
+ for (i = 0; i < g_probe_ctx.num_whitelist_devices; i++) {
+ dev = &g_probe_ctx.whitelist[i];
+ fprintf(fp, ACCEL_ENGINE_IOAT_WHITELIST_TMPL,
+ dev->domain, dev->bus, dev->dev, dev->func);
+ }
+}
+
+SPDK_LOG_REGISTER_COMPONENT("accel_ioat", SPDK_LOG_ACCEL_IOAT)
diff --git a/src/spdk/module/accel/ioat/accel_engine_ioat.h b/src/spdk/module/accel/ioat/accel_engine_ioat.h
new file mode 100644
index 000000000..26a167eb6
--- /dev/null
+++ b/src/spdk/module/accel/ioat/accel_engine_ioat.h
@@ -0,0 +1,44 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ACCEL_ENGINE_IOAT_H
+#define SPDK_ACCEL_ENGINE_IOAT_H
+
+#include "spdk/stdinc.h"
+
+#define IOAT_MAX_CHANNELS 64
+
+int accel_engine_ioat_add_whitelist_devices(const char *pci_bdfs[], size_t num_pci_bdfs);
+void accel_engine_ioat_enable_probe(void);
+
+#endif /* SPDK_ACCEL_ENGINE_IOAT_H */
diff --git a/src/spdk/module/accel/ioat/accel_engine_ioat_rpc.c b/src/spdk/module/accel/ioat/accel_engine_ioat_rpc.c
new file mode 100644
index 000000000..7f2322536
--- /dev/null
+++ b/src/spdk/module/accel/ioat/accel_engine_ioat_rpc.c
@@ -0,0 +1,116 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "accel_engine_ioat.h"
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/event.h"
+
+struct rpc_pci_whitelist {
+ size_t num_bdfs;
+ char *bdfs[IOAT_MAX_CHANNELS];
+};
+
+static int
+decode_rpc_pci_whitelist(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_pci_whitelist *pci_whitelist = out;
+
+ return spdk_json_decode_array(val, spdk_json_decode_string, pci_whitelist->bdfs,
+ IOAT_MAX_CHANNELS, &pci_whitelist->num_bdfs, sizeof(char *));
+}
+
+static void
+free_rpc_pci_whitelist(struct rpc_pci_whitelist *list)
+{
+ size_t i;
+
+ for (i = 0; i < list->num_bdfs; i++) {
+ free(list->bdfs[i]);
+ }
+}
+
+struct rpc_ioat_scan_accel_engine {
+ struct rpc_pci_whitelist pci_whitelist;
+};
+
+static void
+free_rpc_ioat_scan_accel_engine(struct rpc_ioat_scan_accel_engine *p)
+{
+ free_rpc_pci_whitelist(&p->pci_whitelist);
+}
+
+static const struct spdk_json_object_decoder rpc_ioat_scan_accel_engine_decoder[] = {
+ {"pci_whitelist", offsetof(struct rpc_ioat_scan_accel_engine, pci_whitelist), decode_rpc_pci_whitelist},
+};
+
+static void
+rpc_ioat_scan_accel_engine(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_ioat_scan_accel_engine req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_ioat_scan_accel_engine_decoder,
+ SPDK_COUNTOF(rpc_ioat_scan_accel_engine_decoder),
+ &req)) {
+ free_rpc_ioat_scan_accel_engine(&req);
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ rc = accel_engine_ioat_add_whitelist_devices((const char **)req.pci_whitelist.bdfs,
+ req.pci_whitelist.num_bdfs);
+ free_rpc_ioat_scan_accel_engine(&req);
+ if (rc < 0) {
+ SPDK_ERRLOG("accel_engine_ioat_add_whitelist_devices() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+ }
+
+ accel_engine_ioat_enable_probe();
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("ioat_scan_accel_engine", rpc_ioat_scan_accel_engine, SPDK_RPC_STARTUP)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(ioat_scan_accel_engine, ioat_scan_copy_engine)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(ioat_scan_accel_engine, scan_ioat_copy_engine)
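+
+/* Example request (BDFs illustrative; "params" may be omitted entirely to
+ * probe every channel):
+ * {"jsonrpc": "2.0", "method": "ioat_scan_accel_engine", "id": 1,
+ *  "params": {"pci_whitelist": ["0000:00:04.0", "0000:00:04.1"]}}
+ */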
diff --git a/src/spdk/module/bdev/Makefile b/src/spdk/module/bdev/Makefile
new file mode 100644
index 000000000..2e30470ec
--- /dev/null
+++ b/src/spdk/module/bdev/Makefile
@@ -0,0 +1,61 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y += delay error gpt lvol malloc null nvme passthru raid rpc split zone_block
+
+DIRS-$(CONFIG_CRYPTO) += crypto
+
+DIRS-$(CONFIG_OCF) += ocf
+
+DIRS-$(CONFIG_REDUCE) += compress
+
+DIRS-$(CONFIG_URING) += uring
+
+ifeq ($(OS),Linux)
+DIRS-y += aio ftl
+DIRS-$(CONFIG_ISCSI_INITIATOR) += iscsi
+DIRS-$(CONFIG_VIRTIO) += virtio
+DIRS-$(CONFIG_PMDK) += pmem
+endif
+
+DIRS-$(CONFIG_RBD) += rbd
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/bdev/aio/Makefile b/src/spdk/module/bdev/aio/Makefile
new file mode 100644
index 000000000..9f0e3a582
--- /dev/null
+++ b/src/spdk/module/bdev/aio/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev_aio.c bdev_aio_rpc.c
+LIBNAME = bdev_aio
+LOCAL_SYS_LIBS = -laio
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/aio/bdev_aio.c b/src/spdk/module/bdev/aio/bdev_aio.c
new file mode 100644
index 000000000..4b49fb2c3
--- /dev/null
+++ b/src/spdk/module/bdev/aio/bdev_aio.c
@@ -0,0 +1,827 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_aio.h"
+
+#include "spdk/stdinc.h"
+
+#include "spdk/barrier.h"
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+#include "spdk/conf.h"
+#include "spdk/env.h"
+#include "spdk/fd.h"
+#include "spdk/likely.h"
+#include "spdk/thread.h"
+#include "spdk/json.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+#include <libaio.h>
+
+struct bdev_aio_io_channel {
+ uint64_t io_inflight;
+ struct bdev_aio_group_channel *group_ch;
+};
+
+struct bdev_aio_group_channel {
+ struct spdk_poller *poller;
+ io_context_t io_ctx;
+};
+
+struct bdev_aio_task {
+ struct iocb iocb;
+ uint64_t len;
+ struct bdev_aio_io_channel *ch;
+ TAILQ_ENTRY(bdev_aio_task) link;
+};
+
+struct file_disk {
+ struct bdev_aio_task *reset_task;
+ struct spdk_poller *reset_retry_timer;
+ struct spdk_bdev disk;
+ char *filename;
+ int fd;
+ TAILQ_ENTRY(file_disk) link;
+ bool block_size_override;
+};
+
+/* For user space reaping of completions */
+struct spdk_aio_ring {
+ uint32_t id;
+ uint32_t size;
+ uint32_t head;
+ uint32_t tail;
+
+ uint32_t version;
+ uint32_t compat_features;
+ uint32_t incompat_features;
+ uint32_t header_length;
+};
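+
+/* This mirrors the kernel's internal aio ring layout, letting completions
+ * be reaped directly from user space (see bdev_user_io_getevents() below)
+ * without an io_getevents() syscall in the common case. */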
+
+#define SPDK_AIO_RING_VERSION 0xa10a10a1
+
+static int bdev_aio_initialize(void);
+static void bdev_aio_fini(void);
+static void aio_free_disk(struct file_disk *fdisk);
+static void bdev_aio_get_spdk_running_config(FILE *fp);
+static TAILQ_HEAD(, file_disk) g_aio_disk_head;
+
+#define SPDK_AIO_QUEUE_DEPTH 128
+#define MAX_EVENTS_PER_POLL 32
+
+static int
+bdev_aio_get_ctx_size(void)
+{
+ return sizeof(struct bdev_aio_task);
+}
+
+static struct spdk_bdev_module aio_if = {
+ .name = "aio",
+ .module_init = bdev_aio_initialize,
+ .module_fini = bdev_aio_fini,
+ .config_text = bdev_aio_get_spdk_running_config,
+ .get_ctx_size = bdev_aio_get_ctx_size,
+};
+
+SPDK_BDEV_MODULE_REGISTER(aio, &aio_if)
+
+static int
+bdev_aio_open(struct file_disk *disk)
+{
+ int fd;
+
+ fd = open(disk->filename, O_RDWR | O_DIRECT);
+ if (fd < 0) {
+ /* Try without O_DIRECT for non-disk files */
+ fd = open(disk->filename, O_RDWR);
+ if (fd < 0) {
+ SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
+ disk->filename, errno, spdk_strerror(errno));
+ disk->fd = -1;
+ return -1;
+ }
+ }
+
+ disk->fd = fd;
+
+ return 0;
+}
+
+static int
+bdev_aio_close(struct file_disk *disk)
+{
+ int rc;
+
+ if (disk->fd == -1) {
+ return 0;
+ }
+
+ rc = close(disk->fd);
+ if (rc < 0) {
+ SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
+ disk->fd, errno, spdk_strerror(errno));
+ return -1;
+ }
+
+ disk->fd = -1;
+
+ return 0;
+}
+
+static int64_t
+bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch,
+ struct bdev_aio_task *aio_task,
+ struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
+{
+ struct iocb *iocb = &aio_task->iocb;
+ struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset);
+ iocb->data = aio_task;
+ aio_task->len = nbytes;
+ aio_task->ch = aio_ch;
+
+ SPDK_DEBUGLOG(SPDK_LOG_AIO, "read %d iovs size %lu to off: %#lx\n",
+ iovcnt, nbytes, offset);
+
+ rc = io_submit(aio_ch->group_ch->io_ctx, 1, &iocb);
+ if (rc < 0) {
+ if (rc == -EAGAIN) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+ SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
+ }
+ return -1;
+ }
+ aio_ch->io_inflight++;
+ return nbytes;
+}
+
+static int64_t
+bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch,
+ struct bdev_aio_task *aio_task,
+ struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
+{
+ struct iocb *iocb = &aio_task->iocb;
+ struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset);
+ iocb->data = aio_task;
+ aio_task->len = len;
+ aio_task->ch = aio_ch;
+
+ SPDK_DEBUGLOG(SPDK_LOG_AIO, "write %d iovs size %lu from off: %#lx\n",
+ iovcnt, len, offset);
+
+ rc = io_submit(aio_ch->group_ch->io_ctx, 1, &iocb);
+ if (rc < 0) {
+ if (rc == -EAGAIN) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED);
+ SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc);
+ }
+ return -1;
+ }
+ aio_ch->io_inflight++;
+ return len;
+}
+
+static void
+bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
+{
+ int rc = fsync(fdisk->fd);
+
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task),
+ rc == 0 ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
+}
+
+static int
+bdev_aio_destruct(void *ctx)
+{
+ struct file_disk *fdisk = ctx;
+ int rc = 0;
+
+ TAILQ_REMOVE(&g_aio_disk_head, fdisk, link);
+ rc = bdev_aio_close(fdisk);
+ if (rc < 0) {
+ SPDK_ERRLOG("bdev_aio_close() failed\n");
+ }
+ spdk_io_device_unregister(fdisk, NULL);
+ aio_free_disk(fdisk);
+ return rc;
+}
+
+static int
+bdev_user_io_getevents(io_context_t io_ctx, unsigned int max, struct io_event *uevents)
+{
+ uint32_t head, tail, count;
+ struct spdk_aio_ring *ring;
+ struct timespec timeout;
+ struct io_event *kevents;
+
+ ring = (struct spdk_aio_ring *)io_ctx;
+
+ if (spdk_unlikely(ring->version != SPDK_AIO_RING_VERSION || ring->incompat_features != 0)) {
+ timeout.tv_sec = 0;
+ timeout.tv_nsec = 0;
+
+ return io_getevents(io_ctx, 0, max, uevents, &timeout);
+ }
+
+ /* Read the current state out of the ring */
+ head = ring->head;
+ tail = ring->tail;
+
+ /* This memory barrier is required to prevent the loads above
+ * from being re-ordered with stores to the events array
+ * potentially occurring on other threads. */
+ spdk_smp_rmb();
+
+ /* Calculate how many items are in the circular ring */
+ count = tail - head;
+ if (tail < head) {
+ count += ring->size;
+ }
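+
+ /* Worked example: size=128, head=120, tail=10. tail < head, so the
+ * unsigned wrap-around of (tail - head) plus ring->size yields
+ * count = 18 pending events, copied below in two pieces (8 + 10). */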
+
+ /* Reduce the count to the limit provided by the user */
+ count = spdk_min(max, count);
+
+ /* Grab the memory location of the event array */
+ kevents = (struct io_event *)((uintptr_t)ring + ring->header_length);
+
+ /* Copy the events out of the ring. */
+ if ((head + count) <= ring->size) {
+ /* Only one copy is required */
+ memcpy(uevents, &kevents[head], count * sizeof(struct io_event));
+ } else {
+ uint32_t first_part = ring->size - head;
+ /* Two copies are required */
+ memcpy(uevents, &kevents[head], first_part * sizeof(struct io_event));
+ memcpy(&uevents[first_part], &kevents[0], (count - first_part) * sizeof(struct io_event));
+ }
+
+ /* Update the head pointer. On x86, stores will not be reordered with older loads,
+ * so the copies out of the event array will always be complete prior to this
+ * update becoming visible. On other architectures this is not guaranteed, so
+ * add a barrier. */
+#if defined(__i386__) || defined(__x86_64__)
+ spdk_compiler_barrier();
+#else
+ spdk_smp_mb();
+#endif
+ ring->head = (head + count) % ring->size;
+
+ return count;
+}
+
+static int
+bdev_aio_group_poll(void *arg)
+{
+ struct bdev_aio_group_channel *group_ch = arg;
+ int nr, i = 0;
+ enum spdk_bdev_io_status status;
+ struct bdev_aio_task *aio_task;
+ struct io_event events[SPDK_AIO_QUEUE_DEPTH];
+
+ nr = bdev_user_io_getevents(group_ch->io_ctx, SPDK_AIO_QUEUE_DEPTH, events);
+
+ if (nr < 0) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ for (i = 0; i < nr; i++) {
+ aio_task = events[i].data;
+ if (events[i].res != aio_task->len) {
+ status = SPDK_BDEV_IO_STATUS_FAILED;
+ } else {
+ status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ }
+
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), status);
+ aio_task->ch->io_inflight--;
+ }
+
+ return nr > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
+}
+
+static void
+_bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch);
+
+ if (aio_ch->io_inflight) {
+ spdk_for_each_channel_continue(i, -1);
+ return;
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static int bdev_aio_reset_retry_timer(void *arg);
+
+static void
+_bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i);
+
+ if (status == -1) {
+ fdisk->reset_retry_timer = SPDK_POLLER_REGISTER(bdev_aio_reset_retry_timer, fdisk, 500);
+ return;
+ }
+
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS);
+}
+
+static int
+bdev_aio_reset_retry_timer(void *arg)
+{
+ struct file_disk *fdisk = arg;
+
+ if (fdisk->reset_retry_timer) {
+ spdk_poller_unregister(&fdisk->reset_retry_timer);
+ }
+
+ spdk_for_each_channel(fdisk,
+ _bdev_aio_get_io_inflight,
+ fdisk,
+ _bdev_aio_get_io_inflight_done);
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task)
+{
+ fdisk->reset_task = aio_task;
+
+ bdev_aio_reset_retry_timer(fdisk);
+}
+
+static void
+bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ bdev_aio_readv((struct file_disk *)bdev_io->bdev->ctxt,
+ ch,
+ (struct bdev_aio_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ bdev_aio_writev((struct file_disk *)bdev_io->bdev->ctxt,
+ ch,
+ (struct bdev_aio_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
+ break;
+ default:
+ SPDK_ERRLOG("Wrong io type\n");
+ break;
+ }
+}
+
+static int
+_bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ /* Read and write operations must be performed on buffers aligned to
+ * bdev->required_alignment. If user specified unaligned buffers,
+ * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt,
+ (struct bdev_aio_task *)bdev_io->driver_ctx);
+ return 0;
+
+ case SPDK_BDEV_IO_TYPE_RESET:
+ bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt,
+ (struct bdev_aio_task *)bdev_io->driver_ctx);
+ return 0;
+ default:
+ return -1;
+ }
+}
+
+static void
+bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ if (_bdev_aio_submit_request(ch, bdev_io) < 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static bool
+bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static int
+bdev_aio_create_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_aio_io_channel *ch = ctx_buf;
+
+ ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&aio_if));
+
+ return 0;
+}
+
+static void
+bdev_aio_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_aio_io_channel *ch = ctx_buf;
+
+ spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
+}
+
+static struct spdk_io_channel *
+bdev_aio_get_io_channel(void *ctx)
+{
+ struct file_disk *fdisk = ctx;
+
+ return spdk_get_io_channel(fdisk);
+}
+
+static int
+bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct file_disk *fdisk = ctx;
+
+ spdk_json_write_named_object_begin(w, "aio");
+
+ spdk_json_write_named_string(w, "filename", fdisk->filename);
+
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static void
+bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct file_disk *fdisk = bdev->ctxt;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_aio_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ if (fdisk->block_size_override) {
+ spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
+ }
+ spdk_json_write_named_string(w, "filename", fdisk->filename);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static const struct spdk_bdev_fn_table aio_fn_table = {
+ .destruct = bdev_aio_destruct,
+ .submit_request = bdev_aio_submit_request,
+ .io_type_supported = bdev_aio_io_type_supported,
+ .get_io_channel = bdev_aio_get_io_channel,
+ .dump_info_json = bdev_aio_dump_info_json,
+ .write_config_json = bdev_aio_write_json_config,
+};
+
+static void
+aio_free_disk(struct file_disk *fdisk)
+{
+ if (fdisk == NULL) {
+ return;
+ }
+ free(fdisk->filename);
+ free(fdisk->disk.name);
+ free(fdisk);
+}
+
+static int
+bdev_aio_group_create_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_aio_group_channel *ch = ctx_buf;
+
+ if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) {
+ SPDK_ERRLOG("async I/O context setup failure\n");
+ return -1;
+ }
+
+ ch->poller = SPDK_POLLER_REGISTER(bdev_aio_group_poll, ch, 0);
+ return 0;
+}
+
+static void
+bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_aio_group_channel *ch = ctx_buf;
+
+ io_destroy(ch->io_ctx);
+
+ spdk_poller_unregister(&ch->poller);
+}
+
+int
+create_aio_bdev(const char *name, const char *filename, uint32_t block_size)
+{
+ struct file_disk *fdisk;
+ uint32_t detected_block_size;
+ uint64_t disk_size;
+ int rc;
+
+ fdisk = calloc(1, sizeof(*fdisk));
+ if (!fdisk) {
+ SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n");
+ return -ENOMEM;
+ }
+
+ fdisk->filename = strdup(filename);
+ if (!fdisk->filename) {
+ rc = -ENOMEM;
+ goto error_return;
+ }
+
+ if (bdev_aio_open(fdisk)) {
+ SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno);
+ rc = -errno;
+ goto error_return;
+ }
+
+ disk_size = spdk_fd_get_size(fdisk->fd);
+
+ fdisk->disk.name = strdup(name);
+ if (!fdisk->disk.name) {
+ rc = -ENOMEM;
+ goto error_return;
+ }
+ fdisk->disk.product_name = "AIO disk";
+ fdisk->disk.module = &aio_if;
+
+ fdisk->disk.write_cache = 1;
+
+ detected_block_size = spdk_fd_get_blocklen(fdisk->fd);
+ if (block_size == 0) {
+ /* User did not specify block size - use autodetected block size. */
+ if (detected_block_size == 0) {
+ SPDK_ERRLOG("Block size could not be auto-detected\n");
+ rc = -EINVAL;
+ goto error_return;
+ }
+ fdisk->block_size_override = false;
+ block_size = detected_block_size;
+ } else {
+ if (block_size < detected_block_size) {
+ SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
+ "auto-detected block size %" PRIu32 "\n",
+ block_size, detected_block_size);
+ rc = -EINVAL;
+ goto error_return;
+ } else if (detected_block_size != 0 && block_size != detected_block_size) {
+ SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
+ "auto-detected block size %" PRIu32 "\n",
+ block_size, detected_block_size);
+ }
+ fdisk->block_size_override = true;
+ }
+
+ if (block_size < 512) {
+ SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
+ rc = -EINVAL;
+ goto error_return;
+ }
+
+ if (!spdk_u32_is_pow2(block_size)) {
+ SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
+ rc = -EINVAL;
+ goto error_return;
+ }
+
+ fdisk->disk.blocklen = block_size;
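+ /* When the user overrides the logical block size, keep the required
+ * alignment at the device's native block size so O_DIRECT I/O on the
+ * backing file still succeeds. */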
+ if (fdisk->block_size_override && detected_block_size) {
+ fdisk->disk.required_alignment = spdk_u32log2(detected_block_size);
+ } else {
+ fdisk->disk.required_alignment = spdk_u32log2(block_size);
+ }
+
+ if (disk_size % fdisk->disk.blocklen != 0) {
+ SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
+ disk_size, fdisk->disk.blocklen);
+ rc = -EINVAL;
+ goto error_return;
+ }
+
+ fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen;
+ fdisk->disk.ctxt = fdisk;
+
+ fdisk->disk.fn_table = &aio_fn_table;
+
+ spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb,
+ sizeof(struct bdev_aio_io_channel),
+ fdisk->disk.name);
+ rc = spdk_bdev_register(&fdisk->disk);
+ if (rc) {
+ spdk_io_device_unregister(fdisk, NULL);
+ goto error_return;
+ }
+
+ TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link);
+ return 0;
+
+error_return:
+ bdev_aio_close(fdisk);
+ aio_free_disk(fdisk);
+ return rc;
+}
+
+struct delete_aio_bdev_ctx {
+ delete_aio_bdev_complete cb_fn;
+ void *cb_arg;
+};
+
+static void
+aio_bdev_unregister_cb(void *arg, int bdeverrno)
+{
+ struct delete_aio_bdev_ctx *ctx = arg;
+
+ ctx->cb_fn(ctx->cb_arg, bdeverrno);
+ free(ctx);
+}
+
+void
+bdev_aio_delete(struct spdk_bdev *bdev, delete_aio_bdev_complete cb_fn, void *cb_arg)
+{
+ struct delete_aio_bdev_ctx *ctx;
+
+ if (!bdev || bdev->module != &aio_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ spdk_bdev_unregister(bdev, aio_bdev_unregister_cb, ctx);
+}
+
+static int
+bdev_aio_initialize(void)
+{
+ size_t i;
+ struct spdk_conf_section *sp;
+ int rc = 0;
+
+ TAILQ_INIT(&g_aio_disk_head);
+ spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb,
+ sizeof(struct bdev_aio_group_channel),
+ "aio_module");
+
+ sp = spdk_conf_find_section(NULL, "AIO");
+ if (!sp) {
+ return 0;
+ }
+
+ i = 0;
+ while (true) {
+ const char *file;
+ const char *name;
+ const char *block_size_str;
+ uint32_t block_size = 0;
+ long int tmp;
+
+ file = spdk_conf_section_get_nmval(sp, "AIO", i, 0);
+ if (!file) {
+ break;
+ }
+
+ name = spdk_conf_section_get_nmval(sp, "AIO", i, 1);
+ if (!name) {
+ SPDK_ERRLOG("No name provided for AIO disk with file %s\n", file);
+ i++;
+ continue;
+ }
+
+ block_size_str = spdk_conf_section_get_nmval(sp, "AIO", i, 2);
+ if (block_size_str) {
+ tmp = spdk_strtol(block_size_str, 10);
+ if (tmp < 0) {
+ SPDK_ERRLOG("Invalid block size for AIO disk with file %s\n", file);
+ i++;
+ continue;
+ }
+ block_size = (uint32_t)tmp;
+ }
+
+ rc = create_aio_bdev(name, file, block_size);
+ if (rc) {
+ SPDK_ERRLOG("Unable to create AIO bdev from file %s, err is %s\n", file, spdk_strerror(-rc));
+ }
+
+ i++;
+ }
+
+ return 0;
+}
+
+static void
+bdev_aio_fini(void)
+{
+ spdk_io_device_unregister(&aio_if, NULL);
+}
+
+static void
+bdev_aio_get_spdk_running_config(FILE *fp)
+{
+ char *file;
+ char *name;
+ uint32_t block_size;
+ struct file_disk *fdisk;
+
+ fprintf(fp,
+ "\n"
+ "# Users must change this section to match the /dev/sdX devices to be\n"
+ "# exported as iSCSI LUNs. The devices are accessed using Linux AIO.\n"
+ "# The format is:\n"
+ "# AIO <file name> <bdev name> [<block size>]\n"
+ "# The file name is the backing device\n"
+ "# The bdev name can be referenced from elsewhere in the configuration file.\n"
+ "# Block size may be omitted to automatically detect the block size of a disk.\n"
+ "[AIO]\n");
+
+ TAILQ_FOREACH(fdisk, &g_aio_disk_head, link) {
+ file = fdisk->filename;
+ name = fdisk->disk.name;
+ block_size = fdisk->disk.blocklen;
+ fprintf(fp, " AIO %s %s ", file, name);
+ if (fdisk->block_size_override) {
+ fprintf(fp, "%d", block_size);
+ }
+ fprintf(fp, "\n");
+ }
+ fprintf(fp, "\n");
+}
+
+SPDK_LOG_REGISTER_COMPONENT("aio", SPDK_LOG_AIO)
diff --git a/src/spdk/module/bdev/aio/bdev_aio.h b/src/spdk/module/bdev/aio/bdev_aio.h
new file mode 100644
index 000000000..9ba425946
--- /dev/null
+++ b/src/spdk/module/bdev/aio/bdev_aio.h
@@ -0,0 +1,46 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_AIO_H
+#define SPDK_BDEV_AIO_H
+
+#include "spdk/stdinc.h"
+#include "spdk/bdev.h"
+
+typedef void (*delete_aio_bdev_complete)(void *cb_arg, int bdeverrno);
+
+int create_aio_bdev(const char *name, const char *filename, uint32_t block_size);
+
+void bdev_aio_delete(struct spdk_bdev *bdev, delete_aio_bdev_complete cb_fn, void *cb_arg);
+
+#endif /* SPDK_BDEV_AIO_H */
diff --git a/src/spdk/module/bdev/aio/bdev_aio_rpc.c b/src/spdk/module/bdev/aio/bdev_aio_rpc.c
new file mode 100644
index 000000000..0968b8d76
--- /dev/null
+++ b/src/spdk/module/bdev/aio/bdev_aio_rpc.c
@@ -0,0 +1,148 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_aio.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+struct rpc_construct_aio {
+ char *name;
+ char *filename;
+ uint32_t block_size;
+};
+
+static void
+free_rpc_construct_aio(struct rpc_construct_aio *req)
+{
+ free(req->name);
+ free(req->filename);
+}
+
+static const struct spdk_json_object_decoder rpc_construct_aio_decoders[] = {
+ {"name", offsetof(struct rpc_construct_aio, name), spdk_json_decode_string},
+ {"filename", offsetof(struct rpc_construct_aio, filename), spdk_json_decode_string},
+ {"block_size", offsetof(struct rpc_construct_aio, block_size), spdk_json_decode_uint32, true},
+};
+
+static void
+rpc_bdev_aio_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_aio req = {};
+ struct spdk_json_write_ctx *w;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_construct_aio_decoders,
+ SPDK_COUNTOF(rpc_construct_aio_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = create_aio_bdev(req.name, req.filename, req.block_size);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, req.name);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_construct_aio(&req);
+}
+SPDK_RPC_REGISTER("bdev_aio_create", rpc_bdev_aio_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_aio_create, construct_aio_bdev)
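+
+/* Example request (name, file and block size illustrative; "block_size"
+ * may be omitted to auto-detect):
+ * {"jsonrpc": "2.0", "method": "bdev_aio_create", "id": 1,
+ *  "params": {"name": "aio0", "filename": "/dev/sdb", "block_size": 512}}
+ */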
+
+struct rpc_delete_aio {
+ char *name;
+};
+
+static void
+free_rpc_delete_aio(struct rpc_delete_aio *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_aio_decoders[] = {
+ {"name", offsetof(struct rpc_delete_aio, name), spdk_json_decode_string},
+};
+
+static void
+_rpc_bdev_aio_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_aio_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_aio req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_aio_decoders,
+ SPDK_COUNTOF(rpc_delete_aio_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ bdev_aio_delete(bdev, _rpc_bdev_aio_delete_cb, request);
+
+ free_rpc_delete_aio(&req);
+
+ return;
+
+cleanup:
+ free_rpc_delete_aio(&req);
+}
+SPDK_RPC_REGISTER("bdev_aio_delete", rpc_bdev_aio_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_aio_delete, delete_aio_bdev)
diff --git a/src/spdk/module/bdev/compress/Makefile b/src/spdk/module/bdev/compress/Makefile
new file mode 100644
index 000000000..e3d889e67
--- /dev/null
+++ b/src/spdk/module/bdev/compress/Makefile
@@ -0,0 +1,48 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/
+
+C_SRCS = vbdev_compress.c vbdev_compress_rpc.c
+LIBNAME = bdev_compress
+CFLAGS += $(ENV_CFLAGS)
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/compress/vbdev_compress.c b/src/spdk/module/bdev/compress/vbdev_compress.c
new file mode 100644
index 000000000..a83c97c64
--- /dev/null
+++ b/src/spdk/module/bdev/compress/vbdev_compress.c
@@ -0,0 +1,1865 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vbdev_compress.h"
+
+#include "spdk/reduce.h"
+#include "spdk/stdinc.h"
+#include "spdk/rpc.h"
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/util.h"
+#include "spdk/bdev_module.h"
+
+#include "spdk_internal/log.h"
+
+#include <rte_config.h>
+#include <rte_bus_vdev.h>
+#include <rte_compressdev.h>
+#include <rte_comp.h>
+
+#define NUM_MAX_XFORMS 2
+#define NUM_MAX_INFLIGHT_OPS 128
+#define DEFAULT_WINDOW_SIZE 15
+/* We need extra mbufs per operation to accommodate host buffers that
+ * span a 2MB boundary.
+ */
+#define MAX_MBUFS_PER_OP (REDUCE_MAX_IOVECS * 2)
+#define CHUNK_SIZE (1024 * 16)
+#define COMP_BDEV_NAME "compress"
+#define BACKING_IO_SZ (4 * 1024)
+
+#define ISAL_PMD "compress_isal"
+#define QAT_PMD "compress_qat"
+#define NUM_MBUFS 8192
+#define POOL_CACHE_SIZE 256
+
+static enum compress_pmd g_opts;
+
+/* Global list of available compression devices. */
+struct compress_dev {
+ struct rte_compressdev_info cdev_info; /* includes device friendly name */
+ uint8_t cdev_id; /* identifier for the device */
+ void *comp_xform; /* shared private xform for comp on this PMD */
+ void *decomp_xform; /* shared private xform for decomp on this PMD */
+ TAILQ_ENTRY(compress_dev) link;
+};
+static TAILQ_HEAD(, compress_dev) g_compress_devs = TAILQ_HEAD_INITIALIZER(g_compress_devs);
+
+/* Although the ISAL PMD reports 'unlimited' qpairs, it has a de facto limit
+ * of 99: beyond that, the internal ring name it generates exceeds a length
+ * limit in the generic ring code and qp initialization fails.
+ */
+#define MAX_NUM_QP 99
+/* Global list and lock for unique device/queue pair combos */
+struct comp_device_qp {
+ struct compress_dev *device; /* ptr to compression device */
+ uint8_t qp; /* queue pair for this node */
+ struct spdk_thread *thread; /* thread that this qp is assigned to */
+ TAILQ_ENTRY(comp_device_qp) link;
+};
+static TAILQ_HEAD(, comp_device_qp) g_comp_device_qp = TAILQ_HEAD_INITIALIZER(g_comp_device_qp);
+static pthread_mutex_t g_comp_device_qp_lock = PTHREAD_MUTEX_INITIALIZER;
+
+/* For queueing up compression operations that we can't submit for some reason */
+struct vbdev_comp_op {
+ struct spdk_reduce_backing_dev *backing_dev;
+ struct iovec *src_iovs;
+ int src_iovcnt;
+ struct iovec *dst_iovs;
+ int dst_iovcnt;
+ bool compress;
+ void *cb_arg;
+ TAILQ_ENTRY(vbdev_comp_op) link;
+};
+
+struct vbdev_comp_delete_ctx {
+ spdk_delete_compress_complete cb_fn;
+ void *cb_arg;
+ int cb_rc;
+ struct spdk_thread *orig_thread;
+};
+
+/* List of virtual bdevs and associated info for each. */
+struct vbdev_compress {
+ struct spdk_bdev *base_bdev; /* the thing we're attaching to */
+ struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
+ struct spdk_io_channel *base_ch; /* IO channel of base device */
+ struct spdk_bdev comp_bdev; /* the compression virtual bdev */
+ struct comp_io_channel *comp_ch; /* channel associated with this bdev */
+ char *drv_name; /* name of the compression device driver */
+ struct comp_device_qp *device_qp;
+ struct spdk_thread *reduce_thread;
+ pthread_mutex_t reduce_lock;
+ uint32_t ch_count;
+ TAILQ_HEAD(, spdk_bdev_io) pending_comp_ios; /* outstanding operations to a comp library */
+ struct spdk_poller *poller; /* completion poller */
+ struct spdk_reduce_vol_params params; /* params for the reduce volume */
+ struct spdk_reduce_backing_dev backing_dev; /* backing device info for the reduce volume */
+ struct spdk_reduce_vol *vol; /* the reduce volume */
+ struct vbdev_comp_delete_ctx *delete_ctx;
+ bool orphaned; /* base bdev claimed but comp_bdev not registered */
+ int reduce_errno;
+ TAILQ_HEAD(, vbdev_comp_op) queued_comp_ops;
+ TAILQ_ENTRY(vbdev_compress) link;
+ struct spdk_thread *thread; /* thread where base device is opened */
+};
+static TAILQ_HEAD(, vbdev_compress) g_vbdev_comp = TAILQ_HEAD_INITIALIZER(g_vbdev_comp);
+
+/* The comp vbdev channel struct. It is allocated and freed on our behalf by the io channel code.
+ */
+struct comp_io_channel {
+ struct spdk_io_channel_iter *iter; /* used with for_each_channel in reset */
+};
+
+/* Per I/O context for the compression vbdev. */
+struct comp_bdev_io {
+ struct comp_io_channel *comp_ch; /* used in completion handling */
+ struct vbdev_compress *comp_bdev; /* vbdev associated with this IO */
+ struct spdk_bdev_io_wait_entry bdev_io_wait; /* for bdev_io_wait */
+ struct spdk_bdev_io *orig_io; /* the original IO */
+ struct spdk_io_channel *ch; /* for resubmission */
+ int status; /* save for completion on orig thread */
+};
+
+/* Shared mempools between all devices on this system */
+static struct rte_mempool *g_mbuf_mp = NULL; /* mbuf mempool */
+static struct rte_mempool *g_comp_op_mp = NULL; /* comp operations, must be rte* mempool */
+static struct rte_mbuf_ext_shared_info g_shinfo = {}; /* used by DPDK mbuf macros */
+static bool g_qat_available = false;
+static bool g_isal_available = false;
+
+/* Create shared (between all ops per PMD) compress xforms. */
+static struct rte_comp_xform g_comp_xform = {
+ .type = RTE_COMP_COMPRESS,
+ .compress = {
+ .algo = RTE_COMP_ALGO_DEFLATE,
+ .deflate.huffman = RTE_COMP_HUFFMAN_DEFAULT,
+ .level = RTE_COMP_LEVEL_MAX,
+ .window_size = DEFAULT_WINDOW_SIZE,
+ .chksum = RTE_COMP_CHECKSUM_NONE,
+ .hash_algo = RTE_COMP_HASH_ALGO_NONE
+ }
+};
+/* Create shared (between all ops per PMD) decompress xforms. */
+static struct rte_comp_xform g_decomp_xform = {
+ .type = RTE_COMP_DECOMPRESS,
+ .decompress = {
+ .algo = RTE_COMP_ALGO_DEFLATE,
+ .chksum = RTE_COMP_CHECKSUM_NONE,
+ .window_size = DEFAULT_WINDOW_SIZE,
+ .hash_algo = RTE_COMP_HASH_ALGO_NONE
+ }
+};
+
+static void vbdev_compress_examine(struct spdk_bdev *bdev);
+static void vbdev_compress_claim(struct vbdev_compress *comp_bdev);
+static void vbdev_compress_queue_io(struct spdk_bdev_io *bdev_io);
+struct vbdev_compress *_prepare_for_load_init(struct spdk_bdev *bdev, uint32_t lb_size);
+static void vbdev_compress_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
+static void comp_bdev_ch_destroy_cb(void *io_device, void *ctx_buf);
+static void vbdev_compress_delete_done(void *cb_arg, int bdeverrno);
+
+/* Dummy function used by DPDK to free ext attached buffers to mbufs;
+ * we free them ourselves, but this callback has to be here.
+ */
+static void
+shinfo_free_cb(void *arg1, void *arg2)
+{
+}
+
+/* Called by vbdev_init_compress_drivers() to init each discovered compression device */
+static int
+create_compress_dev(uint8_t index)
+{
+ struct compress_dev *device;
+ uint16_t q_pairs;
+ uint8_t cdev_id;
+ int rc, i;
+ struct comp_device_qp *dev_qp;
+ struct comp_device_qp *tmp_qp;
+
+ device = calloc(1, sizeof(struct compress_dev));
+ if (!device) {
+ return -ENOMEM;
+ }
+
+ /* Get details about this device. */
+ rte_compressdev_info_get(index, &device->cdev_info);
+
+ cdev_id = device->cdev_id = index;
+
+ /* Zero means no limit, so cap at MAX_NUM_QP. */
+ if (device->cdev_info.max_nb_queue_pairs == 0) {
+ q_pairs = MAX_NUM_QP;
+ } else {
+ q_pairs = spdk_min(device->cdev_info.max_nb_queue_pairs, MAX_NUM_QP);
+ }
+
+ /* Configure the compression device. */
+ struct rte_compressdev_config config = {
+ .socket_id = rte_socket_id(),
+ .nb_queue_pairs = q_pairs,
+ .max_nb_priv_xforms = NUM_MAX_XFORMS,
+ .max_nb_streams = 0
+ };
+ rc = rte_compressdev_configure(cdev_id, &config);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to configure compressdev %u\n", cdev_id);
+ goto err;
+ }
+
+ /* Pre-setup all potential qpairs now and assign them in the channel
+ * callback.
+ */
+ for (i = 0; i < q_pairs; i++) {
+ rc = rte_compressdev_queue_pair_setup(cdev_id, i,
+ NUM_MAX_INFLIGHT_OPS,
+ rte_socket_id());
+ if (rc) {
+ if (i > 0) {
+ q_pairs = i;
+ SPDK_NOTICELOG("FYI failed to setup a queue pair on "
+ "compressdev %u with error %u "
+ "so limiting to %u qpairs\n",
+ cdev_id, rc, q_pairs);
+ break;
+ } else {
+ SPDK_ERRLOG("Failed to setup queue pair on "
+ "compressdev %u with error %u\n", cdev_id, rc);
+ rc = -EINVAL;
+ goto err;
+ }
+ }
+ }
+
+ rc = rte_compressdev_start(cdev_id);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to start device %u: error %d\n",
+ cdev_id, rc);
+ goto err;
+ }
+
+ if (device->cdev_info.capabilities->comp_feature_flags & RTE_COMP_FF_SHAREABLE_PRIV_XFORM) {
+ rc = rte_compressdev_private_xform_create(cdev_id, &g_comp_xform,
+ &device->comp_xform);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to create private comp xform device %u: error %d\n",
+ cdev_id, rc);
+ goto err;
+ }
+
+ rc = rte_compressdev_private_xform_create(cdev_id, &g_decomp_xform,
+ &device->decomp_xform);
+ if (rc) {
+ SPDK_ERRLOG("Failed to create private decomp xform device %u: error %d\n",
+ cdev_id, rc);
+ goto err;
+ }
+ } else {
+ SPDK_ERRLOG("PMD does not support shared transforms\n");
+ goto err;
+ }
+
+ /* Build up list of device/qp combinations */
+ for (i = 0; i < q_pairs; i++) {
+ dev_qp = calloc(1, sizeof(struct comp_device_qp));
+ if (!dev_qp) {
+ rc = -ENOMEM;
+ goto err;
+ }
+ dev_qp->device = device;
+ dev_qp->qp = i;
+ dev_qp->thread = NULL;
+ TAILQ_INSERT_TAIL(&g_comp_device_qp, dev_qp, link);
+ }
+
+ TAILQ_INSERT_TAIL(&g_compress_devs, device, link);
+
+ if (strcmp(device->cdev_info.driver_name, QAT_PMD) == 0) {
+ g_qat_available = true;
+ }
+ if (strcmp(device->cdev_info.driver_name, ISAL_PMD) == 0) {
+ g_isal_available = true;
+ }
+
+ return 0;
+
+err:
+ TAILQ_FOREACH_SAFE(dev_qp, &g_comp_device_qp, link, tmp_qp) {
+ TAILQ_REMOVE(&g_comp_device_qp, dev_qp, link);
+ free(dev_qp);
+ }
+ free(device);
+ return rc;
+}
+
+/* Called from driver init entry point, vbdev_compress_init() */
+static int
+vbdev_init_compress_drivers(void)
+{
+ uint8_t cdev_count, i;
+ struct compress_dev *tmp_dev;
+ struct compress_dev *device;
+ int rc;
+
+ /* We always init the compress_isal PMD */
+ rc = rte_vdev_init(ISAL_PMD, NULL);
+ if (rc == 0) {
+ SPDK_NOTICELOG("created virtual PMD %s\n", ISAL_PMD);
+ } else if (rc == -EEXIST) {
+ SPDK_NOTICELOG("virtual PMD %s already exists.\n", ISAL_PMD);
+ } else {
+ SPDK_ERRLOG("creating virtual PMD %s\n", ISAL_PMD);
+ return -EINVAL;
+ }
+
+ /* If we have no compression devices, there's no reason to continue. */
+ cdev_count = rte_compressdev_count();
+ if (cdev_count == 0) {
+ return 0;
+ }
+ if (cdev_count > RTE_COMPRESS_MAX_DEVS) {
+ SPDK_ERRLOG("invalid device count from rte_compressdev_count()\n");
+ return -EINVAL;
+ }
+
+ g_mbuf_mp = rte_pktmbuf_pool_create("comp_mbuf_mp", NUM_MBUFS, POOL_CACHE_SIZE,
+ sizeof(struct rte_mbuf), 0, rte_socket_id());
+ if (g_mbuf_mp == NULL) {
+ SPDK_ERRLOG("Cannot create mbuf pool\n");
+ rc = -ENOMEM;
+ goto error_create_mbuf;
+ }
+
+ g_comp_op_mp = rte_comp_op_pool_create("comp_op_pool", NUM_MBUFS, POOL_CACHE_SIZE,
+ 0, rte_socket_id());
+ if (g_comp_op_mp == NULL) {
+ SPDK_ERRLOG("Cannot create comp op pool\n");
+ rc = -ENOMEM;
+ goto error_create_op;
+ }
+
+ /* Init all devices */
+ for (i = 0; i < cdev_count; i++) {
+ rc = create_compress_dev(i);
+ if (rc != 0) {
+ goto error_create_compress_devs;
+ }
+ }
+
+ if (g_qat_available == true) {
+ SPDK_NOTICELOG("initialized QAT PMD\n");
+ }
+
+ g_shinfo.free_cb = shinfo_free_cb;
+
+ return 0;
+
+ /* Error cleanup paths. */
+error_create_compress_devs:
+ TAILQ_FOREACH_SAFE(device, &g_compress_devs, link, tmp_dev) {
+ TAILQ_REMOVE(&g_compress_devs, device, link);
+ free(device);
+ }
+error_create_op:
+error_create_mbuf:
+	rte_mempool_free(g_comp_op_mp);
+	rte_mempool_free(g_mbuf_mp);
+
+ return rc;
+}
+
+/* for completing rw requests on the orig IO thread. */
+static void
+_reduce_rw_blocks_cb(void *arg)
+{
+ struct comp_bdev_io *io_ctx = arg;
+
+ if (io_ctx->status == 0) {
+ spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ SPDK_ERRLOG("status %d on operation from reduce API\n", io_ctx->status);
+ spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+/* Completion callback for r/w that were issued via reducelib. */
+static void
+reduce_rw_blocks_cb(void *arg, int reduce_errno)
+{
+ struct spdk_bdev_io *bdev_io = arg;
+ struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx;
+ struct spdk_io_channel *ch = spdk_io_channel_from_ctx(io_ctx->comp_ch);
+ struct spdk_thread *orig_thread;
+
+ /* TODO: need to decide which error codes are bdev_io success vs failure;
+ * example examine calls reading metadata */
+
+ io_ctx->status = reduce_errno;
+
+ /* Send this request to the orig IO thread. */
+ orig_thread = spdk_io_channel_get_thread(ch);
+ if (orig_thread != spdk_get_thread()) {
+ spdk_thread_send_msg(orig_thread, _reduce_rw_blocks_cb, io_ctx);
+ } else {
+ _reduce_rw_blocks_cb(io_ctx);
+ }
+}
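+
+/* The send-to-original-thread pattern above recurs throughout this module:
+ * reduce and compressdev work happens on the comp_bdev's reduce_thread,
+ * while each bdev_io must be completed on the thread that submitted it.
+ */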
+
+static int
+_setup_compress_mbuf(struct rte_mbuf **mbufs, int *mbuf_total, uint64_t *total_length,
+ struct iovec *iovs, int iovcnt, void *reduce_cb_arg)
+{
+ uint64_t updated_length, remainder, phys_addr;
+ uint8_t *current_base = NULL;
+ int iov_index, mbuf_index;
+ int rc = 0;
+
+ /* Setup mbufs */
+ iov_index = mbuf_index = 0;
+ while (iov_index < iovcnt) {
+
+ current_base = iovs[iov_index].iov_base;
+ if (total_length) {
+ *total_length += iovs[iov_index].iov_len;
+ }
+ assert(mbufs[mbuf_index] != NULL);
+ mbufs[mbuf_index]->userdata = reduce_cb_arg;
+ updated_length = iovs[iov_index].iov_len;
+ phys_addr = spdk_vtophys((void *)current_base, &updated_length);
+
+ rte_pktmbuf_attach_extbuf(mbufs[mbuf_index],
+ current_base,
+ phys_addr,
+ updated_length,
+ &g_shinfo);
+ rte_pktmbuf_append(mbufs[mbuf_index], updated_length);
+ remainder = iovs[iov_index].iov_len - updated_length;
+
+ if (mbuf_index > 0) {
+ rte_pktmbuf_chain(mbufs[0], mbufs[mbuf_index]);
+ }
+
+ /* If we crossed a 2MB boundary we need another mbuf for the remainder */
+ if (remainder > 0) {
+ /* allocate an mbuf at the end of the array */
+ rc = rte_pktmbuf_alloc_bulk(g_mbuf_mp,
+ (struct rte_mbuf **)&mbufs[*mbuf_total], 1);
+ if (rc) {
+ SPDK_ERRLOG("ERROR trying to get an extra mbuf!\n");
+ return -1;
+ }
+ (*mbuf_total)++;
+ mbuf_index++;
+ mbufs[mbuf_index]->userdata = reduce_cb_arg;
+ current_base += updated_length;
+ phys_addr = spdk_vtophys((void *)current_base, &remainder);
+ /* assert we don't cross another 2MB boundary */
+ assert(remainder == iovs[iov_index].iov_len - updated_length);
+
+ rte_pktmbuf_attach_extbuf(mbufs[mbuf_index],
+ current_base,
+ phys_addr,
+ remainder,
+ &g_shinfo);
+ rte_pktmbuf_append(mbufs[mbuf_index], remainder);
+ rte_pktmbuf_chain(mbufs[0], mbufs[mbuf_index]);
+ }
+ iov_index++;
+ mbuf_index++;
+ }
+
+ return 0;
+}
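+
+/* Example of the split handled above: a 16KB iov whose buffer begins 4KB below
+ * a 2MB hugepage boundary gets one mbuf for the first 4KB (spdk_vtophys()
+ * shortens updated_length at the boundary) plus a second, newly allocated mbuf
+ * for the remaining 12KB, with both chained onto mbufs[0].
+ */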
+
+static int
+_compress_operation(struct spdk_reduce_backing_dev *backing_dev, struct iovec *src_iovs,
+ int src_iovcnt, struct iovec *dst_iovs,
+ int dst_iovcnt, bool compress, void *cb_arg)
+{
+ void *reduce_cb_arg = cb_arg;
+ struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(backing_dev, struct vbdev_compress,
+ backing_dev);
+ struct rte_comp_op *comp_op;
+ struct rte_mbuf *src_mbufs[MAX_MBUFS_PER_OP];
+ struct rte_mbuf *dst_mbufs[MAX_MBUFS_PER_OP];
+ uint8_t cdev_id = comp_bdev->device_qp->device->cdev_id;
+ uint64_t total_length = 0;
+ int rc = 0;
+ struct vbdev_comp_op *op_to_queue;
+ int i;
+ int src_mbuf_total = src_iovcnt;
+ int dst_mbuf_total = dst_iovcnt;
+ bool device_error = false;
+
+ assert(src_iovcnt < MAX_MBUFS_PER_OP);
+
+#ifdef DEBUG
+ memset(src_mbufs, 0, sizeof(src_mbufs));
+ memset(dst_mbufs, 0, sizeof(dst_mbufs));
+#endif
+
+ comp_op = rte_comp_op_alloc(g_comp_op_mp);
+ if (!comp_op) {
+ SPDK_ERRLOG("trying to get a comp op!\n");
+ goto error_get_op;
+ }
+
+ /* get an mbuf per iov, src and dst */
+ rc = rte_pktmbuf_alloc_bulk(g_mbuf_mp, (struct rte_mbuf **)&src_mbufs[0], src_iovcnt);
+ if (rc) {
+ SPDK_ERRLOG("ERROR trying to get src_mbufs!\n");
+ goto error_get_src;
+ }
+
+ rc = rte_pktmbuf_alloc_bulk(g_mbuf_mp, (struct rte_mbuf **)&dst_mbufs[0], dst_iovcnt);
+ if (rc) {
+ SPDK_ERRLOG("ERROR trying to get dst_mbufs!\n");
+ goto error_get_dst;
+ }
+
+ /* There is a 1:1 mapping between a bdev_io and a compression operation; all
+ * compression PMDs that SPDK uses support mbuf chaining, so we build our mbuf
+ * chain and associate it with our single comp_op.
+ */
+
+ rc = _setup_compress_mbuf(&src_mbufs[0], &src_mbuf_total, &total_length,
+ src_iovs, src_iovcnt, reduce_cb_arg);
+ if (rc < 0) {
+ goto error_src_dst;
+ }
+
+ comp_op->m_src = src_mbufs[0];
+ comp_op->src.offset = 0;
+ comp_op->src.length = total_length;
+
+ /* setup dst mbufs; for the current use of this code there is only one vector */
+ rc = _setup_compress_mbuf(&dst_mbufs[0], &dst_mbuf_total, NULL,
+ dst_iovs, dst_iovcnt, reduce_cb_arg);
+ if (rc < 0) {
+ goto error_src_dst;
+ }
+
+ comp_op->m_dst = dst_mbufs[0];
+ comp_op->dst.offset = 0;
+
+ if (compress == true) {
+ comp_op->private_xform = comp_bdev->device_qp->device->comp_xform;
+ } else {
+ comp_op->private_xform = comp_bdev->device_qp->device->decomp_xform;
+ }
+
+ comp_op->op_type = RTE_COMP_OP_STATELESS;
+ comp_op->flush_flag = RTE_COMP_FLUSH_FINAL;
+
+ rc = rte_compressdev_enqueue_burst(cdev_id, comp_bdev->device_qp->qp, &comp_op, 1);
+ assert(rc <= 1);
+
+ /* We always expect 1 op to get enqueued; if 0 were, we need to queue it up ourselves. */
+ if (rc == 1) {
+ return 0;
+ } else if (comp_op->status == RTE_COMP_OP_STATUS_NOT_PROCESSED) {
+ /* we free mbufs differently depending on whether they were chained or not */
+ rte_pktmbuf_free(comp_op->m_src);
+ rte_pktmbuf_free(comp_op->m_dst);
+ goto error_enqueue;
+ } else {
+ device_error = true;
+ goto error_src_dst;
+ }
+
+ /* Error cleanup paths. */
+error_src_dst:
+ for (i = 0; i < dst_mbuf_total; i++) {
+ rte_pktmbuf_free(dst_mbufs[i]);
+ }
+error_get_dst:
+ for (i = 0; i < src_mbuf_total; i++) {
+ rte_pktmbuf_free(src_mbufs[i]);
+ }
+error_get_src:
+error_enqueue:
+ if (device_error == true) {
+ /* There was an error sending the op to the device, most
+ * likely with the parameters. Read the status before the
+ * op is returned to its pool.
+ */
+ SPDK_ERRLOG("Compression API returned 0x%x\n", comp_op->status);
+ rte_comp_op_free(comp_op);
+ return -EINVAL;
+ }
+ rte_comp_op_free(comp_op);
+error_get_op:
+
+ op_to_queue = calloc(1, sizeof(struct vbdev_comp_op));
+ if (op_to_queue == NULL) {
+ SPDK_ERRLOG("unable to allocate operation for queueing.\n");
+ return -ENOMEM;
+ }
+ op_to_queue->backing_dev = backing_dev;
+ op_to_queue->src_iovs = src_iovs;
+ op_to_queue->src_iovcnt = src_iovcnt;
+ op_to_queue->dst_iovs = dst_iovs;
+ op_to_queue->dst_iovcnt = dst_iovcnt;
+ op_to_queue->compress = compress;
+ op_to_queue->cb_arg = cb_arg;
+ TAILQ_INSERT_TAIL(&comp_bdev->queued_comp_ops,
+ op_to_queue,
+ link);
+ return 0;
+}
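+
+/* To summarize _compress_operation(): an op is either accepted by the device
+ * (the enqueue returns 1), placed on queued_comp_ops for the poller to resubmit
+ * (device busy or out of ops/mbufs), or failed with -EINVAL on a device error.
+ */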
+
+/* Poller for the DPDK compression driver. */
+static int
+comp_dev_poller(void *args)
+{
+ struct vbdev_compress *comp_bdev = args;
+ uint8_t cdev_id = comp_bdev->device_qp->device->cdev_id;
+ struct rte_comp_op *deq_ops[NUM_MAX_INFLIGHT_OPS];
+ uint16_t num_deq;
+ struct spdk_reduce_vol_cb_args *reduce_args;
+ struct vbdev_comp_op *op_to_resubmit;
+ int rc, i;
+
+ num_deq = rte_compressdev_dequeue_burst(cdev_id, comp_bdev->device_qp->qp, deq_ops,
+ NUM_MAX_INFLIGHT_OPS);
+ for (i = 0; i < num_deq; i++) {
+ reduce_args = (struct spdk_reduce_vol_cb_args *)deq_ops[i]->m_src->userdata;
+
+ if (deq_ops[i]->status == RTE_COMP_OP_STATUS_SUCCESS) {
+
+ /* tell reduce this is done and what the bytecount was */
+ reduce_args->cb_fn(reduce_args->cb_arg, deq_ops[i]->produced);
+ } else {
+ SPDK_NOTICELOG("FYI storing data uncompressed due to deque status %u\n",
+ deq_ops[i]->status);
+
+ /* Reduce will simply store uncompressed on neg errno value. */
+ reduce_args->cb_fn(reduce_args->cb_arg, -EINVAL);
+ }
+
+ /* Now free both mbufs and the compress operation. The rte_pktmbuf_free()
+ * call takes care of freeing all of the mbufs in the chain back to their
+ * original pool.
+ */
+ rte_pktmbuf_free(deq_ops[i]->m_src);
+ rte_pktmbuf_free(deq_ops[i]->m_dst);
+
+ /* There is no bulk free for comp ops, so we free them one at a time
+ * here; it would be rare to ever have more than 1 at a time anyway.
+ */
+ rte_comp_op_free(deq_ops[i]);
+
+ /* Check if there are any pending comp ops to process, only pull one
+ * at a time off as _compress_operation() may re-queue the op.
+ */
+ if (!TAILQ_EMPTY(&comp_bdev->queued_comp_ops)) {
+ op_to_resubmit = TAILQ_FIRST(&comp_bdev->queued_comp_ops);
+ rc = _compress_operation(op_to_resubmit->backing_dev,
+ op_to_resubmit->src_iovs,
+ op_to_resubmit->src_iovcnt,
+ op_to_resubmit->dst_iovs,
+ op_to_resubmit->dst_iovcnt,
+ op_to_resubmit->compress,
+ op_to_resubmit->cb_arg);
+ if (rc == 0) {
+ TAILQ_REMOVE(&comp_bdev->queued_comp_ops, op_to_resubmit, link);
+ free(op_to_resubmit);
+ }
+ }
+ }
+ return num_deq == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
+}
+
+/* Entry point for reduce lib to issue a compress operation. */
+static void
+_comp_reduce_compress(struct spdk_reduce_backing_dev *dev,
+ struct iovec *src_iovs, int src_iovcnt,
+ struct iovec *dst_iovs, int dst_iovcnt,
+ struct spdk_reduce_vol_cb_args *cb_arg)
+{
+ int rc;
+
+ rc = _compress_operation(dev, src_iovs, src_iovcnt, dst_iovs, dst_iovcnt, true, cb_arg);
+ if (rc) {
+ SPDK_ERRLOG("with compress operation code %d (%s)\n", rc, spdk_strerror(-rc));
+ cb_arg->cb_fn(cb_arg->cb_arg, rc);
+ }
+}
+
+/* Entry point for reduce lib to issue a decompress operation. */
+static void
+_comp_reduce_decompress(struct spdk_reduce_backing_dev *dev,
+ struct iovec *src_iovs, int src_iovcnt,
+ struct iovec *dst_iovs, int dst_iovcnt,
+ struct spdk_reduce_vol_cb_args *cb_arg)
+{
+ int rc;
+
+ rc = _compress_operation(dev, src_iovs, src_iovcnt, dst_iovs, dst_iovcnt, false, cb_arg);
+ if (rc) {
+ SPDK_ERRLOG("with decompress operation code %d (%s)\n", rc, spdk_strerror(-rc));
+ cb_arg->cb_fn(cb_arg->cb_arg, rc);
+ }
+}
+
+/* Callback for getting a buf from the bdev pool in the event that the caller passed
+ * in NULL; we need to own the buffer so it doesn't get freed by another vbdev module
+ * beneath us before we're done with it.
+ */
+static void
+comp_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_compress,
+ comp_bdev);
+
+ spdk_reduce_vol_readv(comp_bdev->vol, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
+ reduce_rw_blocks_cb, bdev_io);
+}
+
+/* scheduled for completion on IO thread */
+static void
+_complete_other_io(void *arg)
+{
+ struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)arg;
+ if (io_ctx->status == 0) {
+ spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+/* scheduled for submission on reduce thread */
+static void
+_comp_bdev_io_submit(void *arg)
+{
+ struct spdk_bdev_io *bdev_io = arg;
+ struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx;
+ struct spdk_io_channel *ch = spdk_io_channel_from_ctx(io_ctx->comp_ch);
+ struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_compress,
+ comp_bdev);
+ struct spdk_thread *orig_thread;
+ int rc = 0;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, comp_read_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ return;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ spdk_reduce_vol_writev(comp_bdev->vol, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
+ reduce_rw_blocks_cb, bdev_io);
+ return;
+ /* TODO in future patch in the series */
+ case SPDK_BDEV_IO_TYPE_RESET:
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ default:
+ SPDK_ERRLOG("Unknown I/O type %d\n", bdev_io->type);
+ rc = -EINVAL;
+ }
+
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_ERRLOG("No memory, start to queue io for compress.\n");
+ io_ctx->ch = ch;
+ vbdev_compress_queue_io(bdev_io);
+ return;
+ } else {
+ SPDK_ERRLOG("on bdev_io submission!\n");
+ io_ctx->status = rc;
+ }
+ }
+
+ /* Complete this on the orig IO thread. */
+ orig_thread = spdk_io_channel_get_thread(ch);
+ if (orig_thread != spdk_get_thread()) {
+ spdk_thread_send_msg(orig_thread, _complete_other_io, io_ctx);
+ } else {
+ _complete_other_io(io_ctx);
+ }
+}
+
+/* Called when someone above submits IO to this vbdev. */
+static void
+vbdev_compress_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx;
+ struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_compress,
+ comp_bdev);
+ struct comp_io_channel *comp_ch = spdk_io_channel_get_ctx(ch);
+
+ memset(io_ctx, 0, sizeof(struct comp_bdev_io));
+ io_ctx->comp_bdev = comp_bdev;
+ io_ctx->comp_ch = comp_ch;
+ io_ctx->orig_io = bdev_io;
+
+ /* Send this request to the reduce_thread if that's not what we're on. */
+ if (spdk_get_thread() != comp_bdev->reduce_thread) {
+ spdk_thread_send_msg(comp_bdev->reduce_thread, _comp_bdev_io_submit, bdev_io);
+ } else {
+ _comp_bdev_io_submit(bdev_io);
+ }
+}
+
+static bool
+vbdev_compress_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx;
+
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return spdk_bdev_io_type_supported(comp_bdev->base_bdev, io_type);
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ default:
+ return false;
+ }
+}
+
+/* Resubmission function used by the bdev layer when a queued IO is ready to be
+ * submitted.
+ */
+static void
+vbdev_compress_resubmit_io(void *arg)
+{
+ struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
+ struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx;
+
+ vbdev_compress_submit_request(io_ctx->ch, bdev_io);
+}
+
+/* Used to queue an IO in the event of resource issues. */
+static void
+vbdev_compress_queue_io(struct spdk_bdev_io *bdev_io)
+{
+ struct comp_bdev_io *io_ctx = (struct comp_bdev_io *)bdev_io->driver_ctx;
+ int rc;
+
+ io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
+ io_ctx->bdev_io_wait.cb_fn = vbdev_compress_resubmit_io;
+ io_ctx->bdev_io_wait.cb_arg = bdev_io;
+
+ rc = spdk_bdev_queue_io_wait(bdev_io->bdev, io_ctx->comp_bdev->base_ch, &io_ctx->bdev_io_wait);
+ if (rc) {
+ SPDK_ERRLOG("Queue io failed in vbdev_compress_queue_io, rc=%d.\n", rc);
+ assert(false);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+/* Callback for unregistering the IO device. */
+static void
+_device_unregister_cb(void *io_device)
+{
+ struct vbdev_compress *comp_bdev = io_device;
+
+ /* Done with this comp_bdev. */
+ pthread_mutex_destroy(&comp_bdev->reduce_lock);
+ free(comp_bdev->comp_bdev.name);
+ free(comp_bdev);
+}
+
+static void
+_vbdev_compress_destruct_cb(void *ctx)
+{
+ struct vbdev_compress *comp_bdev = ctx;
+
+ TAILQ_REMOVE(&g_vbdev_comp, comp_bdev, link);
+ spdk_bdev_module_release_bdev(comp_bdev->base_bdev);
+ /* Close the underlying bdev on its same opened thread. */
+ spdk_bdev_close(comp_bdev->base_desc);
+ comp_bdev->vol = NULL;
+ if (comp_bdev->orphaned == false) {
+ spdk_io_device_unregister(comp_bdev, _device_unregister_cb);
+ } else {
+ vbdev_compress_delete_done(comp_bdev->delete_ctx, 0);
+ _device_unregister_cb(comp_bdev);
+ }
+}
+
+static void
+vbdev_compress_destruct_cb(void *cb_arg, int reduce_errno)
+{
+ struct vbdev_compress *comp_bdev = (struct vbdev_compress *)cb_arg;
+
+ if (reduce_errno) {
+ SPDK_ERRLOG("number %d\n", reduce_errno);
+ } else {
+ if (comp_bdev->thread && comp_bdev->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(comp_bdev->thread,
+ _vbdev_compress_destruct_cb, comp_bdev);
+ } else {
+ _vbdev_compress_destruct_cb(comp_bdev);
+ }
+ }
+}
+
+static void
+_reduce_destroy_cb(void *ctx, int reduce_errno)
+{
+ struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx;
+
+ if (reduce_errno) {
+ SPDK_ERRLOG("number %d\n", reduce_errno);
+ }
+
+ comp_bdev->vol = NULL;
+ spdk_put_io_channel(comp_bdev->base_ch);
+ if (comp_bdev->orphaned == false) {
+ spdk_bdev_unregister(&comp_bdev->comp_bdev, vbdev_compress_delete_done,
+ comp_bdev->delete_ctx);
+ } else {
+ vbdev_compress_destruct_cb((void *)comp_bdev, 0);
+ }
+}
+
+static void
+_delete_vol_unload_cb(void *ctx)
+{
+ struct vbdev_compress *comp_bdev = ctx;
+
+ /* FIXME: Assert if these conditions are not satisfied for now. */
+ assert(!comp_bdev->reduce_thread ||
+ comp_bdev->reduce_thread == spdk_get_thread());
+
+ /* reducelib needs a channel to comm with the backing device */
+ comp_bdev->base_ch = spdk_bdev_get_io_channel(comp_bdev->base_desc);
+
+ /* Clean the device before we free our resources. */
+ spdk_reduce_vol_destroy(&comp_bdev->backing_dev, _reduce_destroy_cb, comp_bdev);
+}
+
+/* Called by reduceLib after performing unload vol actions */
+static void
+delete_vol_unload_cb(void *cb_arg, int reduce_errno)
+{
+ struct vbdev_compress *comp_bdev = (struct vbdev_compress *)cb_arg;
+
+ if (reduce_errno) {
+ SPDK_ERRLOG("number %d\n", reduce_errno);
+ /* FIXME: callback should be executed. */
+ return;
+ }
+
+ pthread_mutex_lock(&comp_bdev->reduce_lock);
+ if (comp_bdev->reduce_thread && comp_bdev->reduce_thread != spdk_get_thread()) {
+ spdk_thread_send_msg(comp_bdev->reduce_thread,
+ _delete_vol_unload_cb, comp_bdev);
+ pthread_mutex_unlock(&comp_bdev->reduce_lock);
+ } else {
+ pthread_mutex_unlock(&comp_bdev->reduce_lock);
+
+ _delete_vol_unload_cb(comp_bdev);
+ }
+}
+
+const char *
+compress_get_name(const struct vbdev_compress *comp_bdev)
+{
+ return comp_bdev->comp_bdev.name;
+}
+
+struct vbdev_compress *
+compress_bdev_first(void)
+{
+ struct vbdev_compress *comp_bdev;
+
+ comp_bdev = TAILQ_FIRST(&g_vbdev_comp);
+
+ return comp_bdev;
+}
+
+struct vbdev_compress *
+compress_bdev_next(struct vbdev_compress *prev)
+{
+ struct vbdev_compress *comp_bdev;
+
+ comp_bdev = TAILQ_NEXT(prev, link);
+
+ return comp_bdev;
+}
+
+bool
+compress_has_orphan(const char *name)
+{
+ struct vbdev_compress *comp_bdev;
+
+ TAILQ_FOREACH(comp_bdev, &g_vbdev_comp, link) {
+ if (comp_bdev->orphaned && strcmp(name, comp_bdev->comp_bdev.name) == 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/* Called after we've unregistered following a hot remove callback.
+ * Our finish entry point will be called next.
+ */
+static int
+vbdev_compress_destruct(void *ctx)
+{
+ struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx;
+
+ if (comp_bdev->vol != NULL) {
+ /* Tell reducelib that we're done with this volume. */
+ spdk_reduce_vol_unload(comp_bdev->vol, vbdev_compress_destruct_cb, comp_bdev);
+ } else {
+ vbdev_compress_destruct_cb(comp_bdev, 0);
+ }
+
+ return 0;
+}
+
+/* We supplied this as an entry point for upper layers that want to communicate with this
+ * bdev. This is how they get a channel.
+ */
+static struct spdk_io_channel *
+vbdev_compress_get_io_channel(void *ctx)
+{
+ struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx;
+
+ /* The IO channel code will allocate a channel for us which consists of
+ * the SPDK channel structure plus the size of our comp_io_channel struct
+ * that we passed in when we registered our IO device. It will then call
+ * our channel create callback to populate any elements that we need to
+ * update.
+ */
+ return spdk_get_io_channel(comp_bdev);
+}
+
+/* This is the output for bdev_get_bdevs() for this vbdev */
+static int
+vbdev_compress_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct vbdev_compress *comp_bdev = (struct vbdev_compress *)ctx;
+
+ spdk_json_write_name(w, "compress");
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&comp_bdev->comp_bdev));
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(comp_bdev->base_bdev));
+ spdk_json_write_named_string(w, "compression_pmd", comp_bdev->drv_name);
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+/* This is used to generate JSON that can configure this module to its current state. */
+static int
+vbdev_compress_config_json(struct spdk_json_write_ctx *w)
+{
+ struct vbdev_compress *comp_bdev;
+
+ TAILQ_FOREACH(comp_bdev, &g_vbdev_comp, link) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_compress_create");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(comp_bdev->base_bdev));
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&comp_bdev->comp_bdev));
+ spdk_json_write_named_string(w, "compression_pmd", comp_bdev->drv_name);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ }
+ return 0;
+}
+
+static void
+_vbdev_reduce_init_cb(void *ctx)
+{
+ struct vbdev_compress *meta_ctx = ctx;
+
+ /* We're done with metadata operations */
+ spdk_put_io_channel(meta_ctx->base_ch);
+ /* Close the underlying bdev on its same opened thread. */
+ spdk_bdev_close(meta_ctx->base_desc);
+ meta_ctx->base_desc = NULL;
+
+ if (meta_ctx->vol) {
+ vbdev_compress_claim(meta_ctx);
+ } else {
+ free(meta_ctx);
+ }
+}
+
+/* Callback from reduce for when init is complete. We'll pass the vbdev_comp struct
+ * used for the initial metadata operations on to the claim function, where it will be
+ * further filled out and added to the global list.
+ */
+static void
+vbdev_reduce_init_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno)
+{
+ struct vbdev_compress *meta_ctx = cb_arg;
+
+ if (reduce_errno == 0) {
+ meta_ctx->vol = vol;
+ } else {
+ SPDK_ERRLOG("for vol %s, error %u\n",
+ spdk_bdev_get_name(meta_ctx->base_bdev), reduce_errno);
+ }
+
+ if (meta_ctx->thread && meta_ctx->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(meta_ctx->thread, _vbdev_reduce_init_cb, meta_ctx);
+ } else {
+ _vbdev_reduce_init_cb(meta_ctx);
+ }
+}
+
+/* Callback for the function used by reduceLib to perform IO to/from the backing device. We just
+ * invoke the callback that reduceLib provided when it issued the read/write/unmap and then
+ * free the bdev_io.
+ */
+static void
+comp_reduce_io_cb(struct spdk_bdev_io *bdev_io, bool success, void *arg)
+{
+ struct spdk_reduce_vol_cb_args *cb_args = arg;
+ int reduce_errno;
+
+ if (success) {
+ reduce_errno = 0;
+ } else {
+ reduce_errno = -EIO;
+ }
+ spdk_bdev_free_io(bdev_io);
+ cb_args->cb_fn(cb_args->cb_arg, reduce_errno);
+}
+
+/* This is the function provided to the reduceLib for sending reads directly to
+ * the backing device.
+ */
+static void
+_comp_reduce_readv(struct spdk_reduce_backing_dev *dev, struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count, struct spdk_reduce_vol_cb_args *args)
+{
+ struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(dev, struct vbdev_compress,
+ backing_dev);
+ int rc;
+
+ rc = spdk_bdev_readv_blocks(comp_bdev->base_desc, comp_bdev->base_ch,
+ iov, iovcnt, lba, lba_count,
+ comp_reduce_io_cb,
+ args);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_ERRLOG("No memory, start to queue io.\n");
+ /* TODO: there's no bdev_io to queue */
+ } else {
+ SPDK_ERRLOG("submitting readv request\n");
+ }
+ args->cb_fn(args->cb_arg, rc);
+ }
+}
+
+/* This is the function provided to the reduceLib for sending writes directly to
+ * the backing device.
+ */
+static void
+_comp_reduce_writev(struct spdk_reduce_backing_dev *dev, struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count, struct spdk_reduce_vol_cb_args *args)
+{
+ struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(dev, struct vbdev_compress,
+ backing_dev);
+ int rc;
+
+ rc = spdk_bdev_writev_blocks(comp_bdev->base_desc, comp_bdev->base_ch,
+ iov, iovcnt, lba, lba_count,
+ comp_reduce_io_cb,
+ args);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_ERRLOG("No memory, start to queue io.\n");
+ /* TODO: there's no bdev_io to queue */
+ } else {
+ SPDK_ERRLOG("error submitting writev request\n");
+ }
+ args->cb_fn(args->cb_arg, rc);
+ }
+}
+
+/* This is the function provided to the reduceLib for sending unmaps directly to
+ * the backing device.
+ */
+static void
+_comp_reduce_unmap(struct spdk_reduce_backing_dev *dev,
+ uint64_t lba, uint32_t lba_count, struct spdk_reduce_vol_cb_args *args)
+{
+ struct vbdev_compress *comp_bdev = SPDK_CONTAINEROF(dev, struct vbdev_compress,
+ backing_dev);
+ int rc;
+
+ rc = spdk_bdev_unmap_blocks(comp_bdev->base_desc, comp_bdev->base_ch,
+ lba, lba_count,
+ comp_reduce_io_cb,
+ args);
+
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_ERRLOG("No memory, start to queue io.\n");
+ /* TODO: there's no bdev_io to queue */
+ } else {
+ SPDK_ERRLOG("submitting unmap request\n");
+ }
+ args->cb_fn(args->cb_arg, rc);
+ }
+}
+
+/* Called by reduceLib after performing unload vol actions following base bdev hotremove */
+static void
+bdev_hotremove_vol_unload_cb(void *cb_arg, int reduce_errno)
+{
+ struct vbdev_compress *comp_bdev = (struct vbdev_compress *)cb_arg;
+
+ if (reduce_errno) {
+ SPDK_ERRLOG("number %d\n", reduce_errno);
+ }
+
+ comp_bdev->vol = NULL;
+ spdk_bdev_unregister(&comp_bdev->comp_bdev, NULL, NULL);
+}
+
+/* Called when the underlying base bdev goes away. */
+static void
+vbdev_compress_base_bdev_hotremove_cb(void *ctx)
+{
+ struct vbdev_compress *comp_bdev, *tmp;
+ struct spdk_bdev *bdev_find = ctx;
+
+ TAILQ_FOREACH_SAFE(comp_bdev, &g_vbdev_comp, link, tmp) {
+ if (bdev_find == comp_bdev->base_bdev) {
+ /* Tell reduceLib that we're done with this volume. */
+ spdk_reduce_vol_unload(comp_bdev->vol, bdev_hotremove_vol_unload_cb, comp_bdev);
+ }
+ }
+}
+
+/* TODO: determine which params we want user-configurable; hard-coded for now:
+ * params.vol_size
+ * params.chunk_size
+ * compression PMD, algorithm, window size, comp level, etc.
+ * DEV_MD_PATH
+ */
+
+/* Common function for init and load to allocate and populate the minimal
+ * information for reducelib to init or load.
+ */
+struct vbdev_compress *
+_prepare_for_load_init(struct spdk_bdev *bdev, uint32_t lb_size)
+{
+ struct vbdev_compress *meta_ctx;
+
+ meta_ctx = calloc(1, sizeof(struct vbdev_compress));
+ if (meta_ctx == NULL) {
+ SPDK_ERRLOG("failed to alloc init contexts\n");
+ return NULL;
+ }
+
+ meta_ctx->drv_name = "None";
+ meta_ctx->base_bdev = bdev;
+ meta_ctx->backing_dev.unmap = _comp_reduce_unmap;
+ meta_ctx->backing_dev.readv = _comp_reduce_readv;
+ meta_ctx->backing_dev.writev = _comp_reduce_writev;
+ meta_ctx->backing_dev.compress = _comp_reduce_compress;
+ meta_ctx->backing_dev.decompress = _comp_reduce_decompress;
+
+ meta_ctx->backing_dev.blocklen = bdev->blocklen;
+ meta_ctx->backing_dev.blockcnt = bdev->blockcnt;
+
+ meta_ctx->params.chunk_size = CHUNK_SIZE;
+ if (lb_size == 0) {
+ meta_ctx->params.logical_block_size = bdev->blocklen;
+ } else {
+ meta_ctx->params.logical_block_size = lb_size;
+ }
+
+ meta_ctx->params.backing_io_unit_size = BACKING_IO_SZ;
+ return meta_ctx;
+}
+
+static bool
+_set_pmd(struct vbdev_compress *comp_dev)
+{
+ if (g_opts == COMPRESS_PMD_AUTO) {
+ if (g_qat_available) {
+ comp_dev->drv_name = QAT_PMD;
+ } else {
+ comp_dev->drv_name = ISAL_PMD;
+ }
+ } else if (g_opts == COMPRESS_PMD_QAT_ONLY && g_qat_available) {
+ comp_dev->drv_name = QAT_PMD;
+ } else if (g_opts == COMPRESS_PMD_ISAL_ONLY && g_isal_available) {
+ comp_dev->drv_name = ISAL_PMD;
+ } else {
+ SPDK_ERRLOG("Requested PMD is not available.\n");
+ return false;
+ }
+ SPDK_NOTICELOG("PMD being used: %s\n", comp_dev->drv_name);
+ return true;
+}
+
+/* Call reducelib to initialize a new volume */
+static int
+vbdev_init_reduce(struct spdk_bdev *bdev, const char *pm_path, uint32_t lb_size)
+{
+ struct vbdev_compress *meta_ctx;
+ int rc;
+
+ meta_ctx = _prepare_for_load_init(bdev, lb_size);
+ if (meta_ctx == NULL) {
+ return -EINVAL;
+ }
+
+ if (_set_pmd(meta_ctx) == false) {
+ SPDK_ERRLOG("could not find required pmd\n");
+ free(meta_ctx);
+ return -EINVAL;
+ }
+
+ rc = spdk_bdev_open(meta_ctx->base_bdev, true, vbdev_compress_base_bdev_hotremove_cb,
+ meta_ctx->base_bdev, &meta_ctx->base_desc);
+ if (rc) {
+ SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(meta_ctx->base_bdev));
+ free(meta_ctx);
+ return -EINVAL;
+ }
+
+ /* Save the thread where the base device is opened */
+ meta_ctx->thread = spdk_get_thread();
+
+ meta_ctx->base_ch = spdk_bdev_get_io_channel(meta_ctx->base_desc);
+
+ spdk_reduce_vol_init(&meta_ctx->params, &meta_ctx->backing_dev,
+ pm_path,
+ vbdev_reduce_init_cb,
+ meta_ctx);
+ return 0;
+}
+
+/* We provide this callback for the SPDK channel code to create a channel using
+ * the channel struct we provided in our module get_io_channel() entry point. Here
+ * we get and save off an underlying base channel of the device below us so that
+ * we can communicate with the base bdev on a per channel basis. If we needed
+ * our own poller for this vbdev, we'd register it here.
+ */
+static int
+comp_bdev_ch_create_cb(void *io_device, void *ctx_buf)
+{
+ struct vbdev_compress *comp_bdev = io_device;
+ struct comp_device_qp *device_qp;
+
+ /* Now set the reduce channel if it's not already set. */
+ pthread_mutex_lock(&comp_bdev->reduce_lock);
+ if (comp_bdev->ch_count == 0) {
+ /* We use this queue to track outstanding IO in our layer. */
+ TAILQ_INIT(&comp_bdev->pending_comp_ios);
+
+ /* We use this to queue up compression operations as needed. */
+ TAILQ_INIT(&comp_bdev->queued_comp_ops);
+
+ comp_bdev->base_ch = spdk_bdev_get_io_channel(comp_bdev->base_desc);
+ comp_bdev->reduce_thread = spdk_get_thread();
+ comp_bdev->poller = SPDK_POLLER_REGISTER(comp_dev_poller, comp_bdev, 0);
+ /* Now assign a q pair */
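+ /* Assignment policy: prefer a qp already bound to this thread so that
+ * comp_bdevs on the same thread share a qp; otherwise claim the first
+ * unbound qp on a device matching our driver and bind it to this thread.
+ */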
+ pthread_mutex_lock(&g_comp_device_qp_lock);
+ TAILQ_FOREACH(device_qp, &g_comp_device_qp, link) {
+ if (strcmp(device_qp->device->cdev_info.driver_name, comp_bdev->drv_name) == 0) {
+ if (device_qp->thread == spdk_get_thread()) {
+ comp_bdev->device_qp = device_qp;
+ break;
+ }
+ if (device_qp->thread == NULL) {
+ comp_bdev->device_qp = device_qp;
+ device_qp->thread = spdk_get_thread();
+ break;
+ }
+ }
+ }
+ pthread_mutex_unlock(&g_comp_device_qp_lock);
+ }
+ comp_bdev->ch_count++;
+ pthread_mutex_unlock(&comp_bdev->reduce_lock);
+
+ if (comp_bdev->device_qp != NULL) {
+ return 0;
+ } else {
+ SPDK_ERRLOG("out of qpairs, cannot assign one to comp_bdev %p\n", comp_bdev);
+ assert(false);
+ return -ENOMEM;
+ }
+}
+
+static void
+_channel_cleanup(struct vbdev_compress *comp_bdev)
+{
+ /* Note: comp_bdevs can share a device_qp if they are
+ * on the same thread so we leave the device_qp element
+ * alone for this comp_bdev and just clear the reduce thread.
+ */
+ spdk_put_io_channel(comp_bdev->base_ch);
+ comp_bdev->reduce_thread = NULL;
+ spdk_poller_unregister(&comp_bdev->poller);
+}
+
+/* Used to reroute destroy_ch to the correct thread */
+static void
+_comp_bdev_ch_destroy_cb(void *arg)
+{
+ struct vbdev_compress *comp_bdev = arg;
+
+ pthread_mutex_lock(&comp_bdev->reduce_lock);
+ _channel_cleanup(comp_bdev);
+ pthread_mutex_unlock(&comp_bdev->reduce_lock);
+}
+
+/* We provide this callback for the SPDK channel code to destroy a channel
+ * created with our create callback. We just need to undo anything we did
+ * when we created it. If this bdev used its own poller, we'd unregister it here.
+ */
+static void
+comp_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct vbdev_compress *comp_bdev = io_device;
+
+ pthread_mutex_lock(&comp_bdev->reduce_lock);
+ comp_bdev->ch_count--;
+ if (comp_bdev->ch_count == 0) {
+ /* Send this request to the thread where the channel was created. */
+ if (comp_bdev->reduce_thread != spdk_get_thread()) {
+ spdk_thread_send_msg(comp_bdev->reduce_thread,
+ _comp_bdev_ch_destroy_cb, comp_bdev);
+ } else {
+ _channel_cleanup(comp_bdev);
+ }
+ }
+ pthread_mutex_unlock(&comp_bdev->reduce_lock);
+}
+
+/* RPC entry point for compression vbdev creation. */
+int
+create_compress_bdev(const char *bdev_name, const char *pm_path, uint32_t lb_size)
+{
+ struct spdk_bdev *bdev;
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (!bdev) {
+ return -ENODEV;
+ }
+
+ if ((lb_size != 0) && (lb_size != LB_SIZE_4K) && (lb_size != LB_SIZE_512B)) {
+ SPDK_ERRLOG("Logical block size must be 512 or 4096\n");
+ return -EINVAL;
+ }
+
+ return vbdev_init_reduce(bdev, pm_path, lb_size);
+}
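+
+/* create_compress_bdev() above is typically driven by the bdev_compress_create
+ * RPC; an illustrative invocation (names and paths are examples only) might be:
+ *   scripts/rpc.py bdev_compress_create -b base_bdev0 -p /path/to/pmem -l 4096
+ */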
+
+/* On init, just init the compress drivers. All metadata is stored on disk. */
+static int
+vbdev_compress_init(void)
+{
+ if (vbdev_init_compress_drivers()) {
+ SPDK_ERRLOG("Error setting up compression devices\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Called when the entire module is being torn down. */
+static void
+vbdev_compress_finish(void)
+{
+ struct comp_device_qp *dev_qp;
+ /* TODO: unload vol in a future patch */
+
+ while ((dev_qp = TAILQ_FIRST(&g_comp_device_qp))) {
+ TAILQ_REMOVE(&g_comp_device_qp, dev_qp, link);
+ free(dev_qp);
+ }
+ pthread_mutex_destroy(&g_comp_device_qp_lock);
+
+ rte_mempool_free(g_comp_op_mp);
+ rte_mempool_free(g_mbuf_mp);
+}
+
+/* During init we'll be asked how much memory we'd like passed to us
+ * in bdev_io structures as context. Here's where we specify how
+ * much context we want per IO.
+ */
+static int
+vbdev_compress_get_ctx_size(void)
+{
+ return sizeof(struct comp_bdev_io);
+}
+
+/* When we register our bdev this is how we specify our entry points. */
+static const struct spdk_bdev_fn_table vbdev_compress_fn_table = {
+ .destruct = vbdev_compress_destruct,
+ .submit_request = vbdev_compress_submit_request,
+ .io_type_supported = vbdev_compress_io_type_supported,
+ .get_io_channel = vbdev_compress_get_io_channel,
+ .dump_info_json = vbdev_compress_dump_info_json,
+ .write_config_json = NULL,
+};
+
+static struct spdk_bdev_module compress_if = {
+ .name = "compress",
+ .module_init = vbdev_compress_init,
+ .config_text = NULL,
+ .get_ctx_size = vbdev_compress_get_ctx_size,
+ .examine_disk = vbdev_compress_examine,
+ .module_fini = vbdev_compress_finish,
+ .config_json = vbdev_compress_config_json
+};
+
+SPDK_BDEV_MODULE_REGISTER(compress, &compress_if)
+
+static int
+_set_compbdev_name(struct vbdev_compress *comp_bdev)
+{
+ struct spdk_bdev_alias *aliases;
+
+ if (!TAILQ_EMPTY(spdk_bdev_get_aliases(comp_bdev->base_bdev))) {
+ aliases = TAILQ_FIRST(spdk_bdev_get_aliases(comp_bdev->base_bdev));
+ comp_bdev->comp_bdev.name = spdk_sprintf_alloc("COMP_%s", aliases->alias);
+ if (!comp_bdev->comp_bdev.name) {
+ SPDK_ERRLOG("could not allocate comp_bdev name for alias\n");
+ return -ENOMEM;
+ }
+ } else {
+ comp_bdev->comp_bdev.name = spdk_sprintf_alloc("COMP_%s", comp_bdev->base_bdev->name);
+ if (!comp_bdev->comp_bdev.name) {
+ SPDK_ERRLOG("could not allocate comp_bdev name for unique name\n");
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+static void
+vbdev_compress_claim(struct vbdev_compress *comp_bdev)
+{
+ int rc;
+
+ if (_set_compbdev_name(comp_bdev)) {
+ goto error_bdev_name;
+ }
+
+ /* Note: some of the fields below will change in the future - for example,
+ * blockcnt specifically will not match (the compressed volume size will
+ * be slightly less than the base bdev size)
+ */
+ comp_bdev->comp_bdev.product_name = COMP_BDEV_NAME;
+ comp_bdev->comp_bdev.write_cache = comp_bdev->base_bdev->write_cache;
+
+ if (strcmp(comp_bdev->drv_name, QAT_PMD) == 0) {
+ comp_bdev->comp_bdev.required_alignment =
+ spdk_max(spdk_u32log2(comp_bdev->base_bdev->blocklen),
+ comp_bdev->base_bdev->required_alignment);
+ SPDK_NOTICELOG("QAT in use: Required alignment set to %u\n",
+ comp_bdev->comp_bdev.required_alignment);
+ } else {
+ comp_bdev->comp_bdev.required_alignment = comp_bdev->base_bdev->required_alignment;
+ }
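+ /* With the defaults here (16KB chunks), a 4KB logical block size yields an
+ * optimal_io_boundary of 4 blocks and a 512B logical block size yields 32,
+ * i.e. the bdev layer splits I/O on reduce chunk boundaries.
+ */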
+ comp_bdev->comp_bdev.optimal_io_boundary =
+ comp_bdev->params.chunk_size / comp_bdev->params.logical_block_size;
+
+ comp_bdev->comp_bdev.split_on_optimal_io_boundary = true;
+
+ comp_bdev->comp_bdev.blocklen = comp_bdev->params.logical_block_size;
+ comp_bdev->comp_bdev.blockcnt = comp_bdev->params.vol_size / comp_bdev->comp_bdev.blocklen;
+ assert(comp_bdev->comp_bdev.blockcnt > 0);
+
+ /* This is the context that is passed to us when the bdev
+ * layer calls in so we'll save our comp_bdev node here.
+ */
+ comp_bdev->comp_bdev.ctxt = comp_bdev;
+ comp_bdev->comp_bdev.fn_table = &vbdev_compress_fn_table;
+ comp_bdev->comp_bdev.module = &compress_if;
+
+ pthread_mutex_init(&comp_bdev->reduce_lock, NULL);
+
+ rc = spdk_bdev_open(comp_bdev->base_bdev, true, vbdev_compress_base_bdev_hotremove_cb,
+ comp_bdev->base_bdev, &comp_bdev->base_desc);
+ if (rc) {
+ SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(comp_bdev->base_bdev));
+ goto error_open;
+ }
+
+ /* Save the thread where the base device is opened */
+ comp_bdev->thread = spdk_get_thread();
+
+ spdk_io_device_register(comp_bdev, comp_bdev_ch_create_cb, comp_bdev_ch_destroy_cb,
+ sizeof(struct comp_io_channel),
+ comp_bdev->comp_bdev.name);
+
+ rc = spdk_bdev_module_claim_bdev(comp_bdev->base_bdev, comp_bdev->base_desc,
+ comp_bdev->comp_bdev.module);
+ if (rc) {
+ SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(comp_bdev->base_bdev));
+ goto error_claim;
+ }
+
+ rc = spdk_bdev_register(&comp_bdev->comp_bdev);
+ if (rc < 0) {
+ SPDK_ERRLOG("trying to register bdev\n");
+ goto error_bdev_register;
+ }
+
+ TAILQ_INSERT_TAIL(&g_vbdev_comp, comp_bdev, link);
+
+ SPDK_NOTICELOG("registered io_device and virtual bdev for: %s\n", comp_bdev->comp_bdev.name);
+
+ return;
+ /* Error cleanup paths. */
+error_bdev_register:
+ spdk_bdev_module_release_bdev(comp_bdev->base_bdev);
+error_claim:
+ spdk_io_device_unregister(comp_bdev, NULL);
+ spdk_bdev_close(comp_bdev->base_desc);
+error_open:
+ free(comp_bdev->comp_bdev.name);
+error_bdev_name:
+ free(comp_bdev);
+}
+
+static void
+_vbdev_compress_delete_done(void *_ctx)
+{
+ struct vbdev_comp_delete_ctx *ctx = _ctx;
+
+ ctx->cb_fn(ctx->cb_arg, ctx->cb_rc);
+
+ free(ctx);
+}
+
+static void
+vbdev_compress_delete_done(void *cb_arg, int bdeverrno)
+{
+ struct vbdev_comp_delete_ctx *ctx = cb_arg;
+
+ ctx->cb_rc = bdeverrno;
+
+ if (ctx->orig_thread != spdk_get_thread()) {
+ spdk_thread_send_msg(ctx->orig_thread, _vbdev_compress_delete_done, ctx);
+ } else {
+ _vbdev_compress_delete_done(ctx);
+ }
+}
+
+void
+bdev_compress_delete(const char *name, spdk_delete_compress_complete cb_fn, void *cb_arg)
+{
+ struct vbdev_compress *comp_bdev = NULL;
+ struct vbdev_comp_delete_ctx *ctx;
+
+ TAILQ_FOREACH(comp_bdev, &g_vbdev_comp, link) {
+ if (strcmp(name, comp_bdev->comp_bdev.name) == 0) {
+ break;
+ }
+ }
+
+ if (comp_bdev == NULL) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ SPDK_ERRLOG("Failed to allocate delete context\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ /* Save these for after the vol is destroyed. */
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ ctx->orig_thread = spdk_get_thread();
+
+ comp_bdev->delete_ctx = ctx;
+
+ /* Tell reducelib that we're done with this volume. */
+ if (comp_bdev->orphaned == false) {
+ spdk_reduce_vol_unload(comp_bdev->vol, delete_vol_unload_cb, comp_bdev);
+ } else {
+ delete_vol_unload_cb(comp_bdev, 0);
+ }
+}
+
+static void
+_vbdev_reduce_load_cb(void *ctx)
+{
+ struct vbdev_compress *meta_ctx = ctx;
+ int rc;
+
+ /* Done with metadata operations */
+ spdk_put_io_channel(meta_ctx->base_ch);
+ /* Close the underlying bdev on its same opened thread. */
+ spdk_bdev_close(meta_ctx->base_desc);
+ meta_ctx->base_desc = NULL;
+
+ if (meta_ctx->reduce_errno == 0) {
+ if (_set_pmd(meta_ctx) == false) {
+ SPDK_ERRLOG("could not find required pmd\n");
+ goto err;
+ }
+
+ vbdev_compress_claim(meta_ctx);
+ } else if (meta_ctx->reduce_errno == -ENOENT) {
+ if (_set_compbdev_name(meta_ctx)) {
+ goto err;
+ }
+
+ /* We still want to open and claim the backing device to protect the data until
+ * either the pm metadata file is recovered or the comp bdev is deleted.
+ */
+ rc = spdk_bdev_open(meta_ctx->base_bdev, true, vbdev_compress_base_bdev_hotremove_cb,
+ meta_ctx->base_bdev, &meta_ctx->base_desc);
+ if (rc) {
+ SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(meta_ctx->base_bdev));
+ free(meta_ctx->comp_bdev.name);
+ goto err;
+ }
+
+ /* Save the thread where the base device is opened */
+ meta_ctx->thread = spdk_get_thread();
+
+ meta_ctx->comp_bdev.module = &compress_if;
+ pthread_mutex_init(&meta_ctx->reduce_lock, NULL);
+ rc = spdk_bdev_module_claim_bdev(meta_ctx->base_bdev, meta_ctx->base_desc,
+ meta_ctx->comp_bdev.module);
+ if (rc) {
+ SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(meta_ctx->base_bdev));
+ spdk_bdev_close(meta_ctx->base_desc);
+ free(meta_ctx->comp_bdev.name);
+ goto err;
+ }
+
+ meta_ctx->orphaned = true;
+ TAILQ_INSERT_TAIL(&g_vbdev_comp, meta_ctx, link);
+ } else {
+ if (meta_ctx->reduce_errno != -EILSEQ) {
+ SPDK_ERRLOG("for vol %s, error %u\n",
+ spdk_bdev_get_name(meta_ctx->base_bdev), meta_ctx->reduce_errno);
+ }
+ goto err;
+ }
+
+ spdk_bdev_module_examine_done(&compress_if);
+ return;
+
+err:
+ free(meta_ctx);
+ spdk_bdev_module_examine_done(&compress_if);
+}
+
+/* Callback from reduce for when load is complete. We'll pass the vbdev_comp struct
+ * used for initial metadata operations to claim where it will be further filled out
+ * and added to the global list.
+ */
+static void
+vbdev_reduce_load_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno)
+{
+ struct vbdev_compress *meta_ctx = cb_arg;
+
+ if (reduce_errno == 0) {
+ /* Update information following volume load. */
+ meta_ctx->vol = vol;
+ memcpy(&meta_ctx->params, spdk_reduce_vol_get_params(vol),
+ sizeof(struct spdk_reduce_vol_params));
+ }
+
+ meta_ctx->reduce_errno = reduce_errno;
+
+ if (meta_ctx->thread && meta_ctx->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(meta_ctx->thread, _vbdev_reduce_load_cb, meta_ctx);
+ } else {
+ _vbdev_reduce_load_cb(meta_ctx);
+ }
+}
+
+/* Examine_disk entry point: will do a metadata load to see if this is ours,
+ * and if so will go ahead and claim it.
+ */
+static void
+vbdev_compress_examine(struct spdk_bdev *bdev)
+{
+ struct vbdev_compress *meta_ctx;
+ int rc;
+
+ if (strcmp(bdev->product_name, COMP_BDEV_NAME) == 0) {
+ spdk_bdev_module_examine_done(&compress_if);
+ return;
+ }
+
+ meta_ctx = _prepare_for_load_init(bdev, 0);
+ if (meta_ctx == NULL) {
+ spdk_bdev_module_examine_done(&compress_if);
+ return;
+ }
+
+ rc = spdk_bdev_open(meta_ctx->base_bdev, false, vbdev_compress_base_bdev_hotremove_cb,
+ meta_ctx->base_bdev, &meta_ctx->base_desc);
+ if (rc) {
+ SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(meta_ctx->base_bdev));
+ free(meta_ctx);
+ spdk_bdev_module_examine_done(&compress_if);
+ return;
+ }
+
+ /* Save the thread where the base device is opened */
+ meta_ctx->thread = spdk_get_thread();
+
+ meta_ctx->base_ch = spdk_bdev_get_io_channel(meta_ctx->base_desc);
+ spdk_reduce_vol_load(&meta_ctx->backing_dev, vbdev_reduce_load_cb, meta_ctx);
+}
+
+int
+compress_set_pmd(enum compress_pmd *opts)
+{
+ g_opts = *opts;
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_compress", SPDK_LOG_VBDEV_COMPRESS)
diff --git a/src/spdk/module/bdev/compress/vbdev_compress.h b/src/spdk/module/bdev/compress/vbdev_compress.h
new file mode 100644
index 000000000..4dcd78f60
--- /dev/null
+++ b/src/spdk/module/bdev/compress/vbdev_compress.h
@@ -0,0 +1,106 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_COMPRESS_H
+#define SPDK_VBDEV_COMPRESS_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+
+#define LB_SIZE_4K 0x1000UL
+#define LB_SIZE_512B 0x200UL
+
+/**
+ * Get the first compression bdev.
+ *
+ * \return the first compression bdev.
+ */
+struct vbdev_compress *compress_bdev_first(void);
+
+/**
+ * Get the next compression bdev.
+ *
+ * \param prev previous compression bdev.
+ * \return the next compression bdev.
+ */
+struct vbdev_compress *compress_bdev_next(struct vbdev_compress *prev);
+
+/**
+ * Test to see if a compression bdev orphan exists.
+ *
+ * \param name The name of the compression bdev.
+ * \return true if found, false if not.
+ */
+bool compress_has_orphan(const char *name);
+
+/**
+ * Get the name of a compression bdev.
+ *
+ * \param comp_bdev The compression bdev.
+ * \return the name of the compression bdev.
+ */
+const char *compress_get_name(const struct vbdev_compress *comp_bdev);
+
+enum compress_pmd {
+ COMPRESS_PMD_AUTO = 0,
+ COMPRESS_PMD_QAT_ONLY,
+ COMPRESS_PMD_ISAL_ONLY,
+ COMPRESS_PMD_MAX
+};
+
+int compress_set_pmd(enum compress_pmd *opts);
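+
+/*
+ * Illustrative (hypothetical) caller pinning driver selection to ISA-L:
+ *
+ *   enum compress_pmd pmd = COMPRESS_PMD_ISAL_ONLY;
+ *   compress_set_pmd(&pmd);
+ */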
+
+typedef void (*spdk_delete_compress_complete)(void *cb_arg, int bdeverrno);
+
+/**
+ * Create new compression bdev.
+ *
+ * \param bdev_name Bdev on which compression bdev will be created.
+ * \param pm_path Path to persistent memory.
+ * \param lb_size Logical block size for the compressed volume in bytes. Must be 4K or 512.
+ * \return 0 on success, other on failure.
+ */
+int create_compress_bdev(const char *bdev_name, const char *pm_path, uint32_t lb_size);
+
+/**
+ * Delete compress bdev.
+ *
+ * \param bdev_name Name of the compression bdev to delete.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void bdev_compress_delete(const char *bdev_name, spdk_delete_compress_complete cb_fn,
+ void *cb_arg);
+
+#endif /* SPDK_VBDEV_COMPRESS_H */
diff --git a/src/spdk/module/bdev/compress/vbdev_compress_rpc.c b/src/spdk/module/bdev/compress/vbdev_compress_rpc.c
new file mode 100644
index 000000000..9eedae066
--- /dev/null
+++ b/src/spdk/module/bdev/compress/vbdev_compress_rpc.c
@@ -0,0 +1,252 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vbdev_compress.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+struct rpc_bdev_compress_get_orphans {
+ char *name;
+};
+
+static void
+free_rpc_bdev_compress_get_orphans(struct rpc_bdev_compress_get_orphans *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_compress_get_orphans_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_compress_get_orphans, name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_bdev_compress_get_orphans(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_compress_get_orphans req = {};
+ struct spdk_json_write_ctx *w;
+ struct vbdev_compress *comp_bdev;
+ bool found = false;
+
+ if (params && spdk_json_decode_object(params, rpc_bdev_compress_get_orphans_decoders,
+ SPDK_COUNTOF(rpc_bdev_compress_get_orphans_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ free_rpc_bdev_compress_get_orphans(&req);
+ return;
+ }
+
+ if (req.name) {
+ if (compress_has_orphan(req.name) == false) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ free_rpc_bdev_compress_get_orphans(&req);
+ return;
+ }
+ found = true;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ if (found) {
+ spdk_json_write_string(w, req.name);
+ } else {
+ for (comp_bdev = compress_bdev_first(); comp_bdev != NULL;
+ comp_bdev = compress_bdev_next(comp_bdev)) {
+ if (compress_has_orphan(compress_get_name(comp_bdev))) {
+ spdk_json_write_string(w, compress_get_name(comp_bdev));
+ }
+ }
+ }
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+ free_rpc_bdev_compress_get_orphans(&req);
+}
+SPDK_RPC_REGISTER("bdev_compress_get_orphans", rpc_bdev_compress_get_orphans, SPDK_RPC_RUNTIME)
+
+struct rpc_compress_set_pmd {
+ enum compress_pmd pmd;
+};
+
+static const struct spdk_json_object_decoder rpc_compress_pmd_decoder[] = {
+ {"pmd", offsetof(struct rpc_compress_set_pmd, pmd), spdk_json_decode_int32},
+};
+
+static void
+rpc_compress_set_pmd(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_compress_set_pmd req;
+ struct spdk_json_write_ctx *w;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_compress_pmd_decoder,
+ SPDK_COUNTOF(rpc_compress_pmd_decoder),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ return;
+ }
+
+ if (req.pmd >= COMPRESS_PMD_MAX) {
+ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL,
+ "PMD value %d should be less than %d", req.pmd, COMPRESS_PMD_MAX);
+ return;
+ }
+
+ rc = compress_set_pmd(&req.pmd);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ if (w != NULL) {
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ }
+}
+SPDK_RPC_REGISTER("compress_set_pmd", rpc_compress_set_pmd,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(compress_set_pmd, set_compress_pmd)
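+
+/* Illustrative request for this method; pmd values follow enum compress_pmd
+ * (0 = auto, 1 = QAT only, 2 = ISA-L only):
+ *   {"jsonrpc": "2.0", "method": "compress_set_pmd", "id": 1, "params": {"pmd": 0}}
+ * The response is the boolean true on success.
+ */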
+
+/* Structure to hold the parameters for this RPC method. */
+struct rpc_construct_compress {
+ char *base_bdev_name;
+ char *pm_path;
+ uint32_t lb_size;
+};
+
+/* Free the allocated memory resource after the RPC handling. */
+static void
+free_rpc_construct_compress(struct rpc_construct_compress *r)
+{
+ free(r->base_bdev_name);
+ free(r->pm_path);
+}
+
+/* Structure to decode the input parameters for this RPC method. */
+static const struct spdk_json_object_decoder rpc_construct_compress_decoders[] = {
+ {"base_bdev_name", offsetof(struct rpc_construct_compress, base_bdev_name), spdk_json_decode_string},
+ {"pm_path", offsetof(struct rpc_construct_compress, pm_path), spdk_json_decode_string},
+ {"lb_size", offsetof(struct rpc_construct_compress, lb_size), spdk_json_decode_uint32},
+};
+
+/* Decode the parameters for this RPC method and construct the compress
+ * device. An error response is sent on failure.
+ */
+static void
+rpc_bdev_compress_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_compress req = {NULL};
+ struct spdk_json_write_ctx *w;
+ char *name;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_compress_decoders,
+ SPDK_COUNTOF(rpc_construct_compress_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_COMPRESS, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = create_compress_bdev(req.base_bdev_name, req.pm_path, req.lb_size);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ name = spdk_sprintf_alloc("COMP_%s", req.base_bdev_name);
+ spdk_json_write_string(w, name);
+ spdk_jsonrpc_end_result(request, w);
+ free(name);
+
+cleanup:
+ free_rpc_construct_compress(&req);
+}
+SPDK_RPC_REGISTER("bdev_compress_create", rpc_bdev_compress_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_compress_create, construct_compress_bdev)
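+
+/* Illustrative request for this method (bdev name and path are hypothetical):
+ *   {"jsonrpc": "2.0", "method": "bdev_compress_create", "id": 1,
+ *    "params": {"base_bdev_name": "Nvme0n1", "pm_path": "/mnt/pmem0", "lb_size": 4096}}
+ * On success the result is the name of the new bdev, here "COMP_Nvme0n1".
+ */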
+
+struct rpc_delete_compress {
+ char *name;
+};
+
+static void
+free_rpc_delete_compress(struct rpc_delete_compress *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_compress_decoders[] = {
+ {"name", offsetof(struct rpc_delete_compress, name), spdk_json_decode_string},
+};
+
+static void
+_rpc_bdev_compress_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_compress_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_compress req = {NULL};
+
+ if (spdk_json_decode_object(params, rpc_delete_compress_decoders,
+ SPDK_COUNTOF(rpc_delete_compress_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ } else {
+ bdev_compress_delete(req.name, _rpc_bdev_compress_delete_cb, request);
+ }
+
+ free_rpc_delete_compress(&req);
+}
+SPDK_RPC_REGISTER("bdev_compress_delete", rpc_bdev_compress_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_compress_delete, delete_compress_bdev)
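+
+/* Illustrative request for this method (the name is hypothetical):
+ *   {"jsonrpc": "2.0", "method": "bdev_compress_delete", "id": 1,
+ *    "params": {"name": "COMP_Nvme0n1"}}
+ * The result is a boolean reflecting whether the deletion succeeded.
+ */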
diff --git a/src/spdk/module/bdev/crypto/Makefile b/src/spdk/module/bdev/crypto/Makefile
new file mode 100644
index 000000000..dbf96952d
--- /dev/null
+++ b/src/spdk/module/bdev/crypto/Makefile
@@ -0,0 +1,47 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += $(ENV_CFLAGS)
+
+C_SRCS = vbdev_crypto.c vbdev_crypto_rpc.c
+LIBNAME = bdev_crypto
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/crypto/vbdev_crypto.c b/src/spdk/module/bdev/crypto/vbdev_crypto.c
new file mode 100644
index 000000000..f5dd0f814
--- /dev/null
+++ b/src/spdk/module/bdev/crypto/vbdev_crypto.c
@@ -0,0 +1,2040 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vbdev_crypto.h"
+
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/thread.h"
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+#include <rte_config.h>
+#include <rte_version.h>
+#include <rte_bus_vdev.h>
+#include <rte_crypto.h>
+#include <rte_cryptodev.h>
+#include <rte_cryptodev_pmd.h>
+
+/* To add support for new device types, follow the examples below.
+ * Note that the string names are defined by the DPDK PMD in question, so be
+ * sure to use the exact names.
+ */
+#define MAX_NUM_DRV_TYPES 2
+
+/* The VF spread is the number of queue pairs between virtual functions; we use it to
+ * load balance across the QAT device.
+ */
+#define QAT_VF_SPREAD 32
+static uint8_t g_qat_total_qp = 0;
+static uint8_t g_next_qat_index;
+
+const char *g_driver_names[MAX_NUM_DRV_TYPES] = { AESNI_MB, QAT };
+
+/* Global list of available crypto devices. */
+struct vbdev_dev {
+ struct rte_cryptodev_info cdev_info; /* includes device friendly name */
+ uint8_t cdev_id; /* identifier for the device */
+ TAILQ_ENTRY(vbdev_dev) link;
+};
+static TAILQ_HEAD(, vbdev_dev) g_vbdev_devs = TAILQ_HEAD_INITIALIZER(g_vbdev_devs);
+
+/* Global list and lock for unique device/queue pair combos. We keep one list per supported PMD
+ * so that we can optimize per PMD where it makes sense. For example, with QAT there is an optimal
+ * pattern for assigning queue pairs, whereas with AESNI_MB there is not.
+ */
+struct device_qp {
+ struct vbdev_dev *device; /* ptr to crypto device */
+ uint8_t qp; /* queue pair for this node */
+ bool in_use; /* whether this node is in use or not */
+ uint8_t index; /* used by QAT to load balance placement of qpairs */
+ TAILQ_ENTRY(device_qp) link;
+};
+static TAILQ_HEAD(, device_qp) g_device_qp_qat = TAILQ_HEAD_INITIALIZER(g_device_qp_qat);
+static TAILQ_HEAD(, device_qp) g_device_qp_aesni_mb = TAILQ_HEAD_INITIALIZER(g_device_qp_aesni_mb);
+static pthread_mutex_t g_device_qp_lock = PTHREAD_MUTEX_INITIALIZER;
+
+
+/* In order to limit the number of resources we need to do one crypto
+ * operation per LBA (we use LBA as IV), we tell the bdev layer that
+ * our max IO size is something reasonable. Units here are in bytes.
+ */
+#define CRYPTO_MAX_IO (64 * 1024)
+
+/* This controls how many ops will be dequeued from the crypto driver in one run
+ * of the poller. It is mainly a performance knob as it effectively determines how
+ * much work the poller has to do. However even that can vary between crypto drivers
+ * as the AESNI_MB driver for example does all the crypto work on dequeue whereas the
+ * QAT driver just dequeues what has been completed already.
+ */
+#define MAX_DEQUEUE_BURST_SIZE 64
+
+/* When enqueueing, we need to supply the crypto driver with an array of pointers to
+ * operation structs. As each of these can be max 512B, we can adjust the CRYPTO_MAX_IO
+ * value in conjunction with the other defines to make sure we're not using crazy amounts
+ * of memory. All of these numbers can and probably should be adjusted based on the
+ * workload. By default we'll use the worst case (smallest) block size for the
+ * minimum number of array entries. As an example, a CRYPTO_MAX_IO size of 64K with 512B
+ * blocks would give us an enqueue array size of 128.
+ */
+#define MAX_ENQUEUE_ARRAY_SIZE (CRYPTO_MAX_IO / 512)
+
+/* The number of MBUFS we need must be a power of two, and to support other small IOs
+ * in addition to the limits mentioned above, we go to the next power of two. It is
+ * a big number because one mempool serves both source and destination mbufs. It may
+ * need to be bigger to support multiple crypto drivers at once.
+ */
+#define NUM_MBUFS 32768
+#define POOL_CACHE_SIZE 256
+#define MAX_CRYPTO_VOLUMES 128
+#define NUM_SESSIONS (2 * MAX_CRYPTO_VOLUMES)
+#define SESS_MEMPOOL_CACHE_SIZE 0
+uint8_t g_number_of_claimed_volumes = 0;
+
+/* This is the max number of IOs we can supply to any crypto device QP at one time.
+ * It can vary between drivers.
+ */
+#define CRYPTO_QP_DESCRIPTORS 2048
+
+/* Specific to AES_CBC. */
+#define AES_CBC_IV_LENGTH 16
+#define AES_CBC_KEY_LENGTH 16
+#define AES_XTS_KEY_LENGTH 16 /* XTS uses 2 keys, each of this size. */
+#define AESNI_MB_NUM_QP 64
+
+/* Common for supported devices. */
+#define IV_OFFSET (sizeof(struct rte_crypto_op) + \
+ sizeof(struct rte_crypto_sym_op))
+#define QUEUED_OP_OFFSET (IV_OFFSET + AES_CBC_IV_LENGTH)
+
+static void _complete_internal_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+static void _complete_internal_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+static void _complete_internal_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+static void vbdev_crypto_examine(struct spdk_bdev *bdev);
+static int vbdev_crypto_claim(struct spdk_bdev *bdev);
+static void vbdev_crypto_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
+
+/* List of crypto_bdev names and their base bdevs via configuration file. */
+struct bdev_names {
+ char *vbdev_name; /* name of the vbdev to create */
+ char *bdev_name; /* base bdev name */
+
+	/* Note: for dev/test we allow use of a key in the config file; for production
+	 * use, you must specify the key via RPC for security reasons.
+ */
+ uint8_t *key; /* key per bdev */
+ char *drv_name; /* name of the crypto device driver */
+ char *cipher; /* AES_CBC or AES_XTS */
+ uint8_t *key2; /* key #2 for AES_XTS, per bdev */
+ TAILQ_ENTRY(bdev_names) link;
+};
+static TAILQ_HEAD(, bdev_names) g_bdev_names = TAILQ_HEAD_INITIALIZER(g_bdev_names);
+
+/* List of virtual bdevs and associated info for each. We keep the device friendly name here even
+ * though it's also in the device struct, because we use it early on.
+ */
+struct vbdev_crypto {
+ struct spdk_bdev *base_bdev; /* the thing we're attaching to */
+ struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
+ struct spdk_bdev crypto_bdev; /* the crypto virtual bdev */
+ uint8_t *key; /* key per bdev */
+ uint8_t *key2; /* for XTS */
+ uint8_t *xts_key; /* key + key 2 */
+ char *drv_name; /* name of the crypto device driver */
+ char *cipher; /* cipher used */
+ struct rte_cryptodev_sym_session *session_encrypt; /* encryption session for this bdev */
+ struct rte_cryptodev_sym_session *session_decrypt; /* decryption session for this bdev */
+ struct rte_crypto_sym_xform cipher_xform; /* crypto control struct for this bdev */
+ TAILQ_ENTRY(vbdev_crypto) link;
+ struct spdk_thread *thread; /* thread where base device is opened */
+};
+static TAILQ_HEAD(, vbdev_crypto) g_vbdev_crypto = TAILQ_HEAD_INITIALIZER(g_vbdev_crypto);
+
+/* Shared mempools between all devices on this system */
+static struct rte_mempool *g_session_mp = NULL;
+static struct rte_mempool *g_session_mp_priv = NULL;
+static struct spdk_mempool *g_mbuf_mp = NULL; /* mbuf mempool */
+static struct rte_mempool *g_crypto_op_mp = NULL; /* crypto operations, must be rte* mempool */
+
+/* For queueing up crypto operations that we can't submit for some reason */
+struct vbdev_crypto_op {
+ uint8_t cdev_id;
+ uint8_t qp;
+ struct rte_crypto_op *crypto_op;
+ struct spdk_bdev_io *bdev_io;
+ TAILQ_ENTRY(vbdev_crypto_op) link;
+};
+#define QUEUED_OP_LENGTH (sizeof(struct vbdev_crypto_op))
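+
+/* Rough sketch of each crypto op and its per-op private data area as sized by
+ * the defines above (not to scale):
+ *
+ *   | rte_crypto_op | rte_crypto_sym_op | IV                | struct vbdev_crypto_op |
+ *                                       ^ IV_OFFSET         ^ QUEUED_OP_OFFSET
+ */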
+
+/* The crypto vbdev channel struct. It is allocated and freed on our behalf by the IO channel code.
+ * We store things here that are needed on a per-thread basis, like the base_channel for this thread,
+ * and the poller for this thread.
+ */
+struct crypto_io_channel {
+ struct spdk_io_channel *base_ch; /* IO channel of base device */
+ struct spdk_poller *poller; /* completion poller */
+ struct device_qp *device_qp; /* unique device/qp combination for this channel */
+ TAILQ_HEAD(, spdk_bdev_io) pending_cry_ios; /* outstanding operations to the crypto device */
+ struct spdk_io_channel_iter *iter; /* used with for_each_channel in reset */
+ TAILQ_HEAD(, vbdev_crypto_op) queued_cry_ops; /* queued for re-submission to CryptoDev */
+};
+
+/* This is the per-IO crypto context that the bdev layer opaquely allocates for us and attaches to
+ * each IO.
+ */
+struct crypto_bdev_io {
+ int cryop_cnt_remaining; /* counter used when completing crypto ops */
+ struct crypto_io_channel *crypto_ch; /* need to store for crypto completion handling */
+ struct vbdev_crypto *crypto_bdev; /* the crypto node struct associated with this IO */
+ struct spdk_bdev_io *orig_io; /* the original IO */
+ struct spdk_bdev_io *read_io; /* the read IO we issued */
+ int8_t bdev_io_status; /* the status we'll report back on the bdev IO */
+ bool on_pending_list;
+ /* Used for the single contiguous buffer that serves as the crypto destination target for writes */
+ uint64_t aux_num_blocks; /* num of blocks for the contiguous buffer */
+ uint64_t aux_offset_blocks; /* block offset on media */
+ void *aux_buf_raw; /* raw buffer that the bdev layer gave us for write buffer */
+ struct iovec aux_buf_iov; /* iov representing aligned contig write buffer */
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+ struct spdk_io_channel *ch;
+};
+
+/* Called by vbdev_crypto_init_crypto_drivers() to init each discovered crypto device */
+static int
+create_vbdev_dev(uint8_t index, uint16_t num_lcores)
+{
+ struct vbdev_dev *device;
+ uint8_t j, cdev_id, cdrv_id;
+ struct device_qp *dev_qp;
+ struct device_qp *tmp_qp;
+ int rc;
+ TAILQ_HEAD(device_qps, device_qp) *dev_qp_head;
+
+ device = calloc(1, sizeof(struct vbdev_dev));
+ if (!device) {
+ return -ENOMEM;
+ }
+
+ /* Get details about this device. */
+ rte_cryptodev_info_get(index, &device->cdev_info);
+ cdrv_id = device->cdev_info.driver_id;
+ cdev_id = device->cdev_id = index;
+
+ /* Before going any further, make sure we have enough resources for this
+	 * device type to function. We need a unique queue pair per core across each
+	 * device type to remain lockless.
+ */
+ if ((rte_cryptodev_device_count_by_driver(cdrv_id) *
+ device->cdev_info.max_nb_queue_pairs) < num_lcores) {
+ SPDK_ERRLOG("Insufficient unique queue pairs available for %s\n",
+ device->cdev_info.driver_name);
+ SPDK_ERRLOG("Either add more crypto devices or decrease core count\n");
+ rc = -EINVAL;
+ goto err;
+ }
+
+ /* Setup queue pairs. */
+ struct rte_cryptodev_config conf = {
+ .nb_queue_pairs = device->cdev_info.max_nb_queue_pairs,
+ .socket_id = SPDK_ENV_SOCKET_ID_ANY
+ };
+
+ rc = rte_cryptodev_configure(cdev_id, &conf);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to configure cryptodev %u\n", cdev_id);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ struct rte_cryptodev_qp_conf qp_conf = {
+ .nb_descriptors = CRYPTO_QP_DESCRIPTORS,
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ .mp_session = g_session_mp,
+ .mp_session_private = g_session_mp_priv,
+#endif
+ };
+
+ /* Pre-setup all potential qpairs now and assign them in the channel
+ * callback. If we were to create them there, we'd have to stop the
+	 * entire device, affecting all other threads that might be using it,
+ * even on other queue pairs.
+ */
+ for (j = 0; j < device->cdev_info.max_nb_queue_pairs; j++) {
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ rc = rte_cryptodev_queue_pair_setup(cdev_id, j, &qp_conf, SOCKET_ID_ANY);
+#else
+ rc = rte_cryptodev_queue_pair_setup(cdev_id, j, &qp_conf, SOCKET_ID_ANY,
+ g_session_mp);
+#endif
+
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to setup queue pair %u on "
+ "cryptodev %u\n", j, cdev_id);
+ rc = -EINVAL;
+ goto err;
+ }
+ }
+
+ rc = rte_cryptodev_start(cdev_id);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to start device %u: error %d\n",
+ cdev_id, rc);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ /* Select the right device/qp list based on driver name
+ * or error if it does not exist.
+ */
+ if (strcmp(device->cdev_info.driver_name, QAT) == 0) {
+ dev_qp_head = (struct device_qps *)&g_device_qp_qat;
+ } else if (strcmp(device->cdev_info.driver_name, AESNI_MB) == 0) {
+ dev_qp_head = (struct device_qps *)&g_device_qp_aesni_mb;
+ } else {
+ rc = -EINVAL;
+ goto err;
+ }
+
+ /* Build up lists of device/qp combinations per PMD */
+ for (j = 0; j < device->cdev_info.max_nb_queue_pairs; j++) {
+ dev_qp = calloc(1, sizeof(struct device_qp));
+ if (!dev_qp) {
+ rc = -ENOMEM;
+ goto err_qp_alloc;
+ }
+ dev_qp->device = device;
+ dev_qp->qp = j;
+ dev_qp->in_use = false;
+ if (strcmp(device->cdev_info.driver_name, QAT) == 0) {
+ g_qat_total_qp++;
+ }
+ TAILQ_INSERT_TAIL(dev_qp_head, dev_qp, link);
+ }
+
+ /* Add to our list of available crypto devices. */
+ TAILQ_INSERT_TAIL(&g_vbdev_devs, device, link);
+
+ return 0;
+err_qp_alloc:
+ TAILQ_FOREACH_SAFE(dev_qp, dev_qp_head, link, tmp_qp) {
+ TAILQ_REMOVE(dev_qp_head, dev_qp, link);
+ free(dev_qp);
+ }
+err:
+ free(device);
+
+ return rc;
+}
+
+/* This is called from the module's init function. We set up all crypto devices early on, as we are unable
+ * to easily dynamically configure queue pairs after the drivers are up and running. So, here, we
+ * configure the max capabilities of each device and assign threads to queue pairs as channels are
+ * requested.
+ */
+static int
+vbdev_crypto_init_crypto_drivers(void)
+{
+ uint8_t cdev_count;
+ uint8_t cdev_id;
+ int i, rc = 0;
+ struct vbdev_dev *device;
+ struct vbdev_dev *tmp_dev;
+ struct device_qp *dev_qp;
+ unsigned int max_sess_size = 0, sess_size;
+ uint16_t num_lcores = rte_lcore_count();
+ char aesni_args[32];
+
+	/* Only the first call, via RPC or module init, should init the crypto drivers. */
+ if (g_session_mp != NULL) {
+ return 0;
+ }
+
+ /* We always init AESNI_MB */
+ snprintf(aesni_args, sizeof(aesni_args), "max_nb_queue_pairs=%d", AESNI_MB_NUM_QP);
+ rc = rte_vdev_init(AESNI_MB, aesni_args);
+ if (rc) {
+ SPDK_ERRLOG("error creating virtual PMD %s\n", AESNI_MB);
+ return -EINVAL;
+ }
+
+ /* If we have no crypto devices, there's no reason to continue. */
+ cdev_count = rte_cryptodev_count();
+ if (cdev_count == 0) {
+ return 0;
+ }
+
+ /*
+ * Create global mempools, shared by all devices regardless of type.
+ */
+
+	/* First determine the max session size. Most pools are shared by all the devices,
+	 * so we need to find the global max session size.
+ */
+ for (cdev_id = 0; cdev_id < cdev_count; cdev_id++) {
+ sess_size = rte_cryptodev_sym_get_private_session_size(cdev_id);
+ if (sess_size > max_sess_size) {
+ max_sess_size = sess_size;
+ }
+ }
+
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ g_session_mp_priv = rte_mempool_create("session_mp_priv", NUM_SESSIONS, max_sess_size,
+ SESS_MEMPOOL_CACHE_SIZE, 0, NULL, NULL, NULL,
+ NULL, SOCKET_ID_ANY, 0);
+ if (g_session_mp_priv == NULL) {
+ SPDK_ERRLOG("Cannot create private session pool max size 0x%x\n", max_sess_size);
+ return -ENOMEM;
+ }
+
+ g_session_mp = rte_cryptodev_sym_session_pool_create(
+ "session_mp",
+ NUM_SESSIONS, 0, SESS_MEMPOOL_CACHE_SIZE, 0,
+ SOCKET_ID_ANY);
+#else
+ g_session_mp = rte_mempool_create("session_mp", NUM_SESSIONS, max_sess_size,
+ SESS_MEMPOOL_CACHE_SIZE,
+ 0, NULL, NULL, NULL, NULL, SOCKET_ID_ANY, 0);
+#endif
+ if (g_session_mp == NULL) {
+ SPDK_ERRLOG("Cannot create session pool max size 0x%x\n", max_sess_size);
+		rc = -ENOMEM;
+		goto error_create_session_mp;
+ }
+
+ g_mbuf_mp = spdk_mempool_create("mbuf_mp", NUM_MBUFS, sizeof(struct rte_mbuf),
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (g_mbuf_mp == NULL) {
+ SPDK_ERRLOG("Cannot create mbuf pool\n");
+ rc = -ENOMEM;
+ goto error_create_mbuf;
+ }
+
+ /* We use per op private data to store the IV and our own struct
+ * for queueing ops.
+ */
+ g_crypto_op_mp = rte_crypto_op_pool_create("op_mp",
+ RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ NUM_MBUFS,
+ POOL_CACHE_SIZE,
+ AES_CBC_IV_LENGTH + QUEUED_OP_LENGTH,
+ rte_socket_id());
+
+ if (g_crypto_op_mp == NULL) {
+ SPDK_ERRLOG("Cannot create op pool\n");
+ rc = -ENOMEM;
+ goto error_create_op;
+ }
+
+ /* Init all devices */
+ for (i = 0; i < cdev_count; i++) {
+ rc = create_vbdev_dev(i, num_lcores);
+ if (rc) {
+ goto err;
+ }
+ }
+
+ /* Assign index values to the QAT device qp nodes so that we can
+	 * hand them out for optimal performance.
+ */
+ i = 0;
+ TAILQ_FOREACH(dev_qp, &g_device_qp_qat, link) {
+ dev_qp->index = i++;
+ }
+
+ return 0;
+
+ /* Error cleanup paths. */
+err:
+ TAILQ_FOREACH_SAFE(device, &g_vbdev_devs, link, tmp_dev) {
+ TAILQ_REMOVE(&g_vbdev_devs, device, link);
+ free(device);
+ }
+ rte_mempool_free(g_crypto_op_mp);
+ g_crypto_op_mp = NULL;
+error_create_op:
+ spdk_mempool_free(g_mbuf_mp);
+ g_mbuf_mp = NULL;
+error_create_mbuf:
+ rte_mempool_free(g_session_mp);
+ g_session_mp = NULL;
+error_create_session_mp:
+ if (g_session_mp_priv != NULL) {
+ rte_mempool_free(g_session_mp_priv);
+ g_session_mp_priv = NULL;
+ }
+ return rc;
+}
+
+/* Following an encrypt or decrypt, we then need to either write the encrypted data or finish
+ * the read on the decrypted data. Do that here.
+ */
+static void
+_crypto_operation_complete(struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto,
+ crypto_bdev);
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ struct crypto_io_channel *crypto_ch = io_ctx->crypto_ch;
+ struct spdk_bdev_io *free_me = io_ctx->read_io;
+ int rc = 0;
+
+ TAILQ_REMOVE(&crypto_ch->pending_cry_ios, bdev_io, module_link);
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+
+ /* Complete the original IO and then free the one that we created
+ * as a result of issuing an IO via submit_request.
+ */
+ if (io_ctx->bdev_io_status != SPDK_BDEV_IO_STATUS_FAILED) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ SPDK_ERRLOG("Issue with decryption on bdev_io %p\n", bdev_io);
+ rc = -EINVAL;
+ }
+ spdk_bdev_free_io(free_me);
+
+ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+
+ if (io_ctx->bdev_io_status != SPDK_BDEV_IO_STATUS_FAILED) {
+ /* Write the encrypted data. */
+ rc = spdk_bdev_writev_blocks(crypto_bdev->base_desc, crypto_ch->base_ch,
+ &io_ctx->aux_buf_iov, 1, io_ctx->aux_offset_blocks,
+ io_ctx->aux_num_blocks, _complete_internal_write,
+ bdev_io);
+ } else {
+ SPDK_ERRLOG("Issue with encryption on bdev_io %p\n", bdev_io);
+ rc = -EINVAL;
+ }
+
+ } else {
+ SPDK_ERRLOG("Unknown bdev type %u on crypto operation completion\n",
+ bdev_io->type);
+ rc = -EINVAL;
+ }
+
+ if (rc) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static int _crypto_operation(struct spdk_bdev_io *bdev_io,
+ enum rte_crypto_cipher_operation crypto_op,
+ void *aux_buf);
+
+/* This is the poller for the crypto device. It uses a single API to dequeue whatever is ready at
+ * the device. Then we need to decide if what we've got so far (including previous poller
+ * runs) totals up to one or more complete bdev_ios and if so continue with the bdev_io
+ * accordingly. This means either completing a read or issuing a new write.
+ */
+static int
+crypto_dev_poller(void *args)
+{
+ struct crypto_io_channel *crypto_ch = args;
+ uint8_t cdev_id = crypto_ch->device_qp->device->cdev_id;
+ int i, num_dequeued_ops, num_enqueued_ops;
+ struct spdk_bdev_io *bdev_io = NULL;
+ struct crypto_bdev_io *io_ctx = NULL;
+ struct rte_crypto_op *dequeued_ops[MAX_DEQUEUE_BURST_SIZE];
+ struct rte_crypto_op *mbufs_to_free[2 * MAX_DEQUEUE_BURST_SIZE];
+ int num_mbufs = 0;
+ struct vbdev_crypto_op *op_to_resubmit;
+
+ /* Each run of the poller will get just what the device has available
+	 * at the moment we call it; we don't check again after draining the
+ * first batch.
+ */
+ num_dequeued_ops = rte_cryptodev_dequeue_burst(cdev_id, crypto_ch->device_qp->qp,
+ dequeued_ops, MAX_DEQUEUE_BURST_SIZE);
+
+ /* Check if operation was processed successfully */
+ for (i = 0; i < num_dequeued_ops; i++) {
+
+		/* We don't know the order or association of the crypto ops wrt any
+		 * particular bdev_io, so we need to look at each one and determine
+		 * whether it is the last one for its bdev_io or not.
+ */
+ bdev_io = (struct spdk_bdev_io *)dequeued_ops[i]->sym->m_src->userdata;
+ assert(bdev_io != NULL);
+ io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+
+ if (dequeued_ops[i]->status != RTE_CRYPTO_OP_STATUS_SUCCESS) {
+ SPDK_ERRLOG("error with op %d status %u\n", i,
+ dequeued_ops[i]->status);
+ /* Update the bdev status to error, we'll still process the
+ * rest of the crypto ops for this bdev_io though so they
+ * aren't left hanging.
+ */
+ io_ctx->bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+
+ assert(io_ctx->cryop_cnt_remaining > 0);
+
+ /* Return the associated src and dst mbufs by collecting them into
+ * an array that we can use the bulk API to free after the loop.
+ */
+ dequeued_ops[i]->sym->m_src->userdata = NULL;
+ mbufs_to_free[num_mbufs++] = (void *)dequeued_ops[i]->sym->m_src;
+ if (dequeued_ops[i]->sym->m_dst) {
+ mbufs_to_free[num_mbufs++] = (void *)dequeued_ops[i]->sym->m_dst;
+ }
+
+		/* All crypto ops for this bdev_io are done, complete it. */
+ if (--io_ctx->cryop_cnt_remaining == 0) {
+
+ /* If we're completing this with an outstanding reset we need
+ * to fail it.
+ */
+ if (crypto_ch->iter) {
+ io_ctx->bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+
+ /* Complete the IO */
+ _crypto_operation_complete(bdev_io);
+ }
+ }
+
+ /* Now bulk free both mbufs and crypto operations. */
+ if (num_dequeued_ops > 0) {
+ rte_mempool_put_bulk(g_crypto_op_mp,
+ (void **)dequeued_ops,
+ num_dequeued_ops);
+ assert(num_mbufs > 0);
+ spdk_mempool_put_bulk(g_mbuf_mp,
+ (void **)mbufs_to_free,
+ num_mbufs);
+ }
+
+ /* Check if there are any pending crypto ops to process */
+ while (!TAILQ_EMPTY(&crypto_ch->queued_cry_ops)) {
+ op_to_resubmit = TAILQ_FIRST(&crypto_ch->queued_cry_ops);
+ io_ctx = (struct crypto_bdev_io *)op_to_resubmit->bdev_io->driver_ctx;
+ num_enqueued_ops = rte_cryptodev_enqueue_burst(op_to_resubmit->cdev_id,
+ op_to_resubmit->qp,
+ &op_to_resubmit->crypto_op,
+ 1);
+ if (num_enqueued_ops == 1) {
+ /* Make sure we don't put this on twice as one bdev_io is made up
+ * of many crypto ops.
+ */
+ if (io_ctx->on_pending_list == false) {
+ TAILQ_INSERT_TAIL(&crypto_ch->pending_cry_ios, op_to_resubmit->bdev_io, module_link);
+ io_ctx->on_pending_list = true;
+ }
+ TAILQ_REMOVE(&crypto_ch->queued_cry_ops, op_to_resubmit, link);
+ } else {
+ /* if we couldn't get one, just break and try again later. */
+ break;
+ }
+ }
+
+ /* If the channel iter is not NULL, we need to continue to poll
+ * until the pending list is empty, then we can move on to the
+ * next channel.
+ */
+ if (crypto_ch->iter && TAILQ_EMPTY(&crypto_ch->pending_cry_ios)) {
+ SPDK_NOTICELOG("Channel %p has been quiesced.\n", crypto_ch);
+ spdk_for_each_channel_continue(crypto_ch->iter, 0);
+ crypto_ch->iter = NULL;
+ }
+
+ return num_dequeued_ops;
+}
+
+/* We're either encrypting on the way down or decrypting on the way back. */
+static int
+_crypto_operation(struct spdk_bdev_io *bdev_io, enum rte_crypto_cipher_operation crypto_op,
+ void *aux_buf)
+{
+ uint16_t num_enqueued_ops = 0;
+ uint32_t cryop_cnt = bdev_io->u.bdev.num_blocks;
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ struct crypto_io_channel *crypto_ch = io_ctx->crypto_ch;
+ uint8_t cdev_id = crypto_ch->device_qp->device->cdev_id;
+ uint32_t crypto_len = io_ctx->crypto_bdev->crypto_bdev.blocklen;
+ uint64_t total_length = bdev_io->u.bdev.num_blocks * crypto_len;
+ int rc;
+ uint32_t iov_index = 0;
+ uint32_t allocated = 0;
+ uint8_t *current_iov = NULL;
+ uint64_t total_remaining = 0;
+ uint64_t updated_length, current_iov_remaining = 0;
+ uint32_t crypto_index = 0;
+ uint32_t en_offset = 0;
+ struct rte_crypto_op *crypto_ops[MAX_ENQUEUE_ARRAY_SIZE];
+ struct rte_mbuf *src_mbufs[MAX_ENQUEUE_ARRAY_SIZE];
+ struct rte_mbuf *dst_mbufs[MAX_ENQUEUE_ARRAY_SIZE];
+ int burst;
+ struct vbdev_crypto_op *op_to_queue;
+ uint64_t alignment = spdk_bdev_get_buf_align(&io_ctx->crypto_bdev->crypto_bdev);
+
+ assert((bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen) <= CRYPTO_MAX_IO);
+
+ /* Get the number of source mbufs that we need. These will always be 1:1 because we
+	 * don't support chaining. The reason we don't is our decision to use the
+	 * LBA as the IV: there can be no case where we'd need >1 mbuf per crypto op, or the
+	 * op would be > 1 LBA.
+ */
+ rc = spdk_mempool_get_bulk(g_mbuf_mp, (void **)&src_mbufs[0], cryop_cnt);
+ if (rc) {
+ SPDK_ERRLOG("ERROR trying to get src_mbufs!\n");
+ return -ENOMEM;
+ }
+
+	/* Get the same number of mbufs; these will describe the encrypted data location (dst). */
+ if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) {
+ rc = spdk_mempool_get_bulk(g_mbuf_mp, (void **)&dst_mbufs[0], cryop_cnt);
+ if (rc) {
+ SPDK_ERRLOG("ERROR trying to get dst_mbufs!\n");
+ rc = -ENOMEM;
+ goto error_get_dst;
+ }
+ }
+
+#ifdef __clang_analyzer__
+ /* silence scan-build false positive */
+ SPDK_CLANG_ANALYZER_PREINIT_PTR_ARRAY(crypto_ops, MAX_ENQUEUE_ARRAY_SIZE, 0x1000);
+#endif
+ /* Allocate crypto operations. */
+ allocated = rte_crypto_op_bulk_alloc(g_crypto_op_mp,
+ RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ crypto_ops, cryop_cnt);
+ if (allocated < cryop_cnt) {
+ SPDK_ERRLOG("ERROR trying to get crypto ops!\n");
+ rc = -ENOMEM;
+ goto error_get_ops;
+ }
+
+ /* For encryption, we need to prepare a single contiguous buffer as the encryption
+ * destination, we'll then pass that along for the write after encryption is done.
+	 * This is done to avoid encrypting the provided write buffer, which may be
+ * undesirable in some use cases.
+ */
+ if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) {
+ io_ctx->aux_buf_iov.iov_len = total_length;
+ io_ctx->aux_buf_raw = aux_buf;
+ io_ctx->aux_buf_iov.iov_base = (void *)(((uintptr_t)aux_buf + (alignment - 1)) & ~(alignment - 1));
+ io_ctx->aux_offset_blocks = bdev_io->u.bdev.offset_blocks;
+ io_ctx->aux_num_blocks = bdev_io->u.bdev.num_blocks;
+ }
+
+ /* This value is used in the completion callback to determine when the bdev_io is
+ * complete.
+ */
+ io_ctx->cryop_cnt_remaining = cryop_cnt;
+
+ /* As we don't support chaining because of a decision to use LBA as IV, construction
+ * of crypto operations is straightforward. We build both the op, the mbuf and the
+ * dst_mbuf in our local arrays by looping through the length of the bdev IO and
+ * picking off LBA sized blocks of memory from the IOVs as we walk through them. Each
+ * LBA sized chunk of memory will correspond 1:1 to a crypto operation and a single
+ * mbuf per crypto operation.
+ */
+ total_remaining = total_length;
+ current_iov = bdev_io->u.bdev.iovs[iov_index].iov_base;
+ current_iov_remaining = bdev_io->u.bdev.iovs[iov_index].iov_len;
+ do {
+ uint8_t *iv_ptr;
+ uint64_t op_block_offset;
+
+		/* Set the mbuf element's address and length. Null out the next pointer. */
+ src_mbufs[crypto_index]->buf_addr = current_iov;
+ src_mbufs[crypto_index]->data_len = updated_length = crypto_len;
+ /* TODO: Make this assignment conditional on QAT usage and add an assert. */
+ src_mbufs[crypto_index]->buf_iova = spdk_vtophys((void *)current_iov, &updated_length);
+ src_mbufs[crypto_index]->next = NULL;
+ /* Store context in every mbuf as we don't know anything about completion order */
+ src_mbufs[crypto_index]->userdata = bdev_io;
+
+ /* Set the IV - we use the LBA of the crypto_op */
+ iv_ptr = rte_crypto_op_ctod_offset(crypto_ops[crypto_index], uint8_t *,
+ IV_OFFSET);
+ memset(iv_ptr, 0, AES_CBC_IV_LENGTH);
+ op_block_offset = bdev_io->u.bdev.offset_blocks + crypto_index;
+ rte_memcpy(iv_ptr, &op_block_offset, sizeof(uint64_t));
+
+ /* Set the data to encrypt/decrypt length */
+ crypto_ops[crypto_index]->sym->cipher.data.length = crypto_len;
+ crypto_ops[crypto_index]->sym->cipher.data.offset = 0;
+
+ /* link the mbuf to the crypto op. */
+ crypto_ops[crypto_index]->sym->m_src = src_mbufs[crypto_index];
+ if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) {
+ crypto_ops[crypto_index]->sym->m_dst = src_mbufs[crypto_index];
+ } else {
+ crypto_ops[crypto_index]->sym->m_dst = NULL;
+ }
+
+		/* For encrypt, point the destination to the buffer we allocated and redirect the
+		 * write issued on completion to that same buffer. Setting up the destination mbuf
+		 * is a little simpler as we know the destination buffer is a single IOV.
+ */
+ if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) {
+
+ /* Set the relevant destination en_mbuf elements. */
+ dst_mbufs[crypto_index]->buf_addr = io_ctx->aux_buf_iov.iov_base + en_offset;
+ dst_mbufs[crypto_index]->data_len = updated_length = crypto_len;
+ /* TODO: Make this assignment conditional on QAT usage and add an assert. */
+ dst_mbufs[crypto_index]->buf_iova = spdk_vtophys(dst_mbufs[crypto_index]->buf_addr,
+ &updated_length);
+ crypto_ops[crypto_index]->sym->m_dst = dst_mbufs[crypto_index];
+ en_offset += crypto_len;
+ dst_mbufs[crypto_index]->next = NULL;
+
+ /* Attach the crypto session to the operation */
+ rc = rte_crypto_op_attach_sym_session(crypto_ops[crypto_index],
+ io_ctx->crypto_bdev->session_encrypt);
+ if (rc) {
+ rc = -EINVAL;
+ goto error_attach_session;
+ }
+
+ } else {
+ /* Attach the crypto session to the operation */
+ rc = rte_crypto_op_attach_sym_session(crypto_ops[crypto_index],
+ io_ctx->crypto_bdev->session_decrypt);
+ if (rc) {
+ rc = -EINVAL;
+ goto error_attach_session;
+ }
+
+ }
+
+ /* Subtract our running totals for the op in progress and the overall bdev io */
+ total_remaining -= crypto_len;
+ current_iov_remaining -= crypto_len;
+
+ /* move our current IOV pointer accordingly. */
+ current_iov += crypto_len;
+
+ /* move on to the next crypto operation */
+ crypto_index++;
+
+ /* If we're done with this IOV, move to the next one. */
+ if (current_iov_remaining == 0 && total_remaining > 0) {
+ iov_index++;
+ current_iov = bdev_io->u.bdev.iovs[iov_index].iov_base;
+ current_iov_remaining = bdev_io->u.bdev.iovs[iov_index].iov_len;
+ }
+ } while (total_remaining > 0);
+
+ /* Enqueue everything we've got but limit by the max number of descriptors we
+ * configured the crypto device for.
+ */
+ burst = spdk_min(cryop_cnt, CRYPTO_QP_DESCRIPTORS);
+ num_enqueued_ops = rte_cryptodev_enqueue_burst(cdev_id, crypto_ch->device_qp->qp,
+ &crypto_ops[0],
+ burst);
+
+ /* Add this bdev_io to our outstanding list if any of its crypto ops made it. */
+ if (num_enqueued_ops > 0) {
+ TAILQ_INSERT_TAIL(&crypto_ch->pending_cry_ios, bdev_io, module_link);
+ io_ctx->on_pending_list = true;
+ }
+	/* If we were unable to enqueue everything, decide what to do based on the
+	 * status of the first op that failed to enqueue.
+ */
+ if (num_enqueued_ops < cryop_cnt) {
+ switch (crypto_ops[num_enqueued_ops]->status) {
+ case RTE_CRYPTO_OP_STATUS_NOT_PROCESSED:
+ /* Queue them up on a linked list to be resubmitted via the poller. */
+ for (crypto_index = num_enqueued_ops; crypto_index < cryop_cnt; crypto_index++) {
+ op_to_queue = (struct vbdev_crypto_op *)rte_crypto_op_ctod_offset(crypto_ops[crypto_index],
+ uint8_t *, QUEUED_OP_OFFSET);
+ op_to_queue->cdev_id = cdev_id;
+ op_to_queue->qp = crypto_ch->device_qp->qp;
+ op_to_queue->crypto_op = crypto_ops[crypto_index];
+ op_to_queue->bdev_io = bdev_io;
+ TAILQ_INSERT_TAIL(&crypto_ch->queued_cry_ops,
+ op_to_queue,
+ link);
+ }
+ break;
+ default:
+ /* For all other statuses, set the io_ctx bdev_io status so that
+ * the poller will pick the failure up for the overall bdev status.
+ */
+ io_ctx->bdev_io_status = SPDK_BDEV_IO_STATUS_FAILED;
+ if (num_enqueued_ops == 0) {
+				/* If nothing was enqueued and the failure wasn't because the
+				 * device was busy, fail the IO now as the poller won't know
+				 * anything about it.
+ */
+ _crypto_operation_complete(bdev_io);
+ rc = -EINVAL;
+ goto error_attach_session;
+ }
+ break;
+ }
+ }
+
+ return rc;
+
+ /* Error cleanup paths. */
+error_attach_session:
+error_get_ops:
+ if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) {
+ spdk_mempool_put_bulk(g_mbuf_mp, (void **)&dst_mbufs[0],
+ cryop_cnt);
+ }
+ if (allocated > 0) {
+ rte_mempool_put_bulk(g_crypto_op_mp, (void **)crypto_ops,
+ allocated);
+ }
+error_get_dst:
+ spdk_mempool_put_bulk(g_mbuf_mp, (void **)&src_mbufs[0],
+ cryop_cnt);
+ return rc;
+}
+
+/* This function is called after all channels have been quiesced following
+ * a bdev reset.
+ */
+static void
+_ch_quiesce_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct crypto_bdev_io *io_ctx = spdk_io_channel_iter_get_ctx(i);
+
+ assert(TAILQ_EMPTY(&io_ctx->crypto_ch->pending_cry_ios));
+ assert(io_ctx->orig_io != NULL);
+
+ spdk_bdev_io_complete(io_ctx->orig_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+}
+
+/* This function is called per channel to quiesce IOs before completing a
+ * bdev reset that we received.
+ */
+static void
+_ch_quiesce(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct crypto_io_channel *crypto_ch = spdk_io_channel_get_ctx(ch);
+
+ crypto_ch->iter = i;
+ /* When the poller runs, it will see the non-NULL iter and handle
+ * the quiesce.
+ */
+}
+
+/* Completion callback for IOs issued from this bdev other than read/write.
+ * Reads and writes have their own callbacks, for readability.
+ */
+static void
+_complete_internal_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_RESET) {
+ struct crypto_bdev_io *orig_ctx = (struct crypto_bdev_io *)orig_io->driver_ctx;
+
+ assert(orig_io == orig_ctx->orig_io);
+
+ spdk_bdev_free_io(bdev_io);
+
+ spdk_for_each_channel(orig_ctx->crypto_bdev,
+ _ch_quiesce,
+ orig_ctx,
+ _ch_quiesce_done);
+ return;
+ }
+
+ spdk_bdev_io_complete(orig_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+/* Completion callback for writes that were issued from this bdev. */
+static void
+_complete_internal_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+ struct crypto_bdev_io *orig_ctx = (struct crypto_bdev_io *)orig_io->driver_ctx;
+
+ spdk_bdev_io_put_aux_buf(orig_io, orig_ctx->aux_buf_raw);
+
+ spdk_bdev_io_complete(orig_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+/* Completion callback for reads that were issued from this bdev. */
+static void
+_complete_internal_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ struct crypto_bdev_io *orig_ctx = (struct crypto_bdev_io *)orig_io->driver_ctx;
+
+ if (success) {
+
+ /* Save off this bdev_io so it can be freed after decryption. */
+ orig_ctx->read_io = bdev_io;
+
+ if (!_crypto_operation(orig_io, RTE_CRYPTO_CIPHER_OP_DECRYPT, NULL)) {
+ return;
+ } else {
+ SPDK_ERRLOG("ERROR decrypting\n");
+ }
+ } else {
+ SPDK_ERRLOG("ERROR on read prior to decrypting\n");
+ }
+
+ spdk_bdev_io_complete(orig_io, SPDK_BDEV_IO_STATUS_FAILED);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+vbdev_crypto_resubmit_io(void *arg)
+{
+ struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+
+ vbdev_crypto_submit_request(io_ctx->ch, bdev_io);
+}
+
+static void
+vbdev_crypto_queue_io(struct spdk_bdev_io *bdev_io)
+{
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ int rc;
+
+ io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
+ io_ctx->bdev_io_wait.cb_fn = vbdev_crypto_resubmit_io;
+ io_ctx->bdev_io_wait.cb_arg = bdev_io;
+
+ rc = spdk_bdev_queue_io_wait(bdev_io->bdev, io_ctx->crypto_ch->base_ch, &io_ctx->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vbdev_crypto_queue_io, rc=%d.\n", rc);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+/* Callback for getting a buf from the bdev pool in the event that the caller passed
+ * in NULL. We need to own the buffer so it doesn't get freed by another vbdev module
+ * beneath us before we're done with it.
+ */
+static void
+crypto_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto,
+ crypto_bdev);
+ struct crypto_io_channel *crypto_ch = spdk_io_channel_get_ctx(ch);
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ int rc;
+
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ rc = spdk_bdev_readv_blocks(crypto_bdev->base_desc, crypto_ch->base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks, _complete_internal_read,
+ bdev_io);
+ if (rc != 0) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "No memory, queue the IO.\n");
+ io_ctx->ch = ch;
+ vbdev_crypto_queue_io(bdev_io);
+ } else {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+/* For encryption we don't want to encrypt the data in place as the host isn't
+ * expecting us to mangle its data buffers, so we encrypt into the bdev
+ * aux buffer and then use that as the source for the disk data transfer.
+ */
+static void
+crypto_write_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ void *aux_buf)
+{
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ int rc = 0;
+
+ rc = _crypto_operation(bdev_io, RTE_CRYPTO_CIPHER_OP_ENCRYPT, aux_buf);
+ if (rc != 0) {
+ spdk_bdev_io_put_aux_buf(bdev_io, aux_buf);
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "No memory, queue the IO.\n");
+ io_ctx->ch = ch;
+ vbdev_crypto_queue_io(bdev_io);
+ } else {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+/* Called when someone submits IO to this crypto vbdev. For IOs not relevant to crypto,
+ * we simply pass them on via SPDK IO calls, which in turn allocate another bdev IO
+ * and call our completion callback provided below along with the original bdev_io so that
+ * we can complete it once this IO completes. For crypto operations, we either encrypt first
+ * (writes) and then call back into bdev to submit, or we submit a read and then catch it
+ * on the way back for decryption.
+ */
+static void
+vbdev_crypto_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto,
+ crypto_bdev);
+ struct crypto_io_channel *crypto_ch = spdk_io_channel_get_ctx(ch);
+ struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ int rc = 0;
+
+ memset(io_ctx, 0, sizeof(struct crypto_bdev_io));
+ io_ctx->crypto_bdev = crypto_bdev;
+ io_ctx->crypto_ch = crypto_ch;
+ io_ctx->orig_io = bdev_io;
+ io_ctx->bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, crypto_read_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ /* Tell the bdev layer that we need an aux buf in addition to the data
+ * buf already associated with the bdev.
+ */
+ spdk_bdev_io_get_aux_buf(bdev_io, crypto_write_get_buf_cb);
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ rc = spdk_bdev_unmap_blocks(crypto_bdev->base_desc, crypto_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _complete_internal_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ rc = spdk_bdev_flush_blocks(crypto_bdev->base_desc, crypto_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _complete_internal_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ rc = spdk_bdev_reset(crypto_bdev->base_desc, crypto_ch->base_ch,
+ _complete_internal_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ default:
+ SPDK_ERRLOG("crypto: unknown I/O type %d\n", bdev_io->type);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ if (rc != 0) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "No memory, queue the IO.\n");
+ io_ctx->ch = ch;
+ vbdev_crypto_queue_io(bdev_io);
+ } else {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+/* We'll just call the base bdev and let it answer, except for the write-zeroes
+ * command, which we always report as unsupported so that the bdev layer will send us
+ * real writes that we can encrypt.
+ */
+static bool
+vbdev_crypto_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx;
+
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ return spdk_bdev_io_type_supported(crypto_bdev->base_bdev, io_type);
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ /* Force the bdev layer to issue actual writes of zeroes so we can
+ * encrypt them as regular writes.
+ */
+ default:
+ return false;
+ }
+}
+
+/* Callback for unregistering the IO device. */
+static void
+_device_unregister_cb(void *io_device)
+{
+ struct vbdev_crypto *crypto_bdev = io_device;
+
+ /* Done with this crypto_bdev. */
+ rte_cryptodev_sym_session_free(crypto_bdev->session_decrypt);
+ rte_cryptodev_sym_session_free(crypto_bdev->session_encrypt);
+ free(crypto_bdev->drv_name);
+ if (crypto_bdev->key) {
+ memset(crypto_bdev->key, 0, strnlen(crypto_bdev->key, (AES_CBC_KEY_LENGTH + 1)));
+ free(crypto_bdev->key);
+ }
+ if (crypto_bdev->key2) {
+ memset(crypto_bdev->key2, 0, strnlen(crypto_bdev->key2, (AES_XTS_KEY_LENGTH + 1)));
+ free(crypto_bdev->key2);
+ }
+ if (crypto_bdev->xts_key) {
+ memset(crypto_bdev->xts_key, 0, strnlen(crypto_bdev->xts_key, (AES_XTS_KEY_LENGTH * 2) + 1));
+ free(crypto_bdev->xts_key);
+ }
+ free(crypto_bdev->crypto_bdev.name);
+ free(crypto_bdev);
+}
+
+/* Wrapper for the bdev close operation. */
+static void
+_vbdev_crypto_destruct(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+/* Called after we've unregistered following a hot remove callback.
+ * Our finish entry point will be called next.
+ */
+static int
+vbdev_crypto_destruct(void *ctx)
+{
+ struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx;
+
+ /* Remove this device from the internal list */
+ TAILQ_REMOVE(&g_vbdev_crypto, crypto_bdev, link);
+
+ /* Unclaim the underlying bdev. */
+ spdk_bdev_module_release_bdev(crypto_bdev->base_bdev);
+
+ /* Close the underlying bdev on its same opened thread. */
+ if (crypto_bdev->thread && crypto_bdev->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(crypto_bdev->thread, _vbdev_crypto_destruct, crypto_bdev->base_desc);
+ } else {
+ spdk_bdev_close(crypto_bdev->base_desc);
+ }
+
+ /* Unregister the io_device. */
+ spdk_io_device_unregister(crypto_bdev, _device_unregister_cb);
+
+ g_number_of_claimed_volumes--;
+
+ return 0;
+}
+
+/* We supplied this as an entry point for upper layers that want to communicate with this
+ * bdev. This is how they get a channel. We are passed the same context we provided when
+ * we created our crypto vbdev in examine() which, for this bdev, is the address of one of
+ * our context nodes. From here we'll ask the SPDK channel code to fill out our channel
+ * struct and we'll keep it in our crypto node.
+ */
+static struct spdk_io_channel *
+vbdev_crypto_get_io_channel(void *ctx)
+{
+ struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx;
+
+ /* The IO channel code will allocate a channel for us which consists of
+ * the SPDK channel structure plus the size of our crypto_io_channel struct
+ * that we passed in when we registered our IO device. It will then call
+ * our channel create callback to populate any elements that we need to
+ * update.
+ */
+ return spdk_get_io_channel(crypto_bdev);
+}
+
+/* This is the output for bdev_get_bdevs() for this vbdev */
+static int
+vbdev_crypto_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx;
+
+ spdk_json_write_name(w, "crypto");
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(crypto_bdev->base_bdev));
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&crypto_bdev->crypto_bdev));
+ spdk_json_write_named_string(w, "crypto_pmd", crypto_bdev->drv_name);
+ spdk_json_write_named_string(w, "key", crypto_bdev->key);
+ if (strcmp(crypto_bdev->cipher, AES_XTS) == 0) {
+		spdk_json_write_named_string(w, "key2", crypto_bdev->key2);
+ }
+ spdk_json_write_named_string(w, "cipher", crypto_bdev->cipher);
+ spdk_json_write_object_end(w);
+ return 0;
+}
+
+static int
+vbdev_crypto_config_json(struct spdk_json_write_ctx *w)
+{
+ struct vbdev_crypto *crypto_bdev;
+
+ TAILQ_FOREACH(crypto_bdev, &g_vbdev_crypto, link) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_crypto_create");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(crypto_bdev->base_bdev));
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&crypto_bdev->crypto_bdev));
+ spdk_json_write_named_string(w, "crypto_pmd", crypto_bdev->drv_name);
+ spdk_json_write_named_string(w, "key", crypto_bdev->key);
+ if (strcmp(crypto_bdev->cipher, AES_XTS) == 0) {
+ spdk_json_write_named_string(w, "key2", crypto_bdev->key2);
+ }
+ spdk_json_write_named_string(w, "cipher", crypto_bdev->cipher);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ }
+ return 0;
+}
+
+/* Helper function for the channel creation callback. */
+static void
+_assign_device_qp(struct vbdev_crypto *crypto_bdev, struct device_qp *device_qp,
+ struct crypto_io_channel *crypto_ch)
+{
+ pthread_mutex_lock(&g_device_qp_lock);
+ if (strcmp(crypto_bdev->drv_name, QAT) == 0) {
+ /* For some QAT devices, the optimal qp to use is every 32nd as this spreads the
+ * workload out over the multiple virtual functions in the device. For the devices
+ * where this isn't the case, it doesn't hurt.
+ */
+ TAILQ_FOREACH(device_qp, &g_device_qp_qat, link) {
+ if (device_qp->index != g_next_qat_index) {
+ continue;
+ }
+ if (device_qp->in_use == false) {
+ crypto_ch->device_qp = device_qp;
+ device_qp->in_use = true;
+ g_next_qat_index = (g_next_qat_index + QAT_VF_SPREAD) % g_qat_total_qp;
+ break;
+ } else {
+ /* if the preferred index is used, skip to the next one in this set. */
+ g_next_qat_index = (g_next_qat_index + 1) % g_qat_total_qp;
+ }
+ }
+ } else if (strcmp(crypto_bdev->drv_name, AESNI_MB) == 0) {
+ TAILQ_FOREACH(device_qp, &g_device_qp_aesni_mb, link) {
+ if (device_qp->in_use == false) {
+ crypto_ch->device_qp = device_qp;
+ device_qp->in_use = true;
+ break;
+ }
+ }
+ }
+ pthread_mutex_unlock(&g_device_qp_lock);
+}
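+
+/* Illustrative walk-through, assuming QAT_VF_SPREAD is 32 and a QAT device set
+ * exposing 96 qps in index order: successive channels receive qp indexes
+ * 0, 32, 64; the next attempt wraps to 0, finds it in use, advances one at a
+ * time, and lands on 1, then 33, 65, and so on across the virtual functions.
+ */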
+
+/* We provide this callback for the SPDK channel code to create a channel using
+ * the channel struct we provided in our module get_io_channel() entry point. Here
+ * we get and save off an underlying base channel of the device below us so that
+ * we can communicate with the base bdev on a per channel basis. We also register the
+ * poller used to complete crypto operations from the device.
+ */
+static int
+crypto_bdev_ch_create_cb(void *io_device, void *ctx_buf)
+{
+ struct crypto_io_channel *crypto_ch = ctx_buf;
+ struct vbdev_crypto *crypto_bdev = io_device;
+ struct device_qp *device_qp = NULL;
+
+ crypto_ch->base_ch = spdk_bdev_get_io_channel(crypto_bdev->base_desc);
+ crypto_ch->poller = SPDK_POLLER_REGISTER(crypto_dev_poller, crypto_ch, 0);
+ crypto_ch->device_qp = NULL;
+
+ /* Assign a device/qp combination that is unique per channel per PMD. */
+ _assign_device_qp(crypto_bdev, device_qp, crypto_ch);
+ assert(crypto_ch->device_qp);
+
+ /* We use this queue to track outstanding IO in our layer. */
+ TAILQ_INIT(&crypto_ch->pending_cry_ios);
+
+ /* We use this to queue up crypto ops when the device is busy. */
+ TAILQ_INIT(&crypto_ch->queued_cry_ops);
+
+ return 0;
+}
+
+/* We provide this callback for the SPDK channel code to destroy a channel
+ * created with our create callback. We just need to undo anything we did
+ * when we created.
+ */
+static void
+crypto_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct crypto_io_channel *crypto_ch = ctx_buf;
+
+ pthread_mutex_lock(&g_device_qp_lock);
+ crypto_ch->device_qp->in_use = false;
+ pthread_mutex_unlock(&g_device_qp_lock);
+
+ spdk_poller_unregister(&crypto_ch->poller);
+ spdk_put_io_channel(crypto_ch->base_ch);
+}
+
+/* Create the association from the bdev and vbdev name and insert
+ * on the global list. */
+static int
+vbdev_crypto_insert_name(const char *bdev_name, const char *vbdev_name,
+ const char *crypto_pmd, const char *key,
+ const char *cipher, const char *key2)
+{
+ struct bdev_names *name;
+ int rc, j;
+ bool found = false;
+
+ TAILQ_FOREACH(name, &g_bdev_names, link) {
+ if (strcmp(vbdev_name, name->vbdev_name) == 0) {
+ SPDK_ERRLOG("crypto bdev %s already exists\n", vbdev_name);
+ return -EEXIST;
+ }
+ }
+
+ name = calloc(1, sizeof(struct bdev_names));
+ if (!name) {
+ SPDK_ERRLOG("could not allocate bdev_names\n");
+ return -ENOMEM;
+ }
+
+ name->bdev_name = strdup(bdev_name);
+ if (!name->bdev_name) {
+ SPDK_ERRLOG("could not allocate name->bdev_name\n");
+ rc = -ENOMEM;
+ goto error_alloc_bname;
+ }
+
+ name->vbdev_name = strdup(vbdev_name);
+ if (!name->vbdev_name) {
+ SPDK_ERRLOG("could not allocate name->vbdev_name\n");
+ rc = -ENOMEM;
+ goto error_alloc_vname;
+ }
+
+ name->drv_name = strdup(crypto_pmd);
+ if (!name->drv_name) {
+ SPDK_ERRLOG("could not allocate name->drv_name\n");
+ rc = -ENOMEM;
+ goto error_alloc_dname;
+ }
+ for (j = 0; j < MAX_NUM_DRV_TYPES ; j++) {
+ if (strcmp(crypto_pmd, g_driver_names[j]) == 0) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ SPDK_ERRLOG("invalid crypto PMD type %s\n", crypto_pmd);
+ rc = -EINVAL;
+ goto error_invalid_pmd;
+ }
+
+ name->key = strdup(key);
+ if (!name->key) {
+ SPDK_ERRLOG("could not allocate name->key\n");
+ rc = -ENOMEM;
+ goto error_alloc_key;
+ }
+ if (strnlen(name->key, (AES_CBC_KEY_LENGTH + 1)) != AES_CBC_KEY_LENGTH) {
+ SPDK_ERRLOG("invalid AES_CBC key length\n");
+ rc = -EINVAL;
+ goto error_invalid_key;
+ }
+
+ if (strncmp(cipher, AES_XTS, sizeof(AES_XTS)) == 0) {
+ /* To please scan-build, input validation makes sure we can't
+ * have this cipher without providing a key2.
+ */
+ name->cipher = AES_XTS;
+ assert(key2);
+ if (strnlen(key2, (AES_XTS_KEY_LENGTH + 1)) != AES_XTS_KEY_LENGTH) {
+ SPDK_ERRLOG("invalid AES_XTS key length\n");
+ rc = -EINVAL;
+ goto error_invalid_key2;
+ }
+
+ name->key2 = strdup(key2);
+ if (!name->key2) {
+ SPDK_ERRLOG("could not allocate name->key2\n");
+ rc = -ENOMEM;
+ goto error_alloc_key2;
+ }
+ } else if (strncmp(cipher, AES_CBC, sizeof(AES_CBC)) == 0) {
+ name->cipher = AES_CBC;
+ } else {
+ SPDK_ERRLOG("Invalid cipher: %s\n", cipher);
+ rc = -EINVAL;
+ goto error_cipher;
+ }
+
+ TAILQ_INSERT_TAIL(&g_bdev_names, name, link);
+
+ return 0;
+
+ /* Error cleanup paths. */
+error_cipher:
+ free(name->key2);
+error_alloc_key2:
+error_invalid_key2:
+error_invalid_key:
+ free(name->key);
+error_alloc_key:
+error_invalid_pmd:
+ free(name->drv_name);
+error_alloc_dname:
+ free(name->vbdev_name);
+error_alloc_vname:
+ free(name->bdev_name);
+error_alloc_bname:
+ free(name);
+ return rc;
+}
+
+/* RPC entry point for crypto creation. */
+int
+create_crypto_disk(const char *bdev_name, const char *vbdev_name,
+ const char *crypto_pmd, const char *key,
+ const char *cipher, const char *key2)
+{
+ struct spdk_bdev *bdev = NULL;
+ int rc = 0;
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+
+ rc = vbdev_crypto_insert_name(bdev_name, vbdev_name, crypto_pmd, key, cipher, key2);
+ if (rc) {
+ return rc;
+ }
+
+ if (!bdev) {
+ SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n");
+ return 0;
+ }
+
+ return vbdev_crypto_claim(bdev);
+}
+
+/* Called at driver init time, parses config file to prepare for examine calls,
+ * also fully initializes the crypto drivers.
+ */
+static int
+vbdev_crypto_init(void)
+{
+ struct spdk_conf_section *sp = NULL;
+ const char *conf_bdev_name = NULL;
+ const char *conf_vbdev_name = NULL;
+ const char *crypto_pmd = NULL;
+ int i;
+ int rc = 0;
+ const char *key = NULL;
+ const char *cipher = NULL;
+ const char *key2 = NULL;
+
+ /* Fully configure both SW and HW drivers. */
+ rc = vbdev_crypto_init_crypto_drivers();
+ if (rc) {
+ SPDK_ERRLOG("Error setting up crypto devices\n");
+ return rc;
+ }
+
+ sp = spdk_conf_find_section(NULL, "crypto");
+ if (sp == NULL) {
+ return 0;
+ }
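+
+ /* Each "CRY" line is parsed positionally below; a sketch of the accepted
+ * form, with illustrative names and key (the trailing cipher and key2
+ * columns are the optional values read last):
+ *
+ *   [crypto]
+ *     CRY Malloc0 crypto_vbdev0 0123456789123456 crypto_aesni_mb
+ */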
+
+ for (i = 0; ; i++) {
+
+ if (!spdk_conf_section_get_nval(sp, "CRY", i)) {
+ break;
+ }
+
+ conf_bdev_name = spdk_conf_section_get_nmval(sp, "CRY", i, 0);
+ if (!conf_bdev_name) {
+ SPDK_ERRLOG("crypto configuration missing bdev name\n");
+ return -EINVAL;
+ }
+
+ conf_vbdev_name = spdk_conf_section_get_nmval(sp, "CRY", i, 1);
+ if (!conf_vbdev_name) {
+ SPDK_ERRLOG("crypto configuration missing crypto_bdev name\n");
+ return -EINVAL;
+ }
+
+ key = spdk_conf_section_get_nmval(sp, "CRY", i, 2);
+ if (!key) {
+ SPDK_ERRLOG("crypto configuration missing crypto_bdev key\n");
+ return -EINVAL;
+ }
+ SPDK_NOTICELOG("WARNING: You are storing your key in a plain text file!!\n");
+
+ crypto_pmd = spdk_conf_section_get_nmval(sp, "CRY", i, 3);
+ if (!crypto_pmd) {
+ SPDK_ERRLOG("crypto configuration missing driver type\n");
+ return -EINVAL;
+ }
+
+ /* These are optional. */
+ cipher = spdk_conf_section_get_nmval(sp, "CRY", i, 4);
+ if (cipher == NULL) {
+ cipher = AES_CBC;
+ }
+ key2 = spdk_conf_section_get_nmval(sp, "CRY", i, 5);
+
+ /* Note: config file options do not support QAT AES_XTS, use RPC */
+ rc = vbdev_crypto_insert_name(conf_bdev_name, conf_vbdev_name,
+ crypto_pmd, key, cipher, key2);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return rc;
+}
+
+/* Called when the entire module is being torn down. */
+static void
+vbdev_crypto_finish(void)
+{
+ struct bdev_names *name;
+ struct vbdev_dev *device;
+ struct device_qp *dev_qp;
+ unsigned i;
+ int rc;
+
+ while ((name = TAILQ_FIRST(&g_bdev_names))) {
+ TAILQ_REMOVE(&g_bdev_names, name, link);
+ free(name->drv_name);
+ free(name->key);
+ free(name->bdev_name);
+ free(name->vbdev_name);
+ free(name->key2);
+ free(name);
+ }
+
+ while ((device = TAILQ_FIRST(&g_vbdev_devs))) {
+ struct rte_cryptodev *rte_dev;
+
+ TAILQ_REMOVE(&g_vbdev_devs, device, link);
+ rte_cryptodev_stop(device->cdev_id);
+
+ assert(device->cdev_id < RTE_CRYPTO_MAX_DEVS);
+ rte_dev = &rte_cryptodevs[device->cdev_id];
+
+ if (rte_dev->dev_ops->queue_pair_release != NULL) {
+ for (i = 0; i < device->cdev_info.max_nb_queue_pairs; i++) {
+ rte_dev->dev_ops->queue_pair_release(rte_dev, i);
+ }
+ }
+ free(device);
+ }
+ rc = rte_vdev_uninit(AESNI_MB);
+ if (rc) {
+ SPDK_ERRLOG("%d from rte_vdev_uninit\n", rc);
+ }
+
+ while ((dev_qp = TAILQ_FIRST(&g_device_qp_qat))) {
+ TAILQ_REMOVE(&g_device_qp_qat, dev_qp, link);
+ free(dev_qp);
+ }
+
+ while ((dev_qp = TAILQ_FIRST(&g_device_qp_aesni_mb))) {
+ TAILQ_REMOVE(&g_device_qp_aesni_mb, dev_qp, link);
+ free(dev_qp);
+ }
+
+ rte_mempool_free(g_crypto_op_mp);
+ spdk_mempool_free(g_mbuf_mp);
+ rte_mempool_free(g_session_mp);
+ if (g_session_mp_priv != NULL) {
+ rte_mempool_free(g_session_mp_priv);
+ }
+}
+
+/* During init we'll be asked how much memory we'd like passed to us
+ * in bdev_io structures as context. Here's where we specify how
+ * much context we want per IO.
+ */
+static int
+vbdev_crypto_get_ctx_size(void)
+{
+ return sizeof(struct crypto_bdev_io);
+}
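+
+/* Illustration: every spdk_bdev_io routed to this module then carries a
+ * driver_ctx area of this size, so per-I/O state needs no extra allocation:
+ *
+ *   struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx;
+ */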
+
+/* Called when SPDK wants to save the current config of this vbdev module to
+ * a file.
+ */
+static void
+vbdev_crypto_get_spdk_running_config(FILE *fp)
+{
+ struct bdev_names *names = NULL;
+ fprintf(fp, "\n[crypto]\n");
+ TAILQ_FOREACH(names, &g_bdev_names, link) {
+ fprintf(fp, " crypto %s %s ", names->bdev_name, names->vbdev_name);
+ fprintf(fp, "\n");
+ }
+
+ fprintf(fp, "\n");
+}
+
+/* Called when the underlying base bdev goes away. */
+static void
+vbdev_crypto_examine_hotremove_cb(void *ctx)
+{
+ struct vbdev_crypto *crypto_bdev, *tmp;
+ struct spdk_bdev *bdev_find = ctx;
+
+ TAILQ_FOREACH_SAFE(crypto_bdev, &g_vbdev_crypto, link, tmp) {
+ if (bdev_find == crypto_bdev->base_bdev) {
+ spdk_bdev_unregister(&crypto_bdev->crypto_bdev, NULL, NULL);
+ }
+ }
+}
+
+static void
+vbdev_crypto_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ /* No config per bdev needed */
+}
+
+/* When we register our bdev this is how we specify our entry points. */
+static const struct spdk_bdev_fn_table vbdev_crypto_fn_table = {
+ .destruct = vbdev_crypto_destruct,
+ .submit_request = vbdev_crypto_submit_request,
+ .io_type_supported = vbdev_crypto_io_type_supported,
+ .get_io_channel = vbdev_crypto_get_io_channel,
+ .dump_info_json = vbdev_crypto_dump_info_json,
+ .write_config_json = vbdev_crypto_write_config_json
+};
+
+static struct spdk_bdev_module crypto_if = {
+ .name = "crypto",
+ .module_init = vbdev_crypto_init,
+ .config_text = vbdev_crypto_get_spdk_running_config,
+ .get_ctx_size = vbdev_crypto_get_ctx_size,
+ .examine_config = vbdev_crypto_examine,
+ .module_fini = vbdev_crypto_finish,
+ .config_json = vbdev_crypto_config_json
+};
+
+SPDK_BDEV_MODULE_REGISTER(crypto, &crypto_if)
+
+static int
+vbdev_crypto_claim(struct spdk_bdev *bdev)
+{
+ struct bdev_names *name;
+ struct vbdev_crypto *vbdev;
+ struct vbdev_dev *device;
+ bool found = false;
+ int rc = 0;
+
+ if (g_number_of_claimed_volumes >= MAX_CRYPTO_VOLUMES) {
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "Reached max number of claimed volumes\n");
+ /* Return directly: the error_vbdev_alloc path decrements a counter that
+ * has not been incremented yet at this point. */
+ return -EINVAL;
+ }
+ g_number_of_claimed_volumes++;
+
+ /* Check our list of names from config versus this bdev and if
+ * there's a match, create the crypto_bdev & bdev accordingly.
+ */
+ TAILQ_FOREACH(name, &g_bdev_names, link) {
+ if (strcmp(name->bdev_name, bdev->name) != 0) {
+ continue;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "Match on %s\n", bdev->name);
+
+ vbdev = calloc(1, sizeof(struct vbdev_crypto));
+ if (!vbdev) {
+ SPDK_ERRLOG("could not allocate crypto_bdev\n");
+ rc = -ENOMEM;
+ goto error_vbdev_alloc;
+ }
+
+ /* The base bdev that we're attaching to. */
+ vbdev->base_bdev = bdev;
+ vbdev->crypto_bdev.name = strdup(name->vbdev_name);
+ if (!vbdev->crypto_bdev.name) {
+ SPDK_ERRLOG("could not allocate crypto_bdev name\n");
+ rc = -ENOMEM;
+ goto error_bdev_name;
+ }
+
+ vbdev->key = strdup(name->key);
+ if (!vbdev->key) {
+ SPDK_ERRLOG("could not allocate crypto_bdev key\n");
+ rc = -ENOMEM;
+ goto error_alloc_key;
+ }
+
+ if (name->key2) {
+ vbdev->key2 = strdup(name->key2);
+ if (!vbdev->key2) {
+ SPDK_ERRLOG("could not allocate crypto_bdev key2\n");
+ rc = -ENOMEM;
+ goto error_alloc_key2;
+ }
+ }
+
+ vbdev->drv_name = strdup(name->drv_name);
+ if (!vbdev->drv_name) {
+ SPDK_ERRLOG("could not allocate crypto_bdev drv_name\n");
+ rc = -ENOMEM;
+ goto error_drv_name;
+ }
+
+ vbdev->crypto_bdev.product_name = "crypto";
+ vbdev->crypto_bdev.write_cache = bdev->write_cache;
+ vbdev->cipher = AES_CBC;
+ if (strcmp(vbdev->drv_name, QAT) == 0) {
+ vbdev->crypto_bdev.required_alignment =
+ spdk_max(spdk_u32log2(bdev->blocklen), bdev->required_alignment);
+ SPDK_NOTICELOG("QAT in use: Required alignment set to %u\n",
+ vbdev->crypto_bdev.required_alignment);
+ if (strcmp(name->cipher, AES_CBC) == 0) {
+ SPDK_NOTICELOG("QAT using cipher: AES_CBC\n");
+ } else {
+ SPDK_NOTICELOG("QAT using cipher: AES_XTS\n");
+ vbdev->cipher = AES_XTS;
+ /* DPDK expects the keys to be concatenated together. */
+ vbdev->xts_key = calloc(1, (AES_XTS_KEY_LENGTH * 2) + 1);
+ if (vbdev->xts_key == NULL) {
+ SPDK_ERRLOG("could not allocate memory for XTS key\n");
+ rc = -ENOMEM;
+ goto error_xts_key;
+ }
+ memcpy(vbdev->xts_key, vbdev->key, AES_XTS_KEY_LENGTH);
+ assert(name->key2);
+ memcpy(vbdev->xts_key + AES_XTS_KEY_LENGTH, name->key2, AES_XTS_KEY_LENGTH + 1);
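+ /* Sketch: with two 16-byte keys k1 and k2 (assuming AES_XTS_KEY_LENGTH
+ * is 16), xts_key now holds k1||k2, the 2 * AES_XTS_KEY_LENGTH byte
+ * value programmed into the cipher session further below. */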
+ }
+ } else {
+ vbdev->crypto_bdev.required_alignment = bdev->required_alignment;
+ }
+ /* Note: CRYPTO_MAX_IO is in units of bytes, optimal_io_boundary is
+ * in units of blocks.
+ */
+ if (bdev->optimal_io_boundary > 0) {
+ vbdev->crypto_bdev.optimal_io_boundary =
+ spdk_min((CRYPTO_MAX_IO / bdev->blocklen), bdev->optimal_io_boundary);
+ } else {
+ vbdev->crypto_bdev.optimal_io_boundary = (CRYPTO_MAX_IO / bdev->blocklen);
+ }
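+ /* Worked example, assuming CRYPTO_MAX_IO is 64 KiB: with 512-byte blocks
+ * the boundary computed above is at most 128 blocks, so no split I/O
+ * exceeds what a single crypto operation can carry. */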
+ vbdev->crypto_bdev.split_on_optimal_io_boundary = true;
+ vbdev->crypto_bdev.blocklen = bdev->blocklen;
+ vbdev->crypto_bdev.blockcnt = bdev->blockcnt;
+
+ /* This is the context that is passed to us when the bdev
+ * layer calls in so we'll save our crypto_bdev node here.
+ */
+ vbdev->crypto_bdev.ctxt = vbdev;
+ vbdev->crypto_bdev.fn_table = &vbdev_crypto_fn_table;
+ vbdev->crypto_bdev.module = &crypto_if;
+ TAILQ_INSERT_TAIL(&g_vbdev_crypto, vbdev, link);
+
+ spdk_io_device_register(vbdev, crypto_bdev_ch_create_cb, crypto_bdev_ch_destroy_cb,
+ sizeof(struct crypto_io_channel), vbdev->crypto_bdev.name);
+
+ rc = spdk_bdev_open(bdev, true, vbdev_crypto_examine_hotremove_cb,
+ bdev, &vbdev->base_desc);
+ if (rc) {
+ SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev));
+ goto error_open;
+ }
+
+ /* Save the thread where the base device is opened */
+ vbdev->thread = spdk_get_thread();
+
+ rc = spdk_bdev_module_claim_bdev(bdev, vbdev->base_desc, vbdev->crypto_bdev.module);
+ if (rc) {
+ SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(bdev));
+ goto error_claim;
+ }
+
+ /* To init the session we have to get the cryptoDev device ID for this vbdev */
+ TAILQ_FOREACH(device, &g_vbdev_devs, link) {
+ if (strcmp(device->cdev_info.driver_name, vbdev->drv_name) == 0) {
+ found = true;
+ break;
+ }
+ }
+ if (found == false) {
+ SPDK_ERRLOG("ERROR can't match crypto device driver to crypto vbdev!\n");
+ rc = -EINVAL;
+ goto error_cant_find_devid;
+ }
+
+ /* Get sessions. */
+ vbdev->session_encrypt = rte_cryptodev_sym_session_create(g_session_mp);
+ if (NULL == vbdev->session_encrypt) {
+ SPDK_ERRLOG("ERROR trying to create crypto session!\n");
+ rc = -EINVAL;
+ goto error_session_en_create;
+ }
+
+ vbdev->session_decrypt = rte_cryptodev_sym_session_create(g_session_mp);
+ if (NULL == vbdev->session_decrypt) {
+ SPDK_ERRLOG("ERROR trying to create crypto session!\n");
+ rc = -EINVAL;
+ goto error_session_de_create;
+ }
+
+ /* Init our per vbdev xform with the desired cipher options. */
+ vbdev->cipher_xform.type = RTE_CRYPTO_SYM_XFORM_CIPHER;
+ vbdev->cipher_xform.cipher.iv.offset = IV_OFFSET;
+ if (strcmp(name->cipher, AES_CBC) == 0) {
+ vbdev->cipher_xform.cipher.key.data = vbdev->key;
+ vbdev->cipher_xform.cipher.algo = RTE_CRYPTO_CIPHER_AES_CBC;
+ vbdev->cipher_xform.cipher.key.length = AES_CBC_KEY_LENGTH;
+ } else {
+ vbdev->cipher_xform.cipher.key.data = vbdev->xts_key;
+ vbdev->cipher_xform.cipher.algo = RTE_CRYPTO_CIPHER_AES_XTS;
+ vbdev->cipher_xform.cipher.key.length = AES_XTS_KEY_LENGTH * 2;
+ }
+ vbdev->cipher_xform.cipher.iv.length = AES_CBC_IV_LENGTH;
+
+ vbdev->cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_ENCRYPT;
+ rc = rte_cryptodev_sym_session_init(device->cdev_id, vbdev->session_encrypt,
+ &vbdev->cipher_xform,
+ g_session_mp_priv ? g_session_mp_priv : g_session_mp);
+ if (rc < 0) {
+ SPDK_ERRLOG("ERROR trying to init encrypt session!\n");
+ rc = -EINVAL;
+ goto error_session_init;
+ }
+
+ vbdev->cipher_xform.cipher.op = RTE_CRYPTO_CIPHER_OP_DECRYPT;
+ rc = rte_cryptodev_sym_session_init(device->cdev_id, vbdev->session_decrypt,
+ &vbdev->cipher_xform,
+ g_session_mp_priv ? g_session_mp_priv : g_session_mp);
+ if (rc < 0) {
+ SPDK_ERRLOG("ERROR trying to init decrypt session!\n");
+ rc = -EINVAL;
+ goto error_session_init;
+ }
+
+ rc = spdk_bdev_register(&vbdev->crypto_bdev);
+ if (rc < 0) {
+ SPDK_ERRLOG("ERROR trying to register bdev\n");
+ rc = -EINVAL;
+ goto error_bdev_register;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_CRYPTO, "registered io_device and virtual bdev for: %s\n",
+ name->vbdev_name);
+ break;
+ }
+
+ return rc;
+
+ /* Error cleanup paths. */
+error_bdev_register:
+error_session_init:
+ rte_cryptodev_sym_session_free(vbdev->session_decrypt);
+error_session_de_create:
+ rte_cryptodev_sym_session_free(vbdev->session_encrypt);
+error_session_en_create:
+error_cant_find_devid:
+ /* Undo the module claim taken earlier before closing the descriptor. */
+ spdk_bdev_module_release_bdev(vbdev->base_bdev);
+error_claim:
+ spdk_bdev_close(vbdev->base_desc);
+error_open:
+ TAILQ_REMOVE(&g_vbdev_crypto, vbdev, link);
+ spdk_io_device_unregister(vbdev, NULL);
+ free(vbdev->xts_key);
+error_xts_key:
+ free(vbdev->drv_name);
+error_drv_name:
+ free(vbdev->key2);
+error_alloc_key2:
+ free(vbdev->key);
+error_alloc_key:
+ free(vbdev->crypto_bdev.name);
+error_bdev_name:
+ free(vbdev);
+error_vbdev_alloc:
+ g_number_of_claimed_volumes--;
+ return rc;
+}
+
+/* RPC entry for deleting a crypto vbdev. */
+void
+delete_crypto_disk(struct spdk_bdev *bdev, spdk_delete_crypto_complete cb_fn,
+ void *cb_arg)
+{
+ struct bdev_names *name;
+
+ if (!bdev || bdev->module != &crypto_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ /* Remove the association (vbdev, bdev) from g_bdev_names. This is required so that the
+ * vbdev does not get re-created if the same bdev is constructed at some other time,
+ * unless the underlying bdev was hot-removed.
+ */
+ TAILQ_FOREACH(name, &g_bdev_names, link) {
+ if (strcmp(name->vbdev_name, bdev->name) == 0) {
+ TAILQ_REMOVE(&g_bdev_names, name, link);
+ free(name->bdev_name);
+ free(name->vbdev_name);
+ free(name->drv_name);
+ free(name->key);
+ free(name->key2);
+ free(name);
+ break;
+ }
+ }
+
+ /* Additional cleanup happens in the destruct callback. */
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
+/* Because we registered this function as our module's examine_config callback,
+ * we'll get this call anytime a new bdev shows up. Here we need to decide if we
+ * care about it and if so what to do. We parsed the config file at init so we
+ * check the new bdev against the list we built up at that time and if the user
+ * configured us to attach to this bdev, here's where we do it.
+ */
+static void
+vbdev_crypto_examine(struct spdk_bdev *bdev)
+{
+ vbdev_crypto_claim(bdev);
+ spdk_bdev_module_examine_done(&crypto_if);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_crypto", SPDK_LOG_CRYPTO)
diff --git a/src/spdk/module/bdev/crypto/vbdev_crypto.h b/src/spdk/module/bdev/crypto/vbdev_crypto.h
new file mode 100644
index 000000000..458b29c6b
--- /dev/null
+++ b/src/spdk/module/bdev/crypto/vbdev_crypto.h
@@ -0,0 +1,78 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_CRYPTO_H
+#define SPDK_VBDEV_CRYPTO_H
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+#include "spdk/bdev.h"
+
+#define AESNI_MB "crypto_aesni_mb"
+#define QAT "crypto_qat"
+
+/* Supported ciphers */
+#define AES_CBC "AES_CBC" /* QAT and AESNI_MB */
+#define AES_XTS "AES_XTS" /* QAT only */
+
+typedef void (*spdk_delete_crypto_complete)(void *cb_arg, int bdeverrno);
+
+/**
+ * Create new crypto bdev.
+ *
+ * \param bdev_name Name of the bdev on which the crypto vbdev will be created.
+ * \param vbdev_name Name of the new crypto vbdev.
+ * \param crypto_pmd Name of the polled mode driver to use for this vbdev.
+ * \param key The key to use for this vbdev.
+ * \param cipher The cipher to use for this vbdev.
+ * \param key2 The 2nd key to use for the AES_XTS cipher.
+ * \return 0 on success, other on failure.
+ */
+int create_crypto_disk(const char *bdev_name, const char *vbdev_name,
+ const char *crypto_pmd, const char *key,
+ const char *cipher, const char *key2);
+
+/**
+ * Delete crypto bdev.
+ *
+ * \param bdev Pointer to crypto bdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void delete_crypto_disk(struct spdk_bdev *bdev, spdk_delete_crypto_complete cb_fn,
+ void *cb_arg);
+
+#endif /* SPDK_VBDEV_CRYPTO_H */
diff --git a/src/spdk/module/bdev/crypto/vbdev_crypto_rpc.c b/src/spdk/module/bdev/crypto/vbdev_crypto_rpc.c
new file mode 100644
index 000000000..46c1e210d
--- /dev/null
+++ b/src/spdk/module/bdev/crypto/vbdev_crypto_rpc.c
@@ -0,0 +1,195 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vbdev_crypto.h"
+
+/* Structure to hold the parameters for this RPC method. */
+struct rpc_construct_crypto {
+ char *base_bdev_name;
+ char *name;
+ char *crypto_pmd;
+ char *key;
+ char *cipher;
+ char *key2;
+};
+
+/* Free the allocated memory resource after the RPC handling. */
+static void
+free_rpc_construct_crypto(struct rpc_construct_crypto *r)
+{
+ free(r->base_bdev_name);
+ free(r->name);
+ free(r->crypto_pmd);
+ free(r->key);
+ free(r->cipher);
+ free(r->key2);
+}
+
+/* Structure to decode the input parameters for this RPC method. */
+static const struct spdk_json_object_decoder rpc_construct_crypto_decoders[] = {
+ {"base_bdev_name", offsetof(struct rpc_construct_crypto, base_bdev_name), spdk_json_decode_string},
+ {"name", offsetof(struct rpc_construct_crypto, name), spdk_json_decode_string},
+ {"crypto_pmd", offsetof(struct rpc_construct_crypto, crypto_pmd), spdk_json_decode_string},
+ {"key", offsetof(struct rpc_construct_crypto, key), spdk_json_decode_string},
+ {"cipher", offsetof(struct rpc_construct_crypto, cipher), spdk_json_decode_string, true},
+ {"key2", offsetof(struct rpc_construct_crypto, key2), spdk_json_decode_string, true},
+};
+
+/* Decode the parameters for this RPC method and construct the crypto
+ * vbdev. An error response is sent on failure.
+ */
+static void
+rpc_bdev_crypto_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_crypto req = {NULL};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_crypto_decoders,
+ SPDK_COUNTOF(rpc_construct_crypto_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto cleanup;
+ }
+
+ if (req.cipher == NULL) {
+ req.cipher = strdup(AES_CBC);
+ if (req.cipher == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to allocate memory for req.cipher");
+ goto cleanup;
+ }
+ }
+
+ if (strcmp(req.cipher, AES_XTS) != 0 && strcmp(req.cipher, AES_CBC) != 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid cipher: %s",
+ req.cipher);
+ goto cleanup;
+ }
+
+ if (strcmp(req.crypto_pmd, AESNI_MB) == 0 && strcmp(req.cipher, AES_XTS) == 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid cipher. AES_XTS is only available on QAT.");
+ goto cleanup;
+ }
+
+ if (strcmp(req.cipher, AES_XTS) == 0 && req.key2 == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid key. A 2nd key is needed for AES_XTS.");
+ goto cleanup;
+ }
+
+ if (strcmp(req.cipher, AES_CBC) == 0 && req.key2 != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid key. A 2nd key is needed only for AES_XTS.");
+ goto cleanup;
+ }
+
+ rc = create_crypto_disk(req.base_bdev_name, req.name,
+ req.crypto_pmd, req.key, req.cipher, req.key2);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, req.name);
+ spdk_jsonrpc_end_result(request, w);
+ free_rpc_construct_crypto(&req);
+ return;
+
+cleanup:
+ free_rpc_construct_crypto(&req);
+}
+SPDK_RPC_REGISTER("bdev_crypto_create", rpc_bdev_crypto_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_crypto_create, construct_crypto_bdev)
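+
+/* Example request this handler accepts (names and key illustrative; assuming
+ * AES_CBC_KEY_LENGTH is 16, the key must be exactly 16 characters):
+ *
+ *   {"jsonrpc": "2.0", "id": 1, "method": "bdev_crypto_create",
+ *    "params": {"base_bdev_name": "Malloc0", "name": "crypto0",
+ *               "crypto_pmd": "crypto_aesni_mb", "key": "0123456789123456"}}
+ */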
+
+struct rpc_delete_crypto {
+ char *name;
+};
+
+static void
+free_rpc_delete_crypto(struct rpc_delete_crypto *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_crypto_decoders[] = {
+ {"name", offsetof(struct rpc_delete_crypto, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_crypto_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_crypto_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_crypto req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_crypto_decoders,
+ SPDK_COUNTOF(rpc_delete_crypto_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ delete_crypto_disk(bdev, rpc_bdev_crypto_delete_cb, request);
+
+ free_rpc_delete_crypto(&req);
+
+ return;
+
+cleanup:
+ free_rpc_delete_crypto(&req);
+}
+SPDK_RPC_REGISTER("bdev_crypto_delete", rpc_bdev_crypto_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_crypto_delete, delete_crypto_bdev)
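+
+/* Example request (name illustrative):
+ *
+ *   {"jsonrpc": "2.0", "id": 1, "method": "bdev_crypto_delete",
+ *    "params": {"name": "crypto0"}}
+ */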
diff --git a/src/spdk/module/bdev/delay/Makefile b/src/spdk/module/bdev/delay/Makefile
new file mode 100644
index 000000000..f043ca5a8
--- /dev/null
+++ b/src/spdk/module/bdev/delay/Makefile
@@ -0,0 +1,47 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/
+
+C_SRCS = vbdev_delay.c vbdev_delay_rpc.c
+LIBNAME = bdev_delay
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/delay/vbdev_delay.c b/src/spdk/module/bdev/delay/vbdev_delay.c
new file mode 100644
index 000000000..b4ea1b413
--- /dev/null
+++ b/src/spdk/module/bdev/delay/vbdev_delay.c
@@ -0,0 +1,851 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "vbdev_delay.h"
+#include "spdk/rpc.h"
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/util.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+
+static int vbdev_delay_init(void);
+static int vbdev_delay_get_ctx_size(void);
+static void vbdev_delay_examine(struct spdk_bdev *bdev);
+static void vbdev_delay_finish(void);
+static int vbdev_delay_config_json(struct spdk_json_write_ctx *w);
+
+static struct spdk_bdev_module delay_if = {
+ .name = "delay",
+ .module_init = vbdev_delay_init,
+ .config_text = NULL,
+ .get_ctx_size = vbdev_delay_get_ctx_size,
+ .examine_config = vbdev_delay_examine,
+ .module_fini = vbdev_delay_finish,
+ .config_json = vbdev_delay_config_json
+};
+
+SPDK_BDEV_MODULE_REGISTER(delay, &delay_if)
+
+/* Associative list to be used in examine */
+struct bdev_association {
+ char *vbdev_name;
+ char *bdev_name;
+ uint64_t avg_read_latency;
+ uint64_t p99_read_latency;
+ uint64_t avg_write_latency;
+ uint64_t p99_write_latency;
+ TAILQ_ENTRY(bdev_association) link;
+};
+static TAILQ_HEAD(, bdev_association) g_bdev_associations = TAILQ_HEAD_INITIALIZER(
+ g_bdev_associations);
+
+/* List of virtual bdevs and associated info for each. */
+struct vbdev_delay {
+ struct spdk_bdev *base_bdev; /* the thing we're attaching to */
+ struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
+ struct spdk_bdev delay_bdev; /* the delay virtual bdev */
+ uint64_t average_read_latency_ticks; /* the average read delay */
+ uint64_t p99_read_latency_ticks; /* the p99 read delay */
+ uint64_t average_write_latency_ticks; /* the average write delay */
+ uint64_t p99_write_latency_ticks; /* the p99 write delay */
+ TAILQ_ENTRY(vbdev_delay) link;
+ struct spdk_thread *thread; /* thread where base device is opened */
+};
+static TAILQ_HEAD(, vbdev_delay) g_delay_nodes = TAILQ_HEAD_INITIALIZER(g_delay_nodes);
+
+struct delay_bdev_io {
+ int status;
+
+ uint64_t completion_tick;
+
+ enum delay_io_type type;
+
+ struct spdk_io_channel *ch;
+
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+
+ STAILQ_ENTRY(delay_bdev_io) link;
+};
+
+struct delay_io_channel {
+ struct spdk_io_channel *base_ch; /* IO channel of base device */
+ STAILQ_HEAD(, delay_bdev_io) avg_read_io;
+ STAILQ_HEAD(, delay_bdev_io) p99_read_io;
+ STAILQ_HEAD(, delay_bdev_io) avg_write_io;
+ STAILQ_HEAD(, delay_bdev_io) p99_write_io;
+ struct spdk_poller *io_poller;
+ unsigned int rand_seed;
+};
+
+static void
+vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
+
+
+/* Callback for unregistering the IO device. */
+static void
+_device_unregister_cb(void *io_device)
+{
+ struct vbdev_delay *delay_node = io_device;
+
+ /* Done with this delay_node. */
+ free(delay_node->delay_bdev.name);
+ free(delay_node);
+}
+
+static void
+_vbdev_delay_destruct(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+static int
+vbdev_delay_destruct(void *ctx)
+{
+ struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
+
+ /* It is important to follow this exact sequence of steps for destroying
+ * a vbdev...
+ */
+
+ TAILQ_REMOVE(&g_delay_nodes, delay_node, link);
+
+ /* Unclaim the underlying bdev. */
+ spdk_bdev_module_release_bdev(delay_node->base_bdev);
+
+ /* Close the underlying bdev on its same opened thread. */
+ if (delay_node->thread && delay_node->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(delay_node->thread, _vbdev_delay_destruct, delay_node->base_desc);
+ } else {
+ spdk_bdev_close(delay_node->base_desc);
+ }
+
+ /* Unregister the io_device. */
+ spdk_io_device_unregister(delay_node, _device_unregister_cb);
+
+ return 0;
+}
+
+static int
+_process_io_stailq(void *arg, uint64_t ticks)
+{
+ STAILQ_HEAD(, delay_bdev_io) *head = arg;
+ struct delay_bdev_io *io_ctx, *tmp;
+ int completions = 0;
+
+ STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
+ if (io_ctx->completion_tick <= ticks) {
+ STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status);
+ completions++;
+ } else {
+ /* In the general case, I/O will become ready in FIFO order. When timeouts are dynamically
+ * changed, this is not necessarily the case. However, the normal behavior will be restored
+ * after the outstanding I/O at the time of the change have been completed.
+ * This essentially means that moving from a high to a low latency creates a dam for the new I/O
+ * submitted after the latency change. This is considered desirable behavior for the use case where
+ * we are trying to trigger a pre-defined timeout on an initiator.
+ */
+ break;
+ }
+ }
+
+ return completions;
+}
+
+static int
+_delay_finish_io(void *arg)
+{
+ struct delay_io_channel *delay_ch = arg;
+ uint64_t ticks = spdk_get_ticks();
+ int completions = 0;
+
+ completions += _process_io_stailq(&delay_ch->avg_read_io, ticks);
+ completions += _process_io_stailq(&delay_ch->avg_write_io, ticks);
+ completions += _process_io_stailq(&delay_ch->p99_read_io, ticks);
+ completions += _process_io_stailq(&delay_ch->p99_write_io, ticks);
+
+ return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
+}
+
+/* Completion callback for I/O that this module issued to the base bdev. The original bdev_io
+ * is passed in as an arg so we'll complete that one with the appropriate status
+ * and then free the one that this module issued.
+ */
+static void
+_delay_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ struct vbdev_delay *delay_node = SPDK_CONTAINEROF(orig_io->bdev, struct vbdev_delay, delay_bdev);
+ struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)orig_io->driver_ctx;
+ struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
+
+ io_ctx->status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+ spdk_bdev_free_io(bdev_io);
+
+ /* Put the I/O into the proper list for processing by the channel poller. */
+ switch (io_ctx->type) {
+ case DELAY_AVG_READ:
+ io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_read_latency_ticks;
+ STAILQ_INSERT_TAIL(&delay_ch->avg_read_io, io_ctx, link);
+ break;
+ case DELAY_AVG_WRITE:
+ io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_write_latency_ticks;
+ STAILQ_INSERT_TAIL(&delay_ch->avg_write_io, io_ctx, link);
+ break;
+ case DELAY_P99_READ:
+ io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_read_latency_ticks;
+ STAILQ_INSERT_TAIL(&delay_ch->p99_read_io, io_ctx, link);
+ break;
+ case DELAY_P99_WRITE:
+ io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_write_latency_ticks;
+ STAILQ_INSERT_TAIL(&delay_ch->p99_write_io, io_ctx, link);
+ break;
+ case DELAY_NONE:
+ default:
+ spdk_bdev_io_complete(orig_io, io_ctx->status);
+ break;
+ }
+}
+
+static void
+vbdev_delay_resubmit_io(void *arg)
+{
+ struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
+ struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
+
+ vbdev_delay_submit_request(io_ctx->ch, bdev_io);
+}
+
+static void
+vbdev_delay_queue_io(struct spdk_bdev_io *bdev_io)
+{
+ struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
+ struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
+ int rc;
+
+ io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
+ io_ctx->bdev_io_wait.cb_fn = vbdev_delay_resubmit_io;
+ io_ctx->bdev_io_wait.cb_arg = bdev_io;
+
+ rc = spdk_bdev_queue_io_wait(bdev_io->bdev, delay_ch->base_ch, &io_ctx->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vbdev_delay_queue_io, rc=%d.\n", rc);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+delay_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay,
+ delay_bdev);
+ struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ rc = spdk_bdev_readv_blocks(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks, _delay_complete_io,
+ bdev_io);
+
+ if (rc == -ENOMEM) {
+ SPDK_ERRLOG("No memory, start to queue io for delay.\n");
+ vbdev_delay_queue_io(bdev_io);
+ } else if (rc != 0) {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+vbdev_delay_reset_dev(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
+ struct vbdev_delay *delay_node = spdk_io_channel_iter_get_io_device(i);
+ int rc;
+
+ rc = spdk_bdev_reset(delay_node->base_desc, delay_ch->base_ch,
+ _delay_complete_io, bdev_io);
+
+ if (rc == -ENOMEM) {
+ SPDK_ERRLOG("No memory, start to queue io for delay.\n");
+ vbdev_delay_queue_io(bdev_io);
+ } else if (rc != 0) {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+_abort_all_delayed_io(void *arg)
+{
+ STAILQ_HEAD(, delay_bdev_io) *head = arg;
+ struct delay_bdev_io *io_ctx, *tmp;
+
+ STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
+ STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), SPDK_BDEV_IO_STATUS_ABORTED);
+ }
+}
+
+static void
+vbdev_delay_reset_channel(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
+
+ _abort_all_delayed_io(&delay_ch->avg_read_io);
+ _abort_all_delayed_io(&delay_ch->avg_write_io);
+ _abort_all_delayed_io(&delay_ch->p99_read_io);
+ _abort_all_delayed_io(&delay_ch->p99_write_io);
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static bool
+abort_delayed_io(void *_head, struct spdk_bdev_io *bio_to_abort)
+{
+ STAILQ_HEAD(, delay_bdev_io) *head = _head;
+ struct delay_bdev_io *io_ctx_to_abort = (struct delay_bdev_io *)bio_to_abort->driver_ctx;
+ struct delay_bdev_io *io_ctx;
+
+ STAILQ_FOREACH(io_ctx, head, link) {
+ if (io_ctx == io_ctx_to_abort) {
+ STAILQ_REMOVE(head, io_ctx_to_abort, delay_bdev_io, link);
+ spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static int
+vbdev_delay_abort(struct vbdev_delay *delay_node, struct delay_io_channel *delay_ch,
+ struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
+
+ if (abort_delayed_io(&delay_ch->avg_read_io, bio_to_abort) ||
+ abort_delayed_io(&delay_ch->avg_write_io, bio_to_abort) ||
+ abort_delayed_io(&delay_ch->p99_read_io, bio_to_abort) ||
+ abort_delayed_io(&delay_ch->p99_write_io, bio_to_abort)) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+ }
+
+ return spdk_bdev_abort(delay_node->base_desc, delay_ch->base_ch, bio_to_abort,
+ _delay_complete_io, bdev_io);
+}
+
+static void
+vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, delay_bdev);
+ struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
+ struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
+ int rc = 0;
+ bool is_p99;
+
+ is_p99 = (rand_r(&delay_ch->rand_seed) % 100) == 0;
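+ /* Illustration: with the modulo-100 draw above, roughly 1 in 100 I/Os
+ * takes the p99 path, which is what models the 99th-percentile tail. */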
+
+ io_ctx->ch = ch;
+ io_ctx->type = DELAY_NONE;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
+ spdk_bdev_io_get_buf(bdev_io, delay_read_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
+ rc = spdk_bdev_writev_blocks(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks, _delay_complete_io,
+ bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ rc = spdk_bdev_write_zeroes_blocks(delay_node->base_desc, delay_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _delay_complete_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ rc = spdk_bdev_unmap_blocks(delay_node->base_desc, delay_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _delay_complete_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ rc = spdk_bdev_flush_blocks(delay_node->base_desc, delay_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _delay_complete_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ /* During reset, the generic bdev layer aborts all new I/Os and queues all new resets.
+ * Hence we can simply abort all I/Os delayed to complete.
+ */
+ spdk_for_each_channel(delay_node, vbdev_delay_reset_channel, bdev_io,
+ vbdev_delay_reset_dev);
+ break;
+ case SPDK_BDEV_IO_TYPE_ABORT:
+ rc = vbdev_delay_abort(delay_node, delay_ch, bdev_io);
+ break;
+ default:
+ SPDK_ERRLOG("delay: unknown I/O type %d\n", bdev_io->type);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ if (rc == -ENOMEM) {
+ SPDK_ERRLOG("No memory, start to queue io for delay.\n");
+ vbdev_delay_queue_io(bdev_io);
+ } else if (rc != 0) {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static bool
+vbdev_delay_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
+
+ if (io_type == SPDK_BDEV_IO_TYPE_ZCOPY) {
+ return false;
+ } else {
+ return spdk_bdev_io_type_supported(delay_node->base_bdev, io_type);
+ }
+}
+
+static struct spdk_io_channel *
+vbdev_delay_get_io_channel(void *ctx)
+{
+ struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
+
+ return spdk_get_io_channel(delay_node);
+}
+
+static void
+_delay_write_conf_values(struct vbdev_delay *delay_node, struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&delay_node->delay_bdev));
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(delay_node->base_bdev));
+ spdk_json_write_named_int64(w, "avg_read_latency",
+ delay_node->average_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
+ spdk_json_write_named_int64(w, "p99_read_latency",
+ delay_node->p99_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
+ spdk_json_write_named_int64(w, "avg_write_latency",
+ delay_node->average_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
+ spdk_json_write_named_int64(w, "p99_write_latency",
+ delay_node->p99_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
+}
+
+static int
+vbdev_delay_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
+
+ spdk_json_write_name(w, "delay");
+ spdk_json_write_object_begin(w);
+ _delay_write_conf_values(delay_node, w);
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+/* This is used to generate JSON that can configure this module to its current state. */
+static int
+vbdev_delay_config_json(struct spdk_json_write_ctx *w)
+{
+ struct vbdev_delay *delay_node;
+
+ TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_delay_create");
+ spdk_json_write_named_object_begin(w, "params");
+ _delay_write_conf_values(delay_node, w);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ }
+ return 0;
+}
+
+/* We provide this callback for the SPDK channel code to create a channel using
+ * the channel struct we provided in our module get_io_channel() entry point. Here
+ * we get and save off an underlying base channel of the device below us so that
+ * we can communicate with the base bdev on a per channel basis. We also register
+ * the per-channel poller that completes delayed I/O once its deadline passes.
+ */
+static int
+delay_bdev_ch_create_cb(void *io_device, void *ctx_buf)
+{
+ struct delay_io_channel *delay_ch = ctx_buf;
+ struct vbdev_delay *delay_node = io_device;
+
+ STAILQ_INIT(&delay_ch->avg_read_io);
+ STAILQ_INIT(&delay_ch->p99_read_io);
+ STAILQ_INIT(&delay_ch->avg_write_io);
+ STAILQ_INIT(&delay_ch->p99_write_io);
+
+ delay_ch->io_poller = SPDK_POLLER_REGISTER(_delay_finish_io, delay_ch, 0);
+ delay_ch->base_ch = spdk_bdev_get_io_channel(delay_node->base_desc);
+ delay_ch->rand_seed = time(NULL);
+
+ return 0;
+}
+
+/* We provide this callback for the SPDK channel code to destroy a channel
+ * created with our create callback. We just need to undo anything we did
+ * when we created: unregister the channel poller and release the base channel.
+ */
+static void
+delay_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct delay_io_channel *delay_ch = ctx_buf;
+
+ spdk_poller_unregister(&delay_ch->io_poller);
+ spdk_put_io_channel(delay_ch->base_ch);
+}
+
+/* Create the delay association from the bdev and vbdev name and insert
+ * on the global list. */
+static int
+vbdev_delay_insert_association(const char *bdev_name, const char *vbdev_name,
+ uint64_t avg_read_latency, uint64_t p99_read_latency,
+ uint64_t avg_write_latency, uint64_t p99_write_latency)
+{
+ struct bdev_association *assoc;
+
+ TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
+ if (strcmp(vbdev_name, assoc->vbdev_name) == 0) {
+ SPDK_ERRLOG("delay bdev %s already exists\n", vbdev_name);
+ return -EEXIST;
+ }
+ }
+
+ assoc = calloc(1, sizeof(struct bdev_association));
+ if (!assoc) {
+ SPDK_ERRLOG("could not allocate bdev_association\n");
+ return -ENOMEM;
+ }
+
+ assoc->bdev_name = strdup(bdev_name);
+ if (!assoc->bdev_name) {
+ SPDK_ERRLOG("could not allocate assoc->bdev_name\n");
+ free(assoc);
+ return -ENOMEM;
+ }
+
+ assoc->vbdev_name = strdup(vbdev_name);
+ if (!assoc->vbdev_name) {
+ SPDK_ERRLOG("could not allocate assoc->vbdev_name\n");
+ free(assoc->bdev_name);
+ free(assoc);
+ return -ENOMEM;
+ }
+
+ assoc->avg_read_latency = avg_read_latency;
+ assoc->p99_read_latency = p99_read_latency;
+ assoc->avg_write_latency = avg_write_latency;
+ assoc->p99_write_latency = p99_write_latency;
+
+ TAILQ_INSERT_TAIL(&g_bdev_associations, assoc, link);
+
+ return 0;
+}
+
+int
+vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, enum delay_io_type type)
+{
+ struct spdk_bdev *delay_bdev;
+ struct vbdev_delay *delay_node;
+ uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
+
+ delay_bdev = spdk_bdev_get_by_name(delay_name);
+ if (delay_bdev == NULL) {
+ return -ENODEV;
+ } else if (delay_bdev->module != &delay_if) {
+ return -EINVAL;
+ }
+
+ delay_node = SPDK_CONTAINEROF(delay_bdev, struct vbdev_delay, delay_bdev);
+
+ switch (type) {
+ case DELAY_AVG_READ:
+ delay_node->average_read_latency_ticks = ticks_mhz * latency_us;
+ break;
+ case DELAY_AVG_WRITE:
+ delay_node->average_write_latency_ticks = ticks_mhz * latency_us;
+ break;
+ case DELAY_P99_READ:
+ delay_node->p99_read_latency_ticks = ticks_mhz * latency_us;
+ break;
+ case DELAY_P99_WRITE:
+ delay_node->p99_write_latency_ticks = ticks_mhz * latency_us;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
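+
+/* Worked example (assuming a 2.4 GHz timestamp counter): ticks_mhz is
+ * 2400000000 / 1000000 = 2400 ticks per microsecond, so latency_us = 100
+ * adds 240000 ticks to each matching I/O's completion deadline.
+ */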
+
+static int
+vbdev_delay_init(void)
+{
+ /* Not allowing for .ini style configuration. */
+ return 0;
+}
+
+static void
+vbdev_delay_finish(void)
+{
+ struct bdev_association *assoc;
+
+ while ((assoc = TAILQ_FIRST(&g_bdev_associations))) {
+ TAILQ_REMOVE(&g_bdev_associations, assoc, link);
+ free(assoc->bdev_name);
+ free(assoc->vbdev_name);
+ free(assoc);
+ }
+}
+
+static int
+vbdev_delay_get_ctx_size(void)
+{
+ return sizeof(struct delay_bdev_io);
+}
+
+static void
+vbdev_delay_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ /* No config per bdev needed */
+}
+
+/* When we register our bdev this is how we specify our entry points. */
+static const struct spdk_bdev_fn_table vbdev_delay_fn_table = {
+ .destruct = vbdev_delay_destruct,
+ .submit_request = vbdev_delay_submit_request,
+ .io_type_supported = vbdev_delay_io_type_supported,
+ .get_io_channel = vbdev_delay_get_io_channel,
+ .dump_info_json = vbdev_delay_dump_info_json,
+ .write_config_json = vbdev_delay_write_config_json,
+};
+
+/* Called when the underlying base bdev goes away. */
+static void
+vbdev_delay_base_bdev_hotremove_cb(void *ctx)
+{
+ struct vbdev_delay *delay_node, *tmp;
+ struct spdk_bdev *bdev_find = ctx;
+
+ TAILQ_FOREACH_SAFE(delay_node, &g_delay_nodes, link, tmp) {
+ if (bdev_find == delay_node->base_bdev) {
+ spdk_bdev_unregister(&delay_node->delay_bdev, NULL, NULL);
+ }
+ }
+}
+
+/* Create and register the delay vbdev if we find it in our list of bdev names.
+ * This can be called either by the examine path or RPC method.
+ */
+static int
+vbdev_delay_register(struct spdk_bdev *bdev)
+{
+ struct bdev_association *assoc;
+ struct vbdev_delay *delay_node;
+ uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
+ int rc = 0;
+
+ /* Check our list of names from config versus this bdev and if
+ * there's a match, create the delay_node & bdev accordingly.
+ */
+ TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
+ if (strcmp(assoc->bdev_name, bdev->name) != 0) {
+ continue;
+ }
+
+ delay_node = calloc(1, sizeof(struct vbdev_delay));
+ if (!delay_node) {
+ rc = -ENOMEM;
+ SPDK_ERRLOG("could not allocate delay_node\n");
+ break;
+ }
+
+ /* The base bdev that we're attaching to. */
+ delay_node->base_bdev = bdev;
+ delay_node->delay_bdev.name = strdup(assoc->vbdev_name);
+ if (!delay_node->delay_bdev.name) {
+ rc = -ENOMEM;
+ SPDK_ERRLOG("could not allocate delay_bdev name\n");
+ free(delay_node);
+ break;
+ }
+ delay_node->delay_bdev.product_name = "delay";
+
+ delay_node->delay_bdev.write_cache = bdev->write_cache;
+ delay_node->delay_bdev.required_alignment = bdev->required_alignment;
+ delay_node->delay_bdev.optimal_io_boundary = bdev->optimal_io_boundary;
+ delay_node->delay_bdev.blocklen = bdev->blocklen;
+ delay_node->delay_bdev.blockcnt = bdev->blockcnt;
+
+ delay_node->delay_bdev.ctxt = delay_node;
+ delay_node->delay_bdev.fn_table = &vbdev_delay_fn_table;
+ delay_node->delay_bdev.module = &delay_if;
+
+ /* Store the number of ticks you need to add to get the I/O expiration time. */
+ delay_node->average_read_latency_ticks = ticks_mhz * assoc->avg_read_latency;
+ delay_node->p99_read_latency_ticks = ticks_mhz * assoc->p99_read_latency;
+ delay_node->average_write_latency_ticks = ticks_mhz * assoc->avg_write_latency;
+ delay_node->p99_write_latency_ticks = ticks_mhz * assoc->p99_write_latency;
+
+ spdk_io_device_register(delay_node, delay_bdev_ch_create_cb, delay_bdev_ch_destroy_cb,
+ sizeof(struct delay_io_channel),
+ assoc->vbdev_name);
+
+ rc = spdk_bdev_open(bdev, true, vbdev_delay_base_bdev_hotremove_cb,
+ bdev, &delay_node->base_desc);
+ if (rc) {
+ SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev));
+ goto error_unregister;
+ }
+
+ /* Save the thread where the base device is opened */
+ delay_node->thread = spdk_get_thread();
+
+ rc = spdk_bdev_module_claim_bdev(bdev, delay_node->base_desc, delay_node->delay_bdev.module);
+ if (rc) {
+ SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(bdev));
+ goto error_close;
+ }
+
+ rc = spdk_bdev_register(&delay_node->delay_bdev);
+ if (rc) {
+ SPDK_ERRLOG("could not register delay_bdev\n");
+ spdk_bdev_module_release_bdev(delay_node->base_bdev);
+ goto error_close;
+ }
+
+ TAILQ_INSERT_TAIL(&g_delay_nodes, delay_node, link);
+ }
+
+ return rc;
+
+error_close:
+ spdk_bdev_close(delay_node->base_desc);
+error_unregister:
+ spdk_io_device_unregister(delay_node, NULL);
+ free(delay_node->delay_bdev.name);
+ free(delay_node);
+ return rc;
+}
+
+int
+create_delay_disk(const char *bdev_name, const char *vbdev_name, uint64_t avg_read_latency,
+ uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency)
+{
+ struct spdk_bdev *bdev = NULL;
+ int rc = 0;
+
+ if (p99_read_latency < avg_read_latency || p99_write_latency < avg_write_latency) {
+ SPDK_ERRLOG("Unable to create a delay bdev where p99 latency is less than average latency.\n");
+ return -EINVAL;
+ }
+
+ rc = vbdev_delay_insert_association(bdev_name, vbdev_name, avg_read_latency, p99_read_latency,
+ avg_write_latency, p99_write_latency);
+ if (rc) {
+ return rc;
+ }
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
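+ /* If the base bdev isn't registered yet, the association stored above is
+ * kept and the delay vbdev is created later from the examine path.
+ */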
+ if (!bdev) {
+ return 0;
+ }
+
+ return vbdev_delay_register(bdev);
+}
+
+void
+delete_delay_disk(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
+{
+ struct bdev_association *assoc;
+
+ if (!bdev || bdev->module != &delay_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
+ if (strcmp(assoc->vbdev_name, bdev->name) == 0) {
+ TAILQ_REMOVE(&g_bdev_associations, assoc, link);
+ free(assoc->bdev_name);
+ free(assoc->vbdev_name);
+ free(assoc);
+ break;
+ }
+ }
+
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
+static void
+vbdev_delay_examine(struct spdk_bdev *bdev)
+{
+ vbdev_delay_register(bdev);
+
+ spdk_bdev_module_examine_done(&delay_if);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_delay", SPDK_LOG_VBDEV_DELAY)
diff --git a/src/spdk/module/bdev/delay/vbdev_delay.h b/src/spdk/module/bdev/delay/vbdev_delay.h
new file mode 100644
index 000000000..4f88a5e2f
--- /dev/null
+++ b/src/spdk/module/bdev/delay/vbdev_delay.h
@@ -0,0 +1,85 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_DELAY_H
+#define SPDK_VBDEV_DELAY_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+
+enum delay_io_type {
+ DELAY_AVG_READ,
+ DELAY_P99_READ,
+ DELAY_AVG_WRITE,
+ DELAY_P99_WRITE,
+ DELAY_NONE
+};
+
+/**
+ * Create new delay bdev.
+ *
+ * \param bdev_name Name of the base bdev on which the delay vbdev will be created.
+ * \param vbdev_name Name of the delay bdev.
+ * \param avg_read_latency Desired typical read latency, in microseconds.
+ * \param p99_read_latency Desired p99 read latency, in microseconds.
+ * \param avg_write_latency Desired typical write latency, in microseconds.
+ * \param p99_write_latency Desired p99 write latency, in microseconds.
+ * \return 0 on success, negative errno on failure.
+ */
+int create_delay_disk(const char *bdev_name, const char *vbdev_name, uint64_t avg_read_latency,
+ uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency);
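+
+/* Example (hypothetical names/values): create a delay bdev "Delay0" on top of
+ * "Malloc0" with 100 us avg / 1000 us p99 reads and 50 us / 500 us writes:
+ *
+ * create_delay_disk("Malloc0", "Delay0", 100, 1000, 50, 500);
+ */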
+
+/**
+ * Delete delay bdev.
+ *
+ * \param bdev Pointer to delay bdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void delete_delay_disk(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn,
+ void *cb_arg);
+
+/**
+ * Update one of the latency values for a given delay bdev.
+ *
+ * \param delay_name The name of the delay bdev.
+ * \param latency_us The new latency value, in microseconds.
+ * \param type A valid value from the delay_io_type enum.
+ * \return 0 on success, -ENODEV if the bdev cannot be found, -EINVAL if the bdev is not a delay bdev or the type is invalid.
+ */
+int vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us,
+ enum delay_io_type type);
+
+#endif /* SPDK_VBDEV_DELAY_H */
diff --git a/src/spdk/module/bdev/delay/vbdev_delay_rpc.c b/src/spdk/module/bdev/delay/vbdev_delay_rpc.c
new file mode 100644
index 000000000..aabbadd69
--- /dev/null
+++ b/src/spdk/module/bdev/delay/vbdev_delay_rpc.c
@@ -0,0 +1,225 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vbdev_delay.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+#include "spdk_internal/assert.h"
+
+struct rpc_update_latency {
+ char *delay_bdev_name;
+ char *latency_type;
+ uint64_t latency_us;
+};
+
+static const struct spdk_json_object_decoder rpc_update_latency_decoders[] = {
+ {"delay_bdev_name", offsetof(struct rpc_update_latency, delay_bdev_name), spdk_json_decode_string},
+ {"latency_type", offsetof(struct rpc_update_latency, latency_type), spdk_json_decode_string},
+ {"latency_us", offsetof(struct rpc_update_latency, latency_us), spdk_json_decode_uint64}
+};
+
+static void
+free_rpc_update_latency(struct rpc_update_latency *req)
+{
+ free(req->delay_bdev_name);
+ free(req->latency_type);
+}
+
+static void
+rpc_bdev_delay_update_latency(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_update_latency req = {NULL};
+ struct spdk_json_write_ctx *w;
+ enum delay_io_type latency_type;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_update_latency_decoders,
+ SPDK_COUNTOF(rpc_update_latency_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_DELAY, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ if (strcmp(req.latency_type, "avg_read") == 0) {
+ latency_type = DELAY_AVG_READ;
+ } else if (strcmp(req.latency_type, "p99_read") == 0) {
+ latency_type = DELAY_P99_READ;
+ } else if (strcmp(req.latency_type, "avg_write") == 0) {
+ latency_type = DELAY_AVG_WRITE;
+ } else if (strcmp(req.latency_type, "p99_write") == 0) {
+ latency_type = DELAY_P99_WRITE;
+ } else {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Please specify a valid latency type.");
+ goto cleanup;
+ }
+
+ rc = vbdev_delay_update_latency_value(req.delay_bdev_name, req.latency_us, latency_type);
+
+ if (rc == -ENODEV) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "The requested bdev does not exist.");
+ goto cleanup;
+ } else if (rc == -EINVAL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST,
+ "The requested bdev is not a delay bdev.");
+ goto cleanup;
+ } else if (rc) {
+ SPDK_UNREACHABLE();
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_update_latency(&req);
+}
+SPDK_RPC_REGISTER("bdev_delay_update_latency", rpc_bdev_delay_update_latency, SPDK_RPC_RUNTIME)
+
+struct rpc_construct_delay {
+ char *base_bdev_name;
+ char *name;
+ uint64_t avg_read_latency;
+ uint64_t p99_read_latency;
+ uint64_t avg_write_latency;
+ uint64_t p99_write_latency;
+};
+
+static void
+free_rpc_construct_delay(struct rpc_construct_delay *r)
+{
+ free(r->base_bdev_name);
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_construct_delay_decoders[] = {
+ {"base_bdev_name", offsetof(struct rpc_construct_delay, base_bdev_name), spdk_json_decode_string},
+ {"name", offsetof(struct rpc_construct_delay, name), spdk_json_decode_string},
+ {"avg_read_latency", offsetof(struct rpc_construct_delay, avg_read_latency), spdk_json_decode_uint64},
+ {"p99_read_latency", offsetof(struct rpc_construct_delay, p99_read_latency), spdk_json_decode_uint64},
+ {"avg_write_latency", offsetof(struct rpc_construct_delay, avg_write_latency), spdk_json_decode_uint64},
+ {"p99_write_latency", offsetof(struct rpc_construct_delay, p99_write_latency), spdk_json_decode_uint64},
+};
+
+static void
+rpc_bdev_delay_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_delay req = {NULL};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_delay_decoders,
+ SPDK_COUNTOF(rpc_construct_delay_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_DELAY, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = create_delay_disk(req.base_bdev_name, req.name, req.avg_read_latency, req.p99_read_latency,
+ req.avg_write_latency, req.p99_write_latency);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, req.name);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_construct_delay(&req);
+}
+SPDK_RPC_REGISTER("bdev_delay_create", rpc_bdev_delay_create, SPDK_RPC_RUNTIME)
+
+struct rpc_delete_delay {
+ char *name;
+};
+
+static void
+free_rpc_delete_delay(struct rpc_delete_delay *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_delay_decoders[] = {
+ {"name", offsetof(struct rpc_delete_delay, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_delay_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_delay_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_delay req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_delay_decoders,
+ SPDK_COUNTOF(rpc_delete_delay_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ delete_delay_disk(bdev, rpc_bdev_delay_delete_cb, request);
+
+cleanup:
+ free_rpc_delete_delay(&req);
+}
+SPDK_RPC_REGISTER("bdev_delay_delete", rpc_bdev_delay_delete, SPDK_RPC_RUNTIME)
diff --git a/src/spdk/module/bdev/error/Makefile b/src/spdk/module/bdev/error/Makefile
new file mode 100644
index 000000000..e67a18530
--- /dev/null
+++ b/src/spdk/module/bdev/error/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = vbdev_error.c vbdev_error_rpc.c
+LIBNAME = bdev_error
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/error/vbdev_error.c b/src/spdk/module/bdev/error/vbdev_error.c
new file mode 100644
index 000000000..643d0d8a1
--- /dev/null
+++ b/src/spdk/module/bdev/error/vbdev_error.c
@@ -0,0 +1,508 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This module is for test purposes; it simulates error cases for bdevs.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/rpc.h"
+#include "spdk/conf.h"
+#include "spdk/util.h"
+#include "spdk/endian.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/string.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+#include "vbdev_error.h"
+
+struct spdk_vbdev_error_config {
+ char *base_bdev;
+ TAILQ_ENTRY(spdk_vbdev_error_config) tailq;
+};
+
+static TAILQ_HEAD(, spdk_vbdev_error_config) g_error_config
+ = TAILQ_HEAD_INITIALIZER(g_error_config);
+
+struct vbdev_error_info {
+ uint32_t error_type;
+ uint32_t error_num;
+};
+
+/* Context for each error bdev */
+struct error_disk {
+ struct spdk_bdev_part part;
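+ /* Per-I/O-type error state, indexed by enum spdk_bdev_io_type; RESET is
+ * handled separately in the submit path, so it bounds the array size.
+ */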
+ struct vbdev_error_info error_vector[SPDK_BDEV_IO_TYPE_RESET];
+ TAILQ_HEAD(, spdk_bdev_io) pending_ios;
+};
+
+struct error_channel {
+ struct spdk_bdev_part_channel part_ch;
+};
+
+static pthread_mutex_t g_vbdev_error_mutex = PTHREAD_MUTEX_INITIALIZER;
+static SPDK_BDEV_PART_TAILQ g_error_disks = TAILQ_HEAD_INITIALIZER(g_error_disks);
+
+static int vbdev_error_init(void);
+static void vbdev_error_fini(void);
+
+static void vbdev_error_examine(struct spdk_bdev *bdev);
+static int vbdev_error_config_json(struct spdk_json_write_ctx *w);
+
+static int vbdev_error_config_add(const char *base_bdev_name);
+static int vbdev_error_config_remove(const char *base_bdev_name);
+
+static struct spdk_bdev_module error_if = {
+ .name = "error",
+ .module_init = vbdev_error_init,
+ .module_fini = vbdev_error_fini,
+ .examine_config = vbdev_error_examine,
+ .config_json = vbdev_error_config_json,
+};
+
+SPDK_BDEV_MODULE_REGISTER(error, &error_if)
+
+int
+vbdev_error_inject_error(char *name, uint32_t io_type, uint32_t error_type, uint32_t error_num)
+{
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_part *part;
+ struct error_disk *error_disk = NULL;
+ uint32_t i;
+
+ pthread_mutex_lock(&g_vbdev_error_mutex);
+ bdev = spdk_bdev_get_by_name(name);
+ if (!bdev) {
+ SPDK_ERRLOG("Could not find ErrorInjection bdev %s\n", name);
+ pthread_mutex_unlock(&g_vbdev_error_mutex);
+ return -ENODEV;
+ }
+
+ TAILQ_FOREACH(part, &g_error_disks, tailq) {
+ if (bdev == spdk_bdev_part_get_bdev(part)) {
+ error_disk = (struct error_disk *)part;
+ break;
+ }
+ }
+
+ if (error_disk == NULL) {
+ SPDK_ERRLOG("Could not find ErrorInjection bdev %s\n", name);
+ pthread_mutex_unlock(&g_vbdev_error_mutex);
+ return -ENODEV;
+ }
+
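+ /* io_type 0xffffffff means "all I/O types"; io_type 0 clears all
+ * outstanding injected errors.
+ */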
+ if (0xffffffff == io_type) {
+ for (i = 0; i < SPDK_COUNTOF(error_disk->error_vector); i++) {
+ error_disk->error_vector[i].error_type = error_type;
+ error_disk->error_vector[i].error_num = error_num;
+ }
+ } else if (0 == io_type) {
+ for (i = 0; i < SPDK_COUNTOF(error_disk->error_vector); i++) {
+ error_disk->error_vector[i].error_num = 0;
+ }
+ } else {
+ error_disk->error_vector[io_type].error_type = error_type;
+ error_disk->error_vector[io_type].error_num = error_num;
+ }
+ pthread_mutex_unlock(&g_vbdev_error_mutex);
+ return 0;
+}
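+
+/* Example (hypothetical name): fail the next 5 reads submitted to "EE_Malloc0":
+ *
+ * vbdev_error_inject_error("EE_Malloc0", SPDK_BDEV_IO_TYPE_READ,
+ * VBDEV_IO_FAILURE, 5);
+ */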
+
+static void
+vbdev_error_reset(struct error_disk *error_disk, struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev_io *pending_io, *tmp;
+
+ TAILQ_FOREACH_SAFE(pending_io, &error_disk->pending_ios, module_link, tmp) {
+ TAILQ_REMOVE(&error_disk->pending_ios, pending_io, module_link);
+ spdk_bdev_io_complete(pending_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+}
+
+static uint32_t
+vbdev_error_get_error_type(struct error_disk *error_disk, uint32_t io_type)
+{
+ if (error_disk->error_vector[io_type].error_num) {
+ return error_disk->error_vector[io_type].error_type;
+ }
+ return 0;
+}
+
+static void
+vbdev_error_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
+{
+ struct error_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct error_disk *error_disk = bdev_io->bdev->ctxt;
+ uint32_t error_type;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ vbdev_error_reset(error_disk, bdev_io);
+ return;
+ default:
+ SPDK_ERRLOG("Error Injection: unknown I/O type %d\n", bdev_io->type);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ error_type = vbdev_error_get_error_type(error_disk, bdev_io->type);
+ if (error_type == 0) {
+ int rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io);
+
+ if (rc) {
+ SPDK_ERRLOG("bdev_error: submit request failed, rc=%d\n", rc);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ return;
+ } else if (error_type == VBDEV_IO_FAILURE) {
+ error_disk->error_vector[bdev_io->type].error_num--;
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ } else if (error_type == VBDEV_IO_PENDING) {
+ TAILQ_INSERT_TAIL(&error_disk->pending_ios, bdev_io, module_link);
+ error_disk->error_vector[bdev_io->type].error_num--;
+ }
+}
+
+static int
+vbdev_error_destruct(void *ctx)
+{
+ struct error_disk *error_disk = ctx;
+ struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(&error_disk->part);
+ int rc;
+
+ rc = vbdev_error_config_remove(base_bdev->name);
+ if (rc != 0) {
+ SPDK_ERRLOG("vbdev_error_config_remove() failed\n");
+ }
+
+ return spdk_bdev_part_free(&error_disk->part);
+}
+
+static int
+vbdev_error_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct error_disk *error_disk = ctx;
+ struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(&error_disk->part);
+
+ spdk_json_write_named_object_begin(w, "error_disk");
+
+ spdk_json_write_named_string(w, "base_bdev", base_bdev->name);
+
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static void
+vbdev_error_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ /* No config per bdev. */
+}
+
+static struct spdk_bdev_fn_table vbdev_error_fn_table = {
+ .destruct = vbdev_error_destruct,
+ .submit_request = vbdev_error_submit_request,
+ .dump_info_json = vbdev_error_dump_info_json,
+ .write_config_json = vbdev_error_write_config_json
+};
+
+static void
+vbdev_error_base_bdev_hotremove_cb(void *_part_base)
+{
+ struct spdk_bdev_part_base *part_base = _part_base;
+
+ spdk_bdev_part_base_hotremove(part_base, &g_error_disks);
+}
+
+static int
+_vbdev_error_create(struct spdk_bdev *base_bdev)
+{
+ struct spdk_bdev_part_base *base = NULL;
+ struct error_disk *disk = NULL;
+ char *name;
+ int rc;
+
+ base = spdk_bdev_part_base_construct(base_bdev,
+ vbdev_error_base_bdev_hotremove_cb,
+ &error_if, &vbdev_error_fn_table, &g_error_disks,
+ NULL, NULL, sizeof(struct error_channel),
+ NULL, NULL);
+ if (!base) {
+ SPDK_ERRLOG("could not construct part base for bdev %s\n", spdk_bdev_get_name(base_bdev));
+ return -ENOMEM;
+ }
+
+ disk = calloc(1, sizeof(*disk));
+ if (!disk) {
+ SPDK_ERRLOG("Memory allocation failure\n");
+ spdk_bdev_part_base_free(base);
+ return -ENOMEM;
+ }
+
+ name = spdk_sprintf_alloc("EE_%s", spdk_bdev_get_name(base_bdev));
+ if (!name) {
+ SPDK_ERRLOG("name allocation failure\n");
+ spdk_bdev_part_base_free(base);
+ free(disk);
+ return -ENOMEM;
+ }
+
+ rc = spdk_bdev_part_construct(&disk->part, base, name, 0, base_bdev->blockcnt,
+ "Error Injection Disk");
+ free(name);
+ if (rc) {
+ SPDK_ERRLOG("could not construct part for bdev %s\n", spdk_bdev_get_name(base_bdev));
+ /* name was already freed above; just release the part base and disk. */
+ spdk_bdev_part_base_free(base);
+ free(disk);
+ return rc;
+ }
+
+ TAILQ_INIT(&disk->pending_ios);
+
+ return 0;
+}
+
+int
+vbdev_error_create(const char *base_bdev_name)
+{
+ int rc;
+ struct spdk_bdev *base_bdev;
+
+ rc = vbdev_error_config_add(base_bdev_name);
+ if (rc != 0) {
+ SPDK_ERRLOG("Adding config for ErrorInjection bdev %s failed (rc=%d)\n",
+ base_bdev_name, rc);
+ return rc;
+ }
+
+ base_bdev = spdk_bdev_get_by_name(base_bdev_name);
+ if (!base_bdev) {
+ return 0;
+ }
+
+ rc = _vbdev_error_create(base_bdev);
+ if (rc != 0) {
+ vbdev_error_config_remove(base_bdev_name);
+ SPDK_ERRLOG("Could not create ErrorInjection bdev %s (rc=%d)\n",
+ base_bdev_name, rc);
+ }
+
+ return rc;
+}
+
+void
+vbdev_error_delete(struct spdk_bdev *vbdev, spdk_delete_error_complete cb_fn, void *cb_arg)
+{
+ if (!vbdev || vbdev->module != &error_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ spdk_bdev_unregister(vbdev, cb_fn, cb_arg);
+}
+
+static void
+vbdev_error_clear_config(void)
+{
+ struct spdk_vbdev_error_config *cfg;
+
+ while ((cfg = TAILQ_FIRST(&g_error_config))) {
+ TAILQ_REMOVE(&g_error_config, cfg, tailq);
+ free(cfg->base_bdev);
+ free(cfg);
+ }
+}
+
+static struct spdk_vbdev_error_config *
+vbdev_error_config_find_by_base_name(const char *base_bdev_name)
+{
+ struct spdk_vbdev_error_config *cfg;
+
+ TAILQ_FOREACH(cfg, &g_error_config, tailq) {
+ if (strcmp(cfg->base_bdev, base_bdev_name) == 0) {
+ return cfg;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+vbdev_error_config_add(const char *base_bdev_name)
+{
+ struct spdk_vbdev_error_config *cfg;
+
+ cfg = vbdev_error_config_find_by_base_name(base_bdev_name);
+ if (cfg) {
+ SPDK_ERRLOG("vbdev_error_config for bdev %s already exists\n",
+ base_bdev_name);
+ return -EEXIST;
+ }
+
+ cfg = calloc(1, sizeof(*cfg));
+ if (!cfg) {
+ SPDK_ERRLOG("calloc() failed for vbdev_error_config\n");
+ return -ENOMEM;
+ }
+
+ cfg->base_bdev = strdup(base_bdev_name);
+ if (!cfg->base_bdev) {
+ free(cfg);
+ SPDK_ERRLOG("strdup() failed for base_bdev_name\n");
+ return -ENOMEM;
+ }
+
+ TAILQ_INSERT_TAIL(&g_error_config, cfg, tailq);
+
+ return 0;
+}
+
+static int
+vbdev_error_config_remove(const char *base_bdev_name)
+{
+ struct spdk_vbdev_error_config *cfg;
+
+ cfg = vbdev_error_config_find_by_base_name(base_bdev_name);
+ if (!cfg) {
+ return -ENOENT;
+ }
+
+ TAILQ_REMOVE(&g_error_config, cfg, tailq);
+ free(cfg->base_bdev);
+ free(cfg);
+ return 0;
+}
+
+static int
+vbdev_error_init(void)
+{
+ struct spdk_conf_section *sp;
+ struct spdk_vbdev_error_config *cfg;
+ const char *base_bdev_name;
+ int i, rc;
+
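+ /* Example legacy .ini configuration parsed below (bdev name hypothetical):
+ *
+ * [BdevError]
+ * BdevError Malloc0
+ */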
+ sp = spdk_conf_find_section(NULL, "BdevError");
+ if (sp == NULL) {
+ return 0;
+ }
+
+ for (i = 0; ; i++) {
+ if (!spdk_conf_section_get_nval(sp, "BdevError", i)) {
+ break;
+ }
+
+ base_bdev_name = spdk_conf_section_get_nmval(sp, "BdevError", i, 0);
+ if (!base_bdev_name) {
+ SPDK_ERRLOG("ErrorInjection configuration missing bdev name\n");
+ rc = -EINVAL;
+ goto error;
+ }
+
+ cfg = calloc(1, sizeof(*cfg));
+ if (!cfg) {
+ SPDK_ERRLOG("calloc() failed for vbdev_error_config\n");
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ cfg->base_bdev = strdup(base_bdev_name);
+ if (!cfg->base_bdev) {
+ free(cfg);
+ SPDK_ERRLOG("strdup() failed for bdev name\n");
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ TAILQ_INSERT_TAIL(&g_error_config, cfg, tailq);
+ }
+
+ return 0;
+
+error:
+ vbdev_error_clear_config();
+ return rc;
+}
+
+static void
+vbdev_error_fini(void)
+{
+ vbdev_error_clear_config();
+}
+
+static void
+vbdev_error_examine(struct spdk_bdev *bdev)
+{
+ struct spdk_vbdev_error_config *cfg;
+ int rc;
+
+ cfg = vbdev_error_config_find_by_base_name(bdev->name);
+ if (cfg != NULL) {
+ rc = _vbdev_error_create(bdev);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not create error vbdev for bdev %s at examine\n",
+ bdev->name);
+ }
+ }
+
+ spdk_bdev_module_examine_done(&error_if);
+}
+
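+/* Example of the JSON object emitted per configured bdev (base name
+ * hypothetical):
+ *
+ * {"method": "bdev_error_create", "params": {"base_name": "Malloc0"}}
+ */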
+static int
+vbdev_error_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_vbdev_error_config *cfg;
+
+ TAILQ_FOREACH(cfg, &g_error_config, tailq) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_error_create");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "base_name", cfg->base_bdev);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+
+ return 0;
+}
diff --git a/src/spdk/module/bdev/error/vbdev_error.h b/src/spdk/module/bdev/error/vbdev_error.h
new file mode 100644
index 000000000..8c0daaeac
--- /dev/null
+++ b/src/spdk/module/bdev/error/vbdev_error.h
@@ -0,0 +1,76 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_ERROR_H
+#define SPDK_VBDEV_ERROR_H
+
+#include "spdk/stdinc.h"
+#include "spdk/bdev.h"
+
+enum vbdev_error_type {
+ VBDEV_IO_FAILURE = 1,
+ VBDEV_IO_PENDING,
+};
+
+typedef void (*spdk_delete_error_complete)(void *cb_arg, int bdeverrno);
+
+/**
+ * Create a vbdev on the base bdev to inject error into it.
+ *
+ * \param base_bdev_name Name of the base bdev.
+ * \return 0 on success or negative on failure.
+ */
+int vbdev_error_create(const char *base_bdev_name);
+
+/**
+ * Delete vbdev used to inject errors.
+ *
+ * \param bdev Pointer to error vbdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void vbdev_error_delete(struct spdk_bdev *vbdev, spdk_delete_error_complete cb_fn,
+ void *cb_arg);
+
+/**
+ * Inject errors into an ErrorInjection bdev. Users can specify which I/O type
+ * the error is injected into, what type of error is injected, and how many
+ * errors are injected.
+ *
+ * \param name Name of the ErrorInjection bdev into which errors are injected.
+ * \param io_type I/O type into which errors are injected.
+ * \param error_type Type of the injected error (see enum vbdev_error_type).
+ * \param error_num Count of errors to inject.
+ * \return 0 on success, -ENODEV if the ErrorInjection bdev cannot be found.
+ */
+int vbdev_error_inject_error(char *name, uint32_t io_type, uint32_t error_type,
+ uint32_t error_num);
+
+#endif /* SPDK_VBDEV_ERROR_H */
diff --git a/src/spdk/module/bdev/error/vbdev_error_rpc.c b/src/spdk/module/bdev/error/vbdev_error_rpc.c
new file mode 100644
index 000000000..2dcbfd33e
--- /dev/null
+++ b/src/spdk/module/bdev/error/vbdev_error_rpc.c
@@ -0,0 +1,245 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+#include "vbdev_error.h"
+
+#define ERROR_BDEV_IO_TYPE_INVALID (SPDK_BDEV_IO_TYPE_RESET + 1)
+#define ERROR_BDEV_ERROR_TYPE_INVALID (VBDEV_IO_PENDING + 1)
+
+static uint32_t
+rpc_error_bdev_io_type_parse(char *name)
+{
+ if (strcmp(name, "read") == 0) {
+ return SPDK_BDEV_IO_TYPE_READ;
+ } else if (strcmp(name, "write") == 0) {
+ return SPDK_BDEV_IO_TYPE_WRITE;
+ } else if (strcmp(name, "flush") == 0) {
+ return SPDK_BDEV_IO_TYPE_FLUSH;
+ } else if (strcmp(name, "unmap") == 0) {
+ return SPDK_BDEV_IO_TYPE_UNMAP;
+ } else if (strcmp(name, "all") == 0) {
+ return 0xffffffff;
+ } else if (strcmp(name, "clear") == 0) {
+ return 0;
+ }
+ return ERROR_BDEV_IO_TYPE_INVALID;
+}
+
+static uint32_t
+rpc_error_bdev_error_type_parse(char *name)
+{
+ if (strcmp(name, "failure") == 0) {
+ return VBDEV_IO_FAILURE;
+ } else if (strcmp(name, "pending") == 0) {
+ return VBDEV_IO_PENDING;
+ }
+ return ERROR_BDEV_ERROR_TYPE_INVALID;
+}
+
+struct rpc_bdev_error_create {
+ char *base_name;
+};
+
+static void
+free_rpc_bdev_error_create(struct rpc_bdev_error_create *req)
+{
+ free(req->base_name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_error_create_decoders[] = {
+ {"base_name", offsetof(struct rpc_bdev_error_create, base_name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_error_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_error_create req = {};
+ struct spdk_json_write_ctx *w;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_bdev_error_create_decoders,
+ SPDK_COUNTOF(rpc_bdev_error_create_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = vbdev_error_create(req.base_name);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_error_create(&req);
+}
+SPDK_RPC_REGISTER("bdev_error_create", rpc_bdev_error_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_error_create, construct_error_bdev)
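+/* Example invocation (assuming the stock scripts/rpc.py helper; the bdev name
+ * is hypothetical): rpc.py bdev_error_create Malloc0
+ */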
+
+struct rpc_delete_error {
+ char *name;
+};
+
+static void
+free_rpc_delete_error(struct rpc_delete_error *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_error_decoders[] = {
+ {"name", offsetof(struct rpc_delete_error, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_error_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_error_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_error req = {NULL};
+ struct spdk_bdev *vbdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_error_decoders,
+ SPDK_COUNTOF(rpc_delete_error_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ vbdev = spdk_bdev_get_by_name(req.name);
+ if (vbdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ vbdev_error_delete(vbdev, rpc_bdev_error_delete_cb, request);
+
+cleanup:
+ free_rpc_delete_error(&req);
+}
+SPDK_RPC_REGISTER("bdev_error_delete", rpc_bdev_error_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_error_delete, delete_error_bdev)
+
+struct rpc_error_information {
+ char *name;
+ char *io_type;
+ char *error_type;
+ uint32_t num;
+};
+
+static const struct spdk_json_object_decoder rpc_error_information_decoders[] = {
+ {"name", offsetof(struct rpc_error_information, name), spdk_json_decode_string},
+ {"io_type", offsetof(struct rpc_error_information, io_type), spdk_json_decode_string},
+ {"error_type", offsetof(struct rpc_error_information, error_type), spdk_json_decode_string},
+ {"num", offsetof(struct rpc_error_information, num), spdk_json_decode_uint32, true},
+};
+
+static void
+free_rpc_error_information(struct rpc_error_information *p)
+{
+ free(p->name);
+ free(p->io_type);
+ free(p->error_type);
+}
+
+static void
+rpc_bdev_error_inject_error(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_error_information req = {};
+ struct spdk_json_write_ctx *w;
+ uint32_t io_type;
+ uint32_t error_type;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_error_information_decoders,
+ SPDK_COUNTOF(rpc_error_information_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ io_type = rpc_error_bdev_io_type_parse(req.io_type);
+ if (io_type == ERROR_BDEV_IO_TYPE_INVALID) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL,
+ "Unexpected io_type value");
+ goto cleanup;
+ }
+
+ error_type = rpc_error_bdev_error_type_parse(req.error_type);
+ if (error_type == ERROR_BDEV_ERROR_TYPE_INVALID) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL,
+ "Unexpected error_type value");
+ goto cleanup;
+ }
+
+ rc = vbdev_error_inject_error(req.name, io_type, error_type, req.num);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_error_information(&req);
+}
+SPDK_RPC_REGISTER("bdev_error_inject_error", rpc_bdev_error_inject_error, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_error_inject_error, bdev_inject_error)
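+/* Example invocation (assuming the stock scripts/rpc.py helper; names and the
+ * option spelling are hypothetical):
+ *
+ * rpc.py bdev_error_inject_error EE_Malloc0 read failure --num 5
+ */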
diff --git a/src/spdk/module/bdev/ftl/Makefile b/src/spdk/module/bdev/ftl/Makefile
new file mode 100644
index 000000000..d0bfe1078
--- /dev/null
+++ b/src/spdk/module/bdev/ftl/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS += bdev_ftl.c bdev_ftl_rpc.c
+LIBNAME = bdev_ftl
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/ftl/bdev_ftl.c b/src/spdk/module/bdev/ftl/bdev_ftl.c
new file mode 100644
index 000000000..e959c8677
--- /dev/null
+++ b/src/spdk/module/bdev/ftl/bdev_ftl.c
@@ -0,0 +1,517 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/json.h"
+#include "spdk/string.h"
+#include "spdk/likely.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/ftl.h"
+#include "spdk_internal/log.h"
+
+#include "bdev_ftl.h"
+
+struct ftl_bdev {
+ struct spdk_bdev bdev;
+
+ struct spdk_ftl_dev *dev;
+
+ ftl_bdev_init_fn init_cb;
+
+ void *init_arg;
+};
+
+struct ftl_deferred_init {
+ struct ftl_bdev_init_opts opts;
+
+ LIST_ENTRY(ftl_deferred_init) entry;
+};
+
+static LIST_HEAD(, ftl_deferred_init) g_deferred_init = LIST_HEAD_INITIALIZER(g_deferred_init);
+
+static int bdev_ftl_initialize(void);
+static void bdev_ftl_finish(void);
+static void bdev_ftl_examine(struct spdk_bdev *bdev);
+
+static struct spdk_bdev_module g_ftl_if = {
+ .name = "ftl",
+ .module_init = bdev_ftl_initialize,
+ .module_fini = bdev_ftl_finish,
+ .examine_disk = bdev_ftl_examine,
+};
+
+SPDK_BDEV_MODULE_REGISTER(ftl, &g_ftl_if)
+
+static void
+bdev_ftl_free_cb(struct spdk_ftl_dev *dev, void *ctx, int status)
+{
+ struct ftl_bdev *ftl_bdev = ctx;
+
+ spdk_bdev_destruct_done(&ftl_bdev->bdev, status);
+ free(ftl_bdev->bdev.name);
+ free(ftl_bdev);
+}
+
+static int
+bdev_ftl_destruct(void *ctx)
+{
+ struct ftl_bdev *ftl_bdev = ctx;
+
+ spdk_ftl_dev_free(ftl_bdev->dev, bdev_ftl_free_cb, ftl_bdev);
+
+ /* return 1 to indicate that the destruction is asynchronous */
+ return 1;
+}
+
+static void
+bdev_ftl_cb(void *arg, int rc)
+{
+ struct spdk_bdev_io *bdev_io = arg;
+ enum spdk_bdev_io_status status;
+
+ switch (rc) {
+ case 0:
+ status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ break;
+ case -ENOMEM:
+ status = SPDK_BDEV_IO_STATUS_NOMEM;
+ break;
+ default:
+ status = SPDK_BDEV_IO_STATUS_FAILED;
+ break;
+ }
+
+ spdk_bdev_io_complete(bdev_io, status);
+}
+
+static void
+bdev_ftl_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ struct ftl_bdev *ftl_bdev;
+ int rc;
+
+ ftl_bdev = bdev_io->bdev->ctxt;
+
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ rc = spdk_ftl_read(ftl_bdev->dev,
+ ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, bdev_ftl_cb, bdev_io);
+
+ if (spdk_unlikely(rc != 0)) {
+ /* Map the errno to a proper spdk_bdev_io_status (NOMEM allows retry). */
+ bdev_ftl_cb(bdev_io, rc);
+ }
+}
+
+static int
+_bdev_ftl_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct ftl_bdev *ftl_bdev = (struct ftl_bdev *)bdev_io->bdev->ctxt;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, bdev_ftl_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ return 0;
+
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return spdk_ftl_write(ftl_bdev->dev, ch, bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, bdev_ftl_cb, bdev_io);
+
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ return spdk_ftl_flush(ftl_bdev->dev, bdev_ftl_cb, bdev_io);
+
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ default:
+ return -ENOTSUP;
+ }
+}
+
+static void
+bdev_ftl_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ int rc = _bdev_ftl_submit_request(ch, bdev_io);
+
+ if (spdk_unlikely(rc != 0)) {
+ /* Map the errno to a proper spdk_bdev_io_status (NOMEM allows retry). */
+ bdev_ftl_cb(bdev_io, rc);
+ }
+}
+
+static bool
+bdev_ftl_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ return true;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ default:
+ return false;
+ }
+}
+
+static struct spdk_io_channel *
+bdev_ftl_get_io_channel(void *ctx)
+{
+ struct ftl_bdev *ftl_bdev = ctx;
+
+ return spdk_get_io_channel(ftl_bdev->dev);
+}
+
+static void
+_bdev_ftl_write_config_info(struct ftl_bdev *ftl_bdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_ftl_attrs attrs = {};
+
+ spdk_ftl_dev_get_attrs(ftl_bdev->dev, &attrs);
+
+ spdk_json_write_named_string(w, "base_bdev", attrs.base_bdev);
+
+ if (attrs.cache_bdev) {
+ spdk_json_write_named_string(w, "cache", attrs.cache_bdev);
+ }
+}
+
+static void
+bdev_ftl_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct ftl_bdev *ftl_bdev = bdev->ctxt;
+ struct spdk_ftl_attrs attrs;
+ struct spdk_ftl_conf *conf = &attrs.conf;
+ char uuid[SPDK_UUID_STRING_LEN];
+
+ spdk_ftl_dev_get_attrs(ftl_bdev->dev, &attrs);
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_ftl_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", ftl_bdev->bdev.name);
+
+ spdk_json_write_named_bool(w, "allow_open_bands", conf->allow_open_bands);
+ spdk_json_write_named_uint64(w, "overprovisioning", conf->lba_rsvd);
+ spdk_json_write_named_uint64(w, "limit_crit", conf->limits[SPDK_FTL_LIMIT_CRIT].limit);
+ spdk_json_write_named_uint64(w, "limit_crit_threshold", conf->limits[SPDK_FTL_LIMIT_CRIT].thld);
+ spdk_json_write_named_uint64(w, "limit_high", conf->limits[SPDK_FTL_LIMIT_HIGH].limit);
+ spdk_json_write_named_uint64(w, "limit_high_threshold", conf->limits[SPDK_FTL_LIMIT_HIGH].thld);
+ spdk_json_write_named_uint64(w, "limit_low", conf->limits[SPDK_FTL_LIMIT_LOW].limit);
+ spdk_json_write_named_uint64(w, "limit_low_threshold", conf->limits[SPDK_FTL_LIMIT_LOW].thld);
+ spdk_json_write_named_uint64(w, "limit_start", conf->limits[SPDK_FTL_LIMIT_START].limit);
+ spdk_json_write_named_uint64(w, "limit_start_threshold", conf->limits[SPDK_FTL_LIMIT_START].thld);
+ if (conf->l2p_path) {
+ spdk_json_write_named_string(w, "l2p_path", conf->l2p_path);
+ }
+
+ spdk_uuid_fmt_lower(uuid, sizeof(uuid), &attrs.uuid);
+ spdk_json_write_named_string(w, "uuid", uuid);
+
+ _bdev_ftl_write_config_info(ftl_bdev, w);
+
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+}
+
+static int
+bdev_ftl_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct ftl_bdev *ftl_bdev = ctx;
+ struct spdk_ftl_attrs attrs;
+
+ spdk_ftl_dev_get_attrs(ftl_bdev->dev, &attrs);
+
+ spdk_json_write_named_object_begin(w, "ftl");
+
+ _bdev_ftl_write_config_info(ftl_bdev, w);
+ spdk_json_write_named_string_fmt(w, "num_zones", "%zu", attrs.num_zones);
+ spdk_json_write_named_string_fmt(w, "zone_size", "%zu", attrs.zone_size);
+
+ /* ftl */
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static const struct spdk_bdev_fn_table ftl_fn_table = {
+ .destruct = bdev_ftl_destruct,
+ .submit_request = bdev_ftl_submit_request,
+ .io_type_supported = bdev_ftl_io_type_supported,
+ .get_io_channel = bdev_ftl_get_io_channel,
+ .write_config_json = bdev_ftl_write_config_json,
+ .dump_info_json = bdev_ftl_dump_info_json,
+};
+
+static void
+bdev_ftl_create_cb(struct spdk_ftl_dev *dev, void *ctx, int status)
+{
+ struct ftl_bdev *ftl_bdev = ctx;
+ struct ftl_bdev_info info = {};
+ struct spdk_ftl_attrs attrs;
+ ftl_bdev_init_fn init_cb = ftl_bdev->init_cb;
+ void *init_arg = ftl_bdev->init_arg;
+ int rc = -ENODEV;
+
+ if (status) {
+ SPDK_ERRLOG("Failed to create FTL device (%d)\n", status);
+ rc = status;
+ goto error;
+ }
+
+ spdk_ftl_dev_get_attrs(dev, &attrs);
+
+ ftl_bdev->dev = dev;
+ ftl_bdev->bdev.product_name = "FTL disk";
+ ftl_bdev->bdev.write_cache = 0;
+ ftl_bdev->bdev.blocklen = attrs.block_size;
+ ftl_bdev->bdev.blockcnt = attrs.num_blocks;
+ ftl_bdev->bdev.uuid = attrs.uuid;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_FTL, "Creating bdev %s:\n", ftl_bdev->bdev.name);
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_FTL, "\tblock_len:\t%zu\n", attrs.block_size);
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_FTL, "\tnum_blocks:\t%"PRIu64"\n", attrs.num_blocks);
+
+ ftl_bdev->bdev.ctxt = ftl_bdev;
+ ftl_bdev->bdev.fn_table = &ftl_fn_table;
+ ftl_bdev->bdev.module = &g_ftl_if;
+
+ if (spdk_bdev_register(&ftl_bdev->bdev)) {
+ goto error;
+ }
+
+ info.name = ftl_bdev->bdev.name;
+ info.uuid = ftl_bdev->bdev.uuid;
+
+ init_cb(&info, init_arg, 0);
+ return;
+
+error:
+ free(ftl_bdev->bdev.name);
+ free(ftl_bdev);
+
+ init_cb(NULL, init_arg, rc);
+}
+
+static void
+bdev_ftl_defer_free(struct ftl_deferred_init *init)
+{
+ free((char *)init->opts.name);
+ free((char *)init->opts.base_bdev);
+ free((char *)init->opts.cache_bdev);
+ free(init);
+}
+
+static int
+bdev_ftl_defer_init(const struct ftl_bdev_init_opts *opts)
+{
+ struct ftl_deferred_init *init;
+
+ init = calloc(1, sizeof(*init));
+ if (!init) {
+ return -ENOMEM;
+ }
+
+ init->opts.mode = opts->mode;
+ init->opts.uuid = opts->uuid;
+ init->opts.ftl_conf = opts->ftl_conf;
+
+ init->opts.name = strdup(opts->name);
+ if (!init->opts.name) {
+ SPDK_ERRLOG("Could not allocate bdev name\n");
+ goto error;
+ }
+
+ init->opts.base_bdev = strdup(opts->base_bdev);
+ if (!init->opts.base_bdev) {
+ SPDK_ERRLOG("Could not allocate base bdev name\n");
+ goto error;
+ }
+
+ if (opts->cache_bdev) {
+ init->opts.cache_bdev = strdup(opts->cache_bdev);
+ if (!init->opts.cache_bdev) {
+ SPDK_ERRLOG("Could not allocate cache bdev name\n");
+ goto error;
+ }
+ }
+
+ LIST_INSERT_HEAD(&g_deferred_init, init, entry);
+
+ return 0;
+
+error:
+ bdev_ftl_defer_free(init);
+ return -ENOMEM;
+}
+
+int
+bdev_ftl_create_bdev(const struct ftl_bdev_init_opts *bdev_opts,
+ ftl_bdev_init_fn cb, void *cb_arg)
+{
+ struct ftl_bdev *ftl_bdev = NULL;
+ struct spdk_ftl_dev_init_opts opts = {};
+ int rc;
+
+ ftl_bdev = calloc(1, sizeof(*ftl_bdev));
+ if (!ftl_bdev) {
+ SPDK_ERRLOG("Could not allocate ftl_bdev\n");
+ return -ENOMEM;
+ }
+
+ ftl_bdev->bdev.name = strdup(bdev_opts->name);
+ if (!ftl_bdev->bdev.name) {
+ rc = -ENOMEM;
+ goto error_bdev;
+ }
+
+ if (spdk_bdev_get_by_name(bdev_opts->base_bdev) == NULL ||
+ (bdev_opts->cache_bdev && spdk_bdev_get_by_name(bdev_opts->cache_bdev) == NULL)) {
+ rc = bdev_ftl_defer_init(bdev_opts);
+ if (rc == 0) {
+ rc = -ENODEV;
+ }
+ goto error_name;
+ }
+
+ ftl_bdev->init_cb = cb;
+ ftl_bdev->init_arg = cb_arg;
+
+ opts.mode = bdev_opts->mode;
+ opts.uuid = bdev_opts->uuid;
+ opts.name = ftl_bdev->bdev.name;
+ opts.base_bdev = bdev_opts->base_bdev;
+ opts.cache_bdev = bdev_opts->cache_bdev;
+ opts.conf = &bdev_opts->ftl_conf;
+
+ /* TODO: set threads based on config */
+ opts.core_thread = spdk_get_thread();
+
+ rc = spdk_ftl_dev_init(&opts, bdev_ftl_create_cb, ftl_bdev);
+ if (rc) {
+ SPDK_ERRLOG("Could not create FTL device\n");
+ goto error_name;
+ }
+
+ return 0;
+
+error_name:
+ free(ftl_bdev->bdev.name);
+error_bdev:
+ free(ftl_bdev);
+ return rc;
+}
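+
+/* Example usage (a sketch with hypothetical names; init_done_cb is a caller
+ * supplied ftl_bdev_init_fn):
+ *
+ * struct ftl_bdev_init_opts opts = { .name = "Ftl0", .base_bdev = "Nvme0n1" };
+ * spdk_ftl_conf_init_defaults(&opts.ftl_conf);
+ * bdev_ftl_create_bdev(&opts, init_done_cb, NULL);
+ */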
+
+static int
+bdev_ftl_initialize(void)
+{
+ return 0;
+}
+
+void
+bdev_ftl_delete_bdev(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
+{
+ struct spdk_bdev *bdev;
+
+ bdev = spdk_bdev_get_by_name(name);
+ if (bdev) {
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+ return;
+ }
+
+ cb_fn(cb_arg, -ENODEV);
+}
+
+static void
+bdev_ftl_finish(void)
+{
+}
+
+static void
+bdev_ftl_create_deferred_cb(const struct ftl_bdev_info *info, void *ctx, int status)
+{
+ struct ftl_deferred_init *opts = ctx;
+
+ if (status) {
+ SPDK_ERRLOG("Failed to initialize FTL bdev '%s'\n", opts->opts.name);
+ }
+
+ bdev_ftl_defer_free(opts);
+
+ spdk_bdev_module_examine_done(&g_ftl_if);
+}
+
+static void
+bdev_ftl_examine(struct spdk_bdev *bdev)
+{
+ struct ftl_deferred_init *opts;
+
+ LIST_FOREACH(opts, &g_deferred_init, entry) {
+ if (spdk_bdev_get_by_name(opts->opts.base_bdev) == NULL) {
+ continue;
+ }
+
+ if (opts->opts.cache_bdev && spdk_bdev_get_by_name(opts->opts.cache_bdev) == NULL) {
+ continue;
+ }
+
+ LIST_REMOVE(opts, entry);
+
+ /* spdk_bdev_module_examine_done will be called by bdev_ftl_create_deferred_cb */
+ if (bdev_ftl_create_bdev(&opts->opts, bdev_ftl_create_deferred_cb, opts)) {
+ SPDK_ERRLOG("Failed to initialize FTL bdev '%s'\n", opts->opts.name);
+ bdev_ftl_defer_free(opts);
+ break;
+ }
+ return;
+ }
+
+ spdk_bdev_module_examine_done(&g_ftl_if);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_ftl", SPDK_LOG_BDEV_FTL)
diff --git a/src/spdk/module/bdev/ftl/bdev_ftl.h b/src/spdk/module/bdev/ftl/bdev_ftl.h
new file mode 100644
index 000000000..019a3b8f3
--- /dev/null
+++ b/src/spdk/module/bdev/ftl/bdev_ftl.h
@@ -0,0 +1,70 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_FTL_H
+#define SPDK_BDEV_FTL_H
+
+#include "spdk/stdinc.h"
+#include "spdk/bdev_module.h"
+#include "spdk/ftl.h"
+
+struct spdk_bdev;
+struct spdk_uuid;
+
+struct ftl_bdev_info {
+ const char *name;
+ struct spdk_uuid uuid;
+};
+
+struct ftl_bdev_init_opts {
+ /* Bdev's name */
+ const char *name;
+ /* Base bdev's name */
+ const char *base_bdev;
+ /* Write buffer bdev's name */
+ const char *cache_bdev;
+ /* Bdev's mode */
+ uint32_t mode;
+ /* UUID if device is restored from SSD */
+ struct spdk_uuid uuid;
+ /* FTL library configuration */
+ struct spdk_ftl_conf ftl_conf;
+};
+
+typedef void (*ftl_bdev_init_fn)(const struct ftl_bdev_info *, void *, int);
+
+int bdev_ftl_create_bdev(const struct ftl_bdev_init_opts *bdev_opts,
+ ftl_bdev_init_fn cb, void *cb_arg);
+void bdev_ftl_delete_bdev(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg);
+
+#endif /* SPDK_BDEV_FTL_H */
diff --git a/src/spdk/module/bdev/ftl/bdev_ftl_rpc.c b/src/spdk/module/bdev/ftl/bdev_ftl_rpc.c
new file mode 100644
index 000000000..045619342
--- /dev/null
+++ b/src/spdk/module/bdev/ftl/bdev_ftl_rpc.c
@@ -0,0 +1,258 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/bdev_module.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+#include "bdev_ftl.h"
+
+struct rpc_bdev_ftl_create {
+ char *name;
+ char *base_bdev;
+ char *uuid;
+ char *cache_bdev;
+ struct spdk_ftl_conf ftl_conf;
+};
+
+static void
+free_rpc_bdev_ftl_create(struct rpc_bdev_ftl_create *req)
+{
+ free(req->name);
+ free(req->base_bdev);
+ free(req->uuid);
+ free(req->cache_bdev);
+ free((char *)req->ftl_conf.l2p_path);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_ftl_create_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_ftl_create, name), spdk_json_decode_string},
+ {"base_bdev", offsetof(struct rpc_bdev_ftl_create, base_bdev), spdk_json_decode_string},
+ {"uuid", offsetof(struct rpc_bdev_ftl_create, uuid), spdk_json_decode_string, true},
+ {"cache", offsetof(struct rpc_bdev_ftl_create, cache_bdev), spdk_json_decode_string, true},
+ {
+ "allow_open_bands", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, allow_open_bands), spdk_json_decode_bool, true
+ },
+ {
+ "overprovisioning", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, lba_rsvd), spdk_json_decode_uint64, true
+ },
+ {
+ "use_append", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, use_append), spdk_json_decode_bool, true
+ },
+ {
+ "l2p_path", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, l2p_path),
+ spdk_json_decode_string, true
+ },
+ {
+ "limit_crit", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_CRIT]) +
+ offsetof(struct spdk_ftl_limit, limit),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "limit_crit_threshold", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_CRIT]) +
+ offsetof(struct spdk_ftl_limit, thld),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "limit_high", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_HIGH]) +
+ offsetof(struct spdk_ftl_limit, limit),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "limit_high_threshold", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_HIGH]) +
+ offsetof(struct spdk_ftl_limit, thld),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "limit_low", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_LOW]) +
+ offsetof(struct spdk_ftl_limit, limit),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "limit_low_threshold", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_LOW]) +
+ offsetof(struct spdk_ftl_limit, thld),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "limit_start", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_START]) +
+ offsetof(struct spdk_ftl_limit, limit),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "limit_start_threshold", offsetof(struct rpc_bdev_ftl_create, ftl_conf) +
+ offsetof(struct spdk_ftl_conf, limits[SPDK_FTL_LIMIT_START]) +
+ offsetof(struct spdk_ftl_limit, thld),
+ spdk_json_decode_uint64, true
+ },
+};
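+
+/*
+ * Illustrative request (all values are placeholders): the decoders above
+ * accept a JSON-RPC call such as
+ *
+ *	{
+ *	  "jsonrpc": "2.0",
+ *	  "method": "bdev_ftl_create",
+ *	  "id": 1,
+ *	  "params": {
+ *	    "name": "ftl0",
+ *	    "base_bdev": "Nvme0n1",
+ *	    "cache": "Nvme1n1",
+ *	    "allow_open_bands": true,
+ *	    "overprovisioning": 20
+ *	  }
+ *	}
+ *
+ * Only "name" and "base_bdev" are mandatory; every other key is optional.
+ */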
+
+static void
+rpc_bdev_ftl_create_cb(const struct ftl_bdev_info *bdev_info, void *ctx, int status)
+{
+ struct spdk_jsonrpc_request *request = ctx;
+ char bdev_uuid[SPDK_UUID_STRING_LEN];
+ struct spdk_json_write_ctx *w;
+
+ if (status) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Failed to create FTL bdev: %s",
+ spdk_strerror(-status));
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_uuid_fmt_lower(bdev_uuid, sizeof(bdev_uuid), &bdev_info->uuid);
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", bdev_info->name);
+ spdk_json_write_named_string(w, "uuid", bdev_uuid);
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
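+
+/*
+ * On success the callback above produces a JSON result of the form (values
+ * are placeholders):
+ *
+ *	{"name": "ftl0", "uuid": "f81d4fae-7dec-11d0-a765-00a0c91e6bf6"}
+ */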
+
+static void
+rpc_bdev_ftl_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_ftl_create req = {};
+ struct ftl_bdev_init_opts opts = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ spdk_ftl_conf_init_defaults(&req.ftl_conf);
+
+ if (spdk_json_decode_object(params, rpc_bdev_ftl_create_decoders,
+ SPDK_COUNTOF(rpc_bdev_ftl_create_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto invalid;
+ }
+
+ if (req.cache_bdev && !spdk_bdev_get_by_name(req.cache_bdev)) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "No such bdev: %s", req.cache_bdev);
+ goto invalid;
+ }
+
+ opts.name = req.name;
+ opts.mode = SPDK_FTL_MODE_CREATE;
+ opts.base_bdev = req.base_bdev;
+ opts.cache_bdev = req.cache_bdev;
+ opts.ftl_conf = req.ftl_conf;
+
+ if (req.uuid) {
+ if (spdk_uuid_parse(&opts.uuid, req.uuid) < 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Failed to parse uuid: %s",
+ req.uuid);
+ goto invalid;
+ }
+
+ if (!spdk_mem_all_zero(&opts.uuid, sizeof(opts.uuid))) {
+ opts.mode &= ~SPDK_FTL_MODE_CREATE;
+ }
+ }
+
+ rc = bdev_ftl_create_bdev(&opts, rpc_bdev_ftl_create_cb, request);
+ if (rc) {
+ if (rc == -ENODEV) {
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string_fmt(w, "FTL bdev: %s creation deferred", req.name);
+ spdk_jsonrpc_end_result(request, w);
+ } else {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Failed to create FTL bdev: %s",
+ spdk_strerror(-rc));
+ }
+ goto invalid;
+ }
+
+invalid:
+ free_rpc_bdev_ftl_create(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_ftl_create", rpc_bdev_ftl_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ftl_create, construct_ftl_bdev)
+
+struct rpc_delete_ftl {
+ char *name;
+};
+
+static const struct spdk_json_object_decoder rpc_delete_ftl_decoders[] = {
+ {"name", offsetof(struct rpc_delete_ftl, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_ftl_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_ftl_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_ftl attrs = {};
+
+ if (spdk_json_decode_object(params, rpc_delete_ftl_decoders,
+ SPDK_COUNTOF(rpc_delete_ftl_decoders),
+ &attrs)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto invalid;
+ }
+
+ bdev_ftl_delete_bdev(attrs.name, rpc_bdev_ftl_delete_cb, request);
+invalid:
+ free(attrs.name);
+}
+
+SPDK_RPC_REGISTER("bdev_ftl_delete", rpc_bdev_ftl_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ftl_delete, delete_ftl_bdev)
diff --git a/src/spdk/module/bdev/gpt/Makefile b/src/spdk/module/bdev/gpt/Makefile
new file mode 100644
index 000000000..db27dbc38
--- /dev/null
+++ b/src/spdk/module/bdev/gpt/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = gpt.c vbdev_gpt.c
+LIBNAME = bdev_gpt
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/gpt/gpt.c b/src/spdk/module/bdev/gpt/gpt.c
new file mode 100644
index 000000000..d31168b0b
--- /dev/null
+++ b/src/spdk/module/bdev/gpt/gpt.c
@@ -0,0 +1,320 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "gpt.h"
+
+#include "spdk/crc32.h"
+#include "spdk/endian.h"
+#include "spdk/event.h"
+
+#include "spdk_internal/log.h"
+
+#define GPT_PRIMARY_PARTITION_TABLE_LBA 0x1
+#define PRIMARY_PARTITION_NUMBER 4
+#define GPT_PROTECTIVE_MBR 1
+#define SPDK_MAX_NUM_PARTITION_ENTRIES 128
+
+static uint64_t
+gpt_get_expected_head_lba(struct spdk_gpt *gpt)
+{
+ switch (gpt->parse_phase) {
+ case SPDK_GPT_PARSE_PHASE_PRIMARY:
+ return GPT_PRIMARY_PARTITION_TABLE_LBA;
+ case SPDK_GPT_PARSE_PHASE_SECONDARY:
+ return gpt->lba_end;
+ default:
+ assert(false);
+ }
+ return 0;
+}
+
+static struct spdk_gpt_header *
+gpt_get_header_buf(struct spdk_gpt *gpt)
+{
+ switch (gpt->parse_phase) {
+ case SPDK_GPT_PARSE_PHASE_PRIMARY:
+ return (struct spdk_gpt_header *)
+ (gpt->buf + GPT_PRIMARY_PARTITION_TABLE_LBA * gpt->sector_size);
+ case SPDK_GPT_PARSE_PHASE_SECONDARY:
+ return (struct spdk_gpt_header *)
+ (gpt->buf + (gpt->buf_size - gpt->sector_size));
+ default:
+ assert(false);
+ }
+ return NULL;
+}
+
+static struct spdk_gpt_partition_entry *
+gpt_get_partitions_buf(struct spdk_gpt *gpt, uint64_t total_partition_size,
+ uint64_t partition_start_lba)
+{
+ uint64_t secondary_total_size;
+
+ switch (gpt->parse_phase) {
+ case SPDK_GPT_PARSE_PHASE_PRIMARY:
+ if ((total_partition_size + partition_start_lba * gpt->sector_size) >
+ gpt->buf_size) {
+ SPDK_ERRLOG("Buffer size is not sufficient\n");
+ return NULL;
+ }
+ return (struct spdk_gpt_partition_entry *)
+ (gpt->buf + partition_start_lba * gpt->sector_size);
+ case SPDK_GPT_PARSE_PHASE_SECONDARY:
+ secondary_total_size = (gpt->lba_end - partition_start_lba + 1) * gpt->sector_size;
+ if (secondary_total_size > gpt->buf_size) {
+ SPDK_ERRLOG("Buffer size is not sufficient\n");
+ return NULL;
+ }
+ return (struct spdk_gpt_partition_entry *)
+ (gpt->buf + (gpt->buf_size - secondary_total_size));
+ default:
+ assert(false);
+ }
+ return NULL;
+}
+
+static int
+gpt_read_partitions(struct spdk_gpt *gpt)
+{
+ uint32_t total_partition_size, num_partition_entries, partition_entry_size;
+ uint64_t partition_start_lba;
+ struct spdk_gpt_header *head = gpt->header;
+ uint32_t crc32;
+
+ num_partition_entries = from_le32(&head->num_partition_entries);
+ if (num_partition_entries > SPDK_MAX_NUM_PARTITION_ENTRIES) {
+ SPDK_ERRLOG("Num_partition_entries=%u which exceeds max=%u\n",
+ num_partition_entries, SPDK_MAX_NUM_PARTITION_ENTRIES);
+ return -1;
+ }
+
+ partition_entry_size = from_le32(&head->size_of_partition_entry);
+ if (partition_entry_size != sizeof(struct spdk_gpt_partition_entry)) {
+ SPDK_ERRLOG("Partition_entry_size(%x) != expected(%zx)\n",
+ partition_entry_size, sizeof(struct spdk_gpt_partition_entry));
+ return -1;
+ }
+
+ total_partition_size = num_partition_entries * partition_entry_size;
+ partition_start_lba = from_le64(&head->partition_entry_lba);
+ gpt->partitions = gpt_get_partitions_buf(gpt, total_partition_size,
+ partition_start_lba);
+ if (!gpt->partitions) {
+ SPDK_ERRLOG("Failed to get gpt partitions buf\n");
+ return -1;
+ }
+
+ crc32 = spdk_crc32_ieee_update(gpt->partitions, total_partition_size, ~0);
+ crc32 ^= ~0;
+
+ if (crc32 != from_le32(&head->partition_entry_array_crc32)) {
+ SPDK_ERRLOG("GPT partition entry array crc32 did not match\n");
+ return -1;
+ }
+
+ return 0;
+}
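+
+/*
+ * Note: seeding spdk_crc32_ieee_update() with ~0 and XOR-ing the result with
+ * ~0, as done above, yields the standard (zlib-style) CRC-32. For example,
+ * the ASCII bytes of "123456789" produce the well-known check value
+ * 0xcbf43926.
+ */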
+
+static int
+gpt_lba_range_check(struct spdk_gpt_header *head, uint64_t lba_end)
+{
+ uint64_t usable_lba_start, usable_lba_end;
+
+ usable_lba_start = from_le64(&head->first_usable_lba);
+ usable_lba_end = from_le64(&head->last_usable_lba);
+
+ if (usable_lba_end < usable_lba_start) {
+ SPDK_ERRLOG("Head's usable_lba_end(%" PRIu64 ") < usable_lba_start(%" PRIu64 ")\n",
+ usable_lba_end, usable_lba_start);
+ return -1;
+ }
+
+ if (usable_lba_end > lba_end) {
+ SPDK_ERRLOG("Head's usable_lba_end(%" PRIu64 ") > lba_end(%" PRIu64 ")\n",
+ usable_lba_end, lba_end);
+ return -1;
+ }
+
+ if ((usable_lba_start < GPT_PRIMARY_PARTITION_TABLE_LBA) &&
+ (GPT_PRIMARY_PARTITION_TABLE_LBA < usable_lba_end)) {
+ SPDK_ERRLOG("Head lba is inside the usable range\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+gpt_read_header(struct spdk_gpt *gpt)
+{
+ uint32_t head_size;
+ uint32_t new_crc, original_crc;
+ uint64_t my_lba, head_lba;
+ struct spdk_gpt_header *head;
+
+ head = gpt_get_header_buf(gpt);
+ if (!head) {
+ SPDK_ERRLOG("Failed to get gpt header buf\n");
+ return -1;
+ }
+
+ head_size = from_le32(&head->header_size);
+ if (head_size < sizeof(*head) || head_size > gpt->sector_size) {
+ SPDK_ERRLOG("head_size=%u\n", head_size);
+ return -1;
+ }
+
+ original_crc = from_le32(&head->header_crc32);
+ head->header_crc32 = 0;
+ new_crc = spdk_crc32_ieee_update(head, from_le32(&head->header_size), ~0);
+ new_crc ^= ~0;
+ /* restore header crc32 */
+ to_le32(&head->header_crc32, original_crc);
+
+ if (new_crc != original_crc) {
+ SPDK_ERRLOG("head crc32 does not match, provided=%u, calculated=%u\n",
+ original_crc, new_crc);
+ return -1;
+ }
+
+ if (memcmp(SPDK_GPT_SIGNATURE, head->gpt_signature,
+ sizeof(head->gpt_signature))) {
+ SPDK_ERRLOG("signature did not match\n");
+ return -1;
+ }
+
+ head_lba = gpt_get_expected_head_lba(gpt);
+ my_lba = from_le64(&head->my_lba);
+ if (my_lba != head_lba) {
+ SPDK_ERRLOG("head my_lba(%" PRIu64 ") != expected(%" PRIu64 ")\n",
+ my_lba, head_lba);
+ return -1;
+ }
+
+ if (gpt_lba_range_check(head, gpt->lba_end)) {
+ SPDK_ERRLOG("lba range check error\n");
+ return -1;
+ }
+
+ gpt->header = head;
+ return 0;
+}
+
+static int
+gpt_check_mbr(struct spdk_gpt *gpt)
+{
+ int i, primary_partition = 0;
+ uint32_t total_lba_size = 0, ret = 0, expected_start_lba;
+ struct spdk_mbr *mbr;
+
+ mbr = (struct spdk_mbr *)gpt->buf;
+ if (from_le16(&mbr->mbr_signature) != SPDK_MBR_SIGNATURE) {
+ SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Signature mismatch, provided=%x, "
+ "expected=%x\n", from_le16(&mbr->mbr_signature),
+ SPDK_MBR_SIGNATURE);
+ return -1;
+ }
+
+ for (i = 0; i < PRIMARY_PARTITION_NUMBER; i++) {
+ if (mbr->partitions[i].os_type == SPDK_MBR_OS_TYPE_GPT_PROTECTIVE) {
+ primary_partition = i;
+ ret = GPT_PROTECTIVE_MBR;
+ break;
+ }
+ }
+
+ if (ret == GPT_PROTECTIVE_MBR) {
+ expected_start_lba = GPT_PRIMARY_PARTITION_TABLE_LBA;
+ if (from_le32(&mbr->partitions[primary_partition].start_lba) != expected_start_lba) {
+ SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "start lba mismatch, provided=%u, expected=%u\n",
+ from_le32(&mbr->partitions[primary_partition].start_lba),
+ expected_start_lba);
+ return -1;
+ }
+
+ total_lba_size = from_le32(&mbr->partitions[primary_partition].size_lba);
+ if ((total_lba_size != ((uint32_t) gpt->total_sectors - 1)) &&
+ (total_lba_size != 0xFFFFFFFF)) {
+ SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE,
+ "Protective MBR size mismatch (record_size %u != actual_size %u)\n",
+ total_lba_size, (uint32_t) gpt->total_sectors - 1);
+ return -1;
+ }
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Only the GPT protective MBR format is currently supported\n");
+ return -1;
+ }
+
+ return 0;
+}
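+
+/*
+ * For reference, a protective MBR as accepted above carries a single
+ * partition record with os_type SPDK_MBR_OS_TYPE_GPT_PROTECTIVE (0xee),
+ * start_lba 1, and size_lba covering the remainder of the disk
+ * (total_sectors - 1, or 0xFFFFFFFF when the disk is too large for a
+ * 32-bit sector count).
+ */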
+
+int
+gpt_parse_mbr(struct spdk_gpt *gpt)
+{
+ int rc;
+
+ if (!gpt || !gpt->buf) {
+ SPDK_ERRLOG("The GPT context and its buffer must not be NULL\n");
+ return -1;
+ }
+
+ rc = gpt_check_mbr(gpt);
+ if (rc) {
+ SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Failed to detect gpt in MBR\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+int
+gpt_parse_partition_table(struct spdk_gpt *gpt)
+{
+ int rc;
+
+ rc = gpt_read_header(gpt);
+ if (rc) {
+ SPDK_ERRLOG("Failed to read gpt header\n");
+ return rc;
+ }
+
+ rc = gpt_read_partitions(gpt);
+ if (rc) {
+ SPDK_ERRLOG("Failed to read gpt partitions\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("gpt_parse", SPDK_LOG_GPT_PARSE)
diff --git a/src/spdk/module/bdev/gpt/gpt.h b/src/spdk/module/bdev/gpt/gpt.h
new file mode 100644
index 000000000..9fa870843
--- /dev/null
+++ b/src/spdk/module/bdev/gpt/gpt.h
@@ -0,0 +1,70 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * GPT internal Interface
+ */
+
+#ifndef SPDK_INTERNAL_GPT_H
+#define SPDK_INTERNAL_GPT_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/gpt_spec.h"
+
+#define SPDK_GPT_PART_TYPE_GUID SPDK_GPT_GUID(0x7c5222bd, 0x8f5d, 0x4087, 0x9c00, 0xbf9843c7b58c)
+#define SPDK_GPT_BUFFER_SIZE 32768 /* 32KB */
+#define SPDK_GPT_GUID_EQUAL(x,y) (memcmp(x, y, sizeof(struct spdk_gpt_guid)) == 0)
+
+enum spdk_gpt_parse_phase {
+ SPDK_GPT_PARSE_PHASE_INVALID = 0,
+ SPDK_GPT_PARSE_PHASE_PRIMARY,
+ SPDK_GPT_PARSE_PHASE_SECONDARY,
+};
+
+struct spdk_gpt {
+ uint8_t parse_phase;
+ unsigned char *buf;
+ uint64_t buf_size;
+ uint64_t lba_start;
+ uint64_t lba_end;
+ uint64_t total_sectors;
+ uint32_t sector_size;
+ struct spdk_gpt_header *header;
+ struct spdk_gpt_partition_entry *partitions;
+};
+
+int gpt_parse_mbr(struct spdk_gpt *gpt);
+int gpt_parse_partition_table(struct spdk_gpt *gpt);
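+
+/*
+ * Illustrative usage sketch (buf and num_sectors are hypothetical; see
+ * module/bdev/gpt/vbdev_gpt.c for the real caller):
+ *
+ *	struct spdk_gpt gpt = {};
+ *
+ *	gpt.buf = buf;			(first SPDK_GPT_BUFFER_SIZE bytes of the disk)
+ *	gpt.buf_size = SPDK_GPT_BUFFER_SIZE;
+ *	gpt.sector_size = 512;		(or the bdev's real block size)
+ *	gpt.total_sectors = num_sectors;
+ *	gpt.lba_start = 0;
+ *	gpt.lba_end = num_sectors - 1;
+ *	gpt.parse_phase = SPDK_GPT_PARSE_PHASE_PRIMARY;
+ *
+ *	if (gpt_parse_mbr(&gpt) == 0 && gpt_parse_partition_table(&gpt) == 0) {
+ *		... gpt.header and gpt.partitions now point into buf ...
+ *	}
+ */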
+
+#endif /* SPDK_INTERNAL_GPT_H */
diff --git a/src/spdk/module/bdev/gpt/vbdev_gpt.c b/src/spdk/module/bdev/gpt/vbdev_gpt.c
new file mode 100644
index 000000000..5232444fb
--- /dev/null
+++ b/src/spdk/module/bdev/gpt/vbdev_gpt.c
@@ -0,0 +1,565 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This driver reads a GPT partition table from a bdev and exposes a virtual block device for
+ * each partition.
+ */
+
+#include "gpt.h"
+
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/rpc.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+static int vbdev_gpt_init(void);
+static void vbdev_gpt_examine(struct spdk_bdev *bdev);
+static int vbdev_gpt_get_ctx_size(void);
+
+static struct spdk_bdev_module gpt_if = {
+ .name = "gpt",
+ .module_init = vbdev_gpt_init,
+ .get_ctx_size = vbdev_gpt_get_ctx_size,
+ .examine_disk = vbdev_gpt_examine,
+};
+SPDK_BDEV_MODULE_REGISTER(gpt, &gpt_if)
+
+/* Base block device gpt context */
+struct gpt_base {
+ struct spdk_gpt gpt;
+ struct spdk_bdev_part_base *part_base;
+ SPDK_BDEV_PART_TAILQ parts;
+
+ /* This channel is only used for reading the partition table. */
+ struct spdk_io_channel *ch;
+};
+
+/* Context for each gpt virtual bdev */
+struct gpt_disk {
+ struct spdk_bdev_part part;
+ uint32_t partition_index;
+};
+
+struct gpt_channel {
+ struct spdk_bdev_part_channel part_ch;
+};
+
+struct gpt_io {
+ struct spdk_io_channel *ch;
+ struct spdk_bdev_io *bdev_io;
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+};
+
+static bool g_gpt_disabled;
+
+static void
+gpt_base_free(void *ctx)
+{
+ struct gpt_base *gpt_base = ctx;
+
+ spdk_free(gpt_base->gpt.buf);
+ free(gpt_base);
+}
+
+static void
+gpt_base_bdev_hotremove_cb(void *_part_base)
+{
+ struct spdk_bdev_part_base *part_base = _part_base;
+ struct gpt_base *gpt_base = spdk_bdev_part_base_get_ctx(part_base);
+
+ spdk_bdev_part_base_hotremove(part_base, &gpt_base->parts);
+}
+
+static int vbdev_gpt_destruct(void *ctx);
+static void vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io);
+static int vbdev_gpt_dump_info_json(void *ctx, struct spdk_json_write_ctx *w);
+
+static struct spdk_bdev_fn_table vbdev_gpt_fn_table = {
+ .destruct = vbdev_gpt_destruct,
+ .submit_request = vbdev_gpt_submit_request,
+ .dump_info_json = vbdev_gpt_dump_info_json,
+};
+
+static struct gpt_base *
+gpt_base_bdev_init(struct spdk_bdev *bdev)
+{
+ struct gpt_base *gpt_base;
+ struct spdk_gpt *gpt;
+
+ gpt_base = calloc(1, sizeof(*gpt_base));
+ if (!gpt_base) {
+ SPDK_ERRLOG("Cannot alloc memory for gpt_base pointer\n");
+ return NULL;
+ }
+
+ TAILQ_INIT(&gpt_base->parts);
+ gpt_base->part_base = spdk_bdev_part_base_construct(bdev,
+ gpt_base_bdev_hotremove_cb,
+ &gpt_if, &vbdev_gpt_fn_table,
+ &gpt_base->parts, gpt_base_free, gpt_base,
+ sizeof(struct gpt_channel), NULL, NULL);
+ if (!gpt_base->part_base) {
+ free(gpt_base);
+ SPDK_ERRLOG("Cannot construct gpt_base\n");
+ return NULL;
+ }
+
+ gpt = &gpt_base->gpt;
+ gpt->parse_phase = SPDK_GPT_PARSE_PHASE_PRIMARY;
+ gpt->buf_size = spdk_max(SPDK_GPT_BUFFER_SIZE, bdev->blocklen);
+ gpt->buf = spdk_zmalloc(gpt->buf_size, spdk_bdev_get_buf_align(bdev), NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!gpt->buf) {
+ SPDK_ERRLOG("Cannot alloc buf\n");
+ spdk_bdev_part_base_free(gpt_base->part_base);
+ return NULL;
+ }
+
+ gpt->sector_size = bdev->blocklen;
+ gpt->total_sectors = bdev->blockcnt;
+ gpt->lba_start = 0;
+ gpt->lba_end = gpt->total_sectors - 1;
+
+ return gpt_base;
+}
+
+static int
+vbdev_gpt_destruct(void *ctx)
+{
+ struct gpt_disk *gpt_disk = ctx;
+
+ return spdk_bdev_part_free(&gpt_disk->part);
+}
+
+static void
+_vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io);
+
+static void
+vbdev_gpt_resubmit_request(void *arg)
+{
+ struct gpt_io *io = (struct gpt_io *)arg;
+
+ _vbdev_gpt_submit_request(io->ch, io->bdev_io);
+}
+
+static void
+vbdev_gpt_queue_io(struct gpt_io *io)
+{
+ struct gpt_channel *ch = spdk_io_channel_get_ctx(io->ch);
+ int rc;
+
+ io->bdev_io_wait.bdev = io->bdev_io->bdev;
+ io->bdev_io_wait.cb_fn = vbdev_gpt_resubmit_request;
+ io->bdev_io_wait.cb_arg = io;
+
+ rc = spdk_bdev_queue_io_wait(io->bdev_io->bdev,
+ ch->part_ch.base_ch, &io->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vbdev_gpt_queue_io, rc=%d.\n", rc);
+ spdk_bdev_io_complete(io->bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+vbdev_gpt_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ _vbdev_gpt_submit_request(ch, bdev_io);
+}
+
+static void
+_vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
+{
+ struct gpt_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct gpt_io *io = (struct gpt_io *)bdev_io->driver_ctx;
+ int rc;
+
+ rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "gpt: no memory, queue io\n");
+ io->ch = _ch;
+ io->bdev_io = bdev_io;
+ vbdev_gpt_queue_io(io);
+ } else {
+ SPDK_ERRLOG("gpt: error on bdev_io submission, rc=%d.\n", rc);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+static void
+vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, vbdev_gpt_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ default:
+ _vbdev_gpt_submit_request(_ch, bdev_io);
+ break;
+ }
+}
+
+static void
+write_guid(struct spdk_json_write_ctx *w, const struct spdk_gpt_guid *guid)
+{
+ spdk_json_write_string_fmt(w, "%08x-%04x-%04x-%04x-%04x%08x",
+ from_le32(&guid->raw[0]),
+ from_le16(&guid->raw[4]),
+ from_le16(&guid->raw[6]),
+ from_be16(&guid->raw[8]),
+ from_be16(&guid->raw[10]),
+ from_be32(&guid->raw[12]));
+}
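+
+/*
+ * GPT GUIDs are mixed-endian: the first three fields are stored
+ * little-endian and the last two big-endian, hence the from_le/from_be mix
+ * above. For example, the raw byte sequence
+ *
+ *	bd 22 52 7c 5d 8f 87 40 9c 00 bf 98 43 c7 b5 8c
+ *
+ * renders as "7c5222bd-8f5d-4087-9c00-bf9843c7b58c", the SPDK partition
+ * type GUID defined in gpt.h.
+ */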
+
+static void
+write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *str, size_t max_len)
+{
+ size_t len;
+ const uint16_t *p;
+
+ for (len = 0, p = str; len < max_len && *p; p++) {
+ len++;
+ }
+
+ spdk_json_write_string_utf16le_raw(w, str, len);
+}
+
+static int
+vbdev_gpt_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct gpt_disk *gpt_disk = SPDK_CONTAINEROF(ctx, struct gpt_disk, part);
+ struct spdk_bdev_part_base *base_bdev = spdk_bdev_part_get_base(&gpt_disk->part);
+ struct gpt_base *gpt_base = spdk_bdev_part_base_get_ctx(base_bdev);
+ struct spdk_bdev *part_base_bdev = spdk_bdev_part_base_get_bdev(base_bdev);
+ struct spdk_gpt *gpt = &gpt_base->gpt;
+ struct spdk_gpt_partition_entry *gpt_entry = &gpt->partitions[gpt_disk->partition_index];
+ uint64_t offset_blocks = spdk_bdev_part_get_offset_blocks(&gpt_disk->part);
+
+ spdk_json_write_named_object_begin(w, "gpt");
+
+ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(part_base_bdev));
+
+ spdk_json_write_named_uint64(w, "offset_blocks", offset_blocks);
+
+ spdk_json_write_name(w, "partition_type_guid");
+ write_guid(w, &gpt_entry->part_type_guid);
+
+ spdk_json_write_name(w, "unique_partition_guid");
+ write_guid(w, &gpt_entry->unique_partition_guid);
+
+ spdk_json_write_name(w, "partition_name");
+ write_string_utf16le(w, gpt_entry->partition_name, SPDK_COUNTOF(gpt_entry->partition_name));
+
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static int
+vbdev_gpt_create_bdevs(struct gpt_base *gpt_base)
+{
+ uint32_t num_partition_entries;
+ uint64_t i, head_lba_start, head_lba_end;
+ uint32_t num_partitions;
+ struct spdk_gpt_partition_entry *p;
+ struct gpt_disk *d;
+ struct spdk_gpt *gpt;
+ char *name;
+ struct spdk_bdev *base_bdev;
+ int rc;
+
+ gpt = &gpt_base->gpt;
+ num_partition_entries = from_le32(&gpt->header->num_partition_entries);
+ head_lba_start = from_le64(&gpt->header->first_usable_lba);
+ head_lba_end = from_le64(&gpt->header->last_usable_lba);
+ num_partitions = 0;
+
+ for (i = 0; i < num_partition_entries; i++) {
+ p = &gpt->partitions[i];
+ uint64_t lba_start = from_le64(&p->starting_lba);
+ uint64_t lba_end = from_le64(&p->ending_lba);
+
+ if (!SPDK_GPT_GUID_EQUAL(&gpt->partitions[i].part_type_guid,
+ &SPDK_GPT_PART_TYPE_GUID) ||
+ lba_start == 0) {
+ continue;
+ }
+ if (lba_start < head_lba_start || lba_end > head_lba_end) {
+ continue;
+ }
+
+ d = calloc(1, sizeof(*d));
+ if (!d) {
+ SPDK_ERRLOG("Memory allocation failure\n");
+ return -1;
+ }
+
+ /* indices start at 1 instead of 0 to match the existing style */
+ base_bdev = spdk_bdev_part_base_get_bdev(gpt_base->part_base);
+ name = spdk_sprintf_alloc("%sp%" PRIu64, spdk_bdev_get_name(base_bdev), i + 1);
+ if (!name) {
+ SPDK_ERRLOG("name allocation failure\n");
+ free(d);
+ return -1;
+ }
+
+ rc = spdk_bdev_part_construct(&d->part, gpt_base->part_base, name,
+ lba_start, lba_end - lba_start, "GPT Disk");
+ free(name);
+ if (rc) {
+ SPDK_ERRLOG("could not construct bdev part\n");
+ /* the part was not registered; only the gpt_disk context needs to be freed */
+ free(d);
+ return -1;
+ }
+ num_partitions++;
+ d->partition_index = i;
+ }
+
+ return num_partitions;
+}
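+
+/*
+ * Example: with a base bdev named "Nvme0n1" (placeholder), the loop above
+ * exposes GPT entry 0 as "Nvme0n1p1", entry 1 as "Nvme0n1p2", and so on,
+ * while entries that do not carry the SPDK partition type GUID are skipped.
+ */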
+
+static void
+gpt_read_secondary_table_complete(struct spdk_bdev_io *bdev_io, bool status, void *arg)
+{
+ struct gpt_base *gpt_base = (struct gpt_base *)arg;
+ struct spdk_bdev *bdev = spdk_bdev_part_base_get_bdev(gpt_base->part_base);
+ int rc, num_partitions = 0;
+
+ spdk_bdev_free_io(bdev_io);
+ spdk_put_io_channel(gpt_base->ch);
+ gpt_base->ch = NULL;
+
+ if (status != SPDK_BDEV_IO_STATUS_SUCCESS) {
+ SPDK_ERRLOG("Gpt: bdev=%s io error status=%d\n",
+ spdk_bdev_get_name(bdev), status);
+ goto end;
+ }
+
+ rc = gpt_parse_partition_table(&gpt_base->gpt);
+ if (rc) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to parse secondary partition table\n");
+ goto end;
+ }
+
+ SPDK_WARNLOG("Gpt: bdev=%s primary partition table broken, using the secondary\n",
+ spdk_bdev_get_name(bdev));
+
+ num_partitions = vbdev_gpt_create_bdevs(gpt_base);
+ if (num_partitions < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to split dev=%s by gpt table\n",
+ spdk_bdev_get_name(bdev));
+ }
+
+end:
+ spdk_bdev_module_examine_done(&gpt_if);
+ if (num_partitions <= 0) {
+ /* If no gpt_disk instances were created, free the base context */
+ spdk_bdev_part_base_free(gpt_base->part_base);
+ }
+}
+
+static int
+vbdev_gpt_read_secondary_table(struct gpt_base *gpt_base)
+{
+ struct spdk_gpt *gpt;
+ struct spdk_bdev_desc *part_base_desc;
+ uint64_t secondary_offset;
+
+ gpt = &gpt_base->gpt;
+ gpt->parse_phase = SPDK_GPT_PARSE_PHASE_SECONDARY;
+ gpt->header = NULL;
+ gpt->partitions = NULL;
+
+ part_base_desc = spdk_bdev_part_base_get_desc(gpt_base->part_base);
+
+ secondary_offset = gpt->total_sectors * gpt->sector_size - gpt->buf_size;
+ return spdk_bdev_read(part_base_desc, gpt_base->ch, gpt_base->gpt.buf, secondary_offset,
+ gpt_base->gpt.buf_size, gpt_read_secondary_table_complete,
+ gpt_base);
+}
+
+static void
+gpt_bdev_complete(struct spdk_bdev_io *bdev_io, bool status, void *arg)
+{
+ struct gpt_base *gpt_base = (struct gpt_base *)arg;
+ struct spdk_bdev *bdev = spdk_bdev_part_base_get_bdev(gpt_base->part_base);
+ int rc, num_partitions = 0;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (status != SPDK_BDEV_IO_STATUS_SUCCESS) {
+ SPDK_ERRLOG("Gpt: bdev=%s io error status=%d\n",
+ spdk_bdev_get_name(bdev), status);
+ goto end;
+ }
+
+ rc = gpt_parse_mbr(&gpt_base->gpt);
+ if (rc) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to parse mbr\n");
+ goto end;
+ }
+
+ rc = gpt_parse_partition_table(&gpt_base->gpt);
+ if (rc) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to parse primary partition table\n");
+ rc = vbdev_gpt_read_secondary_table(gpt_base);
+ if (rc) {
+ SPDK_ERRLOG("Failed to read secondary table\n");
+ goto end;
+ }
+ return;
+ }
+
+ num_partitions = vbdev_gpt_create_bdevs(gpt_base);
+ if (num_partitions < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to split dev=%s by gpt table\n",
+ spdk_bdev_get_name(bdev));
+ }
+
+end:
+ spdk_put_io_channel(gpt_base->ch);
+ gpt_base->ch = NULL;
+ /*
+ * Notify the generic bdev layer that the actions related to the original examine
+ * callback are now completed.
+ */
+ spdk_bdev_module_examine_done(&gpt_if);
+
+ /*
+ * vbdev_gpt_create_bdevs returns the number of bdevs created upon success,
+ * so zero or a negative value here means no partition claimed the base
+ * context and it must be freed.
+ */
+ if (num_partitions <= 0) {
+ /* If no gpt_disk instances were created, free the base context */
+ spdk_bdev_part_base_free(gpt_base->part_base);
+ }
+}
+
+static int
+vbdev_gpt_read_gpt(struct spdk_bdev *bdev)
+{
+ struct gpt_base *gpt_base;
+ struct spdk_bdev_desc *part_base_desc;
+ int rc;
+
+ gpt_base = gpt_base_bdev_init(bdev);
+ if (!gpt_base) {
+ SPDK_ERRLOG("Cannot allocate gpt_base\n");
+ return -1;
+ }
+
+ part_base_desc = spdk_bdev_part_base_get_desc(gpt_base->part_base);
+ gpt_base->ch = spdk_bdev_get_io_channel(part_base_desc);
+ if (gpt_base->ch == NULL) {
+ SPDK_ERRLOG("Failed to get an io_channel.\n");
+ spdk_bdev_part_base_free(gpt_base->part_base);
+ return -1;
+ }
+
+ rc = spdk_bdev_read(part_base_desc, gpt_base->ch, gpt_base->gpt.buf, 0,
+ gpt_base->gpt.buf_size, gpt_bdev_complete, gpt_base);
+ if (rc < 0) {
+ spdk_put_io_channel(gpt_base->ch);
+ spdk_bdev_part_base_free(gpt_base->part_base);
+ SPDK_ERRLOG("Failed to send bdev_io command\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vbdev_gpt_init(void)
+{
+ struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Gpt");
+
+ if (sp && spdk_conf_section_get_boolval(sp, "Disable", false)) {
+ /* Disable Gpt probe */
+ g_gpt_disabled = true;
+ }
+
+ return 0;
+}
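+
+/*
+ * GPT probing can be disabled through the legacy INI-style configuration
+ * file read above, e.g. (boolean spelling as accepted by spdk_conf):
+ *
+ *	[Gpt]
+ *	  Disable Yes
+ */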
+
+static int
+vbdev_gpt_get_ctx_size(void)
+{
+ return sizeof(struct gpt_io);
+}
+
+static void
+vbdev_gpt_examine(struct spdk_bdev *bdev)
+{
+ int rc;
+
+ /* A bdev with fewer than 2 blocks cannot have a GPT. Block 0 has
+ * the MBR and block 1 has the GPT header.
+ */
+ if (g_gpt_disabled || spdk_bdev_get_num_blocks(bdev) < 2) {
+ spdk_bdev_module_examine_done(&gpt_if);
+ return;
+ }
+
+ if (spdk_bdev_get_block_size(bdev) % 512 != 0) {
+ SPDK_ERRLOG("GPT module does not support block size %" PRIu32 " for bdev %s\n",
+ spdk_bdev_get_block_size(bdev), spdk_bdev_get_name(bdev));
+ spdk_bdev_module_examine_done(&gpt_if);
+ return;
+ }
+
+ rc = vbdev_gpt_read_gpt(bdev);
+ if (rc) {
+ spdk_bdev_module_examine_done(&gpt_if);
+ SPDK_ERRLOG("Failed to read info from bdev %s\n", spdk_bdev_get_name(bdev));
+ }
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_gpt", SPDK_LOG_VBDEV_GPT)
diff --git a/src/spdk/module/bdev/iscsi/Makefile b/src/spdk/module/bdev/iscsi/Makefile
new file mode 100644
index 000000000..38ba8b709
--- /dev/null
+++ b/src/spdk/module/bdev/iscsi/Makefile
@@ -0,0 +1,51 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/
+# CentOS 7 libiscsi package has functions declared inline but not
+# defined in the header file. Not aware of any way to disable
+# this warning so just make sure the warning isn't treated as
+# an error.
+CFLAGS += -Wno-error
+C_SRCS = bdev_iscsi.c bdev_iscsi_rpc.c
+LIBNAME = bdev_iscsi
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/iscsi/bdev_iscsi.c b/src/spdk/module/bdev/iscsi/bdev_iscsi.c
new file mode 100644
index 000000000..18e8e0090
--- /dev/null
+++ b/src/spdk/module/bdev/iscsi/bdev_iscsi.c
@@ -0,0 +1,936 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+#include "spdk/env.h"
+#include "spdk/fd.h"
+#include "spdk/thread.h"
+#include "spdk/json.h"
+#include "spdk/util.h"
+#include "spdk/rpc.h"
+#include "spdk/string.h"
+#include "spdk/iscsi_spec.h"
+
+#include "spdk_internal/log.h"
+#include "spdk/bdev_module.h"
+
+#include "iscsi/iscsi.h"
+#include "iscsi/scsi-lowlevel.h"
+
+#include "bdev_iscsi.h"
+
+struct bdev_iscsi_lun;
+
+#define BDEV_ISCSI_CONNECTION_POLL_US 500 /* 0.5 ms */
+#define BDEV_ISCSI_NO_MASTER_CH_POLL_US 10000 /* 10ms */
+
+#define DEFAULT_INITIATOR_NAME "iqn.2016-06.io.spdk:init"
+
+static int bdev_iscsi_initialize(void);
+static TAILQ_HEAD(, bdev_iscsi_conn_req) g_iscsi_conn_req = TAILQ_HEAD_INITIALIZER(
+ g_iscsi_conn_req);
+static struct spdk_poller *g_conn_poller = NULL;
+
+struct bdev_iscsi_io {
+ struct spdk_thread *submit_td;
+ enum spdk_bdev_io_status status;
+ int scsi_status;
+ enum spdk_scsi_sense sk;
+ uint8_t asc;
+ uint8_t ascq;
+};
+
+struct bdev_iscsi_lun {
+ struct spdk_bdev bdev;
+ struct iscsi_context *context;
+ char *initiator_iqn;
+ int lun_id;
+ char *url;
+ pthread_mutex_t mutex;
+ uint32_t ch_count;
+ struct spdk_thread *master_td;
+ struct spdk_poller *no_master_ch_poller;
+ struct spdk_thread *no_master_ch_poller_td;
+ bool unmap_supported;
+ struct spdk_poller *poller;
+};
+
+struct bdev_iscsi_io_channel {
+ struct bdev_iscsi_lun *lun;
+};
+
+struct bdev_iscsi_conn_req {
+ char *url;
+ char *bdev_name;
+ char *initiator_iqn;
+ struct iscsi_context *context;
+ spdk_bdev_iscsi_create_cb create_cb;
+ void *create_cb_arg;
+ bool unmap_supported;
+ int lun;
+ int status;
+ TAILQ_ENTRY(bdev_iscsi_conn_req) link;
+};
+
+static void
+complete_conn_req(struct bdev_iscsi_conn_req *req, struct spdk_bdev *bdev,
+ int status)
+{
+ TAILQ_REMOVE(&g_iscsi_conn_req, req, link);
+ req->create_cb(req->create_cb_arg, bdev, status);
+
+ /*
+ * we are still running in the context of iscsi_service()
+ * so do not tear down its data structures here
+ */
+ req->status = status;
+}
+
+static int
+bdev_iscsi_get_ctx_size(void)
+{
+ return sizeof(struct bdev_iscsi_io);
+}
+
+static void
+_iscsi_free_lun(void *arg)
+{
+ struct bdev_iscsi_lun *lun = arg;
+
+ assert(lun != NULL);
+ iscsi_destroy_context(lun->context);
+ pthread_mutex_destroy(&lun->mutex);
+ free(lun->bdev.name);
+ free(lun->url);
+ free(lun->initiator_iqn);
+
+ spdk_bdev_destruct_done(&lun->bdev, 0);
+ free(lun);
+}
+
+static void
+_bdev_iscsi_conn_req_free(struct bdev_iscsi_conn_req *req)
+{
+ free(req->initiator_iqn);
+ free(req->bdev_name);
+ free(req->url);
+ /* destroy will call iscsi_disconnect() implicitly if connected */
+ iscsi_destroy_context(req->context);
+ free(req);
+}
+
+static void
+bdev_iscsi_finish(void)
+{
+ struct bdev_iscsi_conn_req *req, *tmp;
+
+ /* Clear out pending connection requests here. We cannot
+ * simply set the state to a non-SCSI_STATUS_GOOD state, as
+ * the connection poller won't run anymore.
+ */
+ TAILQ_FOREACH_SAFE(req, &g_iscsi_conn_req, link, tmp) {
+ _bdev_iscsi_conn_req_free(req);
+ }
+
+ if (g_conn_poller) {
+ spdk_poller_unregister(&g_conn_poller);
+ }
+}
+
+static struct spdk_bdev_module g_iscsi_bdev_module = {
+ .name = "iscsi",
+ .module_init = bdev_iscsi_initialize,
+ .module_fini = bdev_iscsi_finish,
+ .get_ctx_size = bdev_iscsi_get_ctx_size,
+ .async_init = true,
+};
+
+SPDK_BDEV_MODULE_REGISTER(iscsi, &g_iscsi_bdev_module);
+
+static void
+_bdev_iscsi_io_complete(void *_iscsi_io)
+{
+ struct bdev_iscsi_io *iscsi_io = _iscsi_io;
+
+ if (iscsi_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) {
+ spdk_bdev_io_complete_scsi_status(spdk_bdev_io_from_ctx(iscsi_io), iscsi_io->scsi_status,
+ iscsi_io->sk, iscsi_io->asc, iscsi_io->ascq);
+ } else {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(iscsi_io), iscsi_io->status);
+ }
+}
+
+static void
+bdev_iscsi_io_complete(struct bdev_iscsi_io *iscsi_io, enum spdk_bdev_io_status status)
+{
+ iscsi_io->status = status;
+ if (iscsi_io->submit_td != NULL) {
+ spdk_thread_send_msg(iscsi_io->submit_td, _bdev_iscsi_io_complete, iscsi_io);
+ } else {
+ _bdev_iscsi_io_complete(iscsi_io);
+ }
+}
+
+/* Common call back function for read/write/flush command */
+static void
+bdev_iscsi_command_cb(struct iscsi_context *context, int status, void *_task, void *_iscsi_io)
+{
+ struct scsi_task *task = _task;
+ struct bdev_iscsi_io *iscsi_io = _iscsi_io;
+
+ iscsi_io->scsi_status = status;
+ iscsi_io->sk = (uint8_t)task->sense.key;
+ iscsi_io->asc = (task->sense.ascq >> 8) & 0xFF;
+ iscsi_io->ascq = task->sense.ascq & 0xFF;
+
+ scsi_free_scsi_task(task);
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+}
+
+static void
+bdev_iscsi_readv(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io,
+ struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t lba)
+{
+ struct scsi_task *task;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI_INIT, "read %d iovs size %lu from lba: %#lx\n",
+ iovcnt, nbytes, lba);
+
+ task = iscsi_read16_task(lun->context, lun->lun_id, lba, nbytes, lun->bdev.blocklen, 0, 0, 0, 0, 0,
+ bdev_iscsi_command_cb, iscsi_io);
+ if (task == NULL) {
+ SPDK_ERRLOG("failed to get read16_task\n");
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+ scsi_task_set_iov_in(task, (struct scsi_iovec *)iov, iovcnt);
+#else
+ int i;
+ for (i = 0; i < iovcnt; i++) {
+ scsi_task_add_data_in_buffer(task, iov[i].iov_len, iov[i].iov_base);
+ }
+#endif
+}
+
+static void
+bdev_iscsi_writev(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io,
+ struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t lba)
+{
+ struct scsi_task *task;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI_INIT, "write %d iovs size %lu to lba: %#lx\n",
+ iovcnt, nbytes, lba);
+
+ task = iscsi_write16_task(lun->context, lun->lun_id, lba, NULL, nbytes, lun->bdev.blocklen, 0, 0, 0,
+ 0, 0,
+ bdev_iscsi_command_cb, iscsi_io);
+ if (task == NULL) {
+ SPDK_ERRLOG("failed to get write16_task\n");
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+#if defined(LIBISCSI_FEATURE_IOVECTOR)
+ scsi_task_set_iov_out(task, (struct scsi_iovec *)iov, iovcnt);
+#else
+ int i;
+ for (i = 0; i < iovcnt; i++) {
+ scsi_task_add_data_out_buffer(task, iov[i].iov_len, iov[i].iov_base);
+ }
+#endif
+}
+
+static void
+bdev_iscsi_destruct_cb(void *ctx)
+{
+ struct bdev_iscsi_lun *lun = ctx;
+
+ spdk_poller_unregister(&lun->no_master_ch_poller);
+ spdk_io_device_unregister(lun, _iscsi_free_lun);
+}
+
+static int
+bdev_iscsi_destruct(void *ctx)
+{
+ struct bdev_iscsi_lun *lun = ctx;
+
+ assert(lun->no_master_ch_poller_td);
+ spdk_thread_send_msg(lun->no_master_ch_poller_td, bdev_iscsi_destruct_cb, lun);
+ return 1;
+}
+
+static void
+bdev_iscsi_flush(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io, uint32_t num_blocks,
+ int immed, uint64_t lba)
+{
+ struct scsi_task *task;
+
+ task = iscsi_synchronizecache16_task(lun->context, lun->lun_id, lba,
+ num_blocks, 0, immed, bdev_iscsi_command_cb, iscsi_io);
+ if (task == NULL) {
+ SPDK_ERRLOG("failed to get sync16_task\n");
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+}
+
+static void
+bdev_iscsi_unmap(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io,
+ uint64_t lba, uint64_t num_blocks)
+{
+ struct scsi_task *task;
+ struct unmap_list list[1];
+
+ list[0].lba = lba;
+ list[0].num = num_blocks;
+ task = iscsi_unmap_task(lun->context, 0, 0, 0, list, 1,
+ bdev_iscsi_command_cb, iscsi_io);
+ if (task == NULL) {
+ SPDK_ERRLOG("failed to get unmap_task\n");
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+}
+
+static void
+bdev_iscsi_reset_cb(struct iscsi_context *context __attribute__((unused)), int status,
+ void *command_data, void *private_data)
+{
+ uint32_t tmf_response;
+ struct bdev_iscsi_io *iscsi_io = private_data;
+
+ tmf_response = *(uint32_t *)command_data;
+ if (tmf_response == ISCSI_TASK_FUNC_RESP_COMPLETE) {
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+_bdev_iscsi_reset(void *_bdev_io)
+{
+ int rc;
+ struct spdk_bdev_io *bdev_io = _bdev_io;
+ struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt;
+ struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx;
+ struct iscsi_context *context = lun->context;
+
+ rc = iscsi_task_mgmt_lun_reset_async(context, lun->lun_id,
+ bdev_iscsi_reset_cb, iscsi_io);
+ if (rc != 0) {
+ SPDK_ERRLOG("failed to do iscsi reset\n");
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+}
+
+static void
+bdev_iscsi_reset(struct spdk_bdev_io *bdev_io)
+{
+ struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt;
+ spdk_thread_send_msg(lun->master_td, _bdev_iscsi_reset, bdev_io);
+}
+
+static int
+bdev_iscsi_poll_lun(void *_lun)
+{
+ struct bdev_iscsi_lun *lun = _lun;
+ struct pollfd pfd = {};
+
+ pfd.fd = iscsi_get_fd(lun->context);
+ pfd.events = iscsi_which_events(lun->context);
+
+ if (poll(&pfd, 1, 0) < 0) {
+ SPDK_ERRLOG("poll failed\n");
+ return SPDK_POLLER_IDLE;
+ }
+
+ if (pfd.revents != 0) {
+ if (iscsi_service(lun->context, pfd.revents) < 0) {
+ SPDK_ERRLOG("iscsi_service failed: %s\n", iscsi_get_error(lun->context));
+ }
+
+ return SPDK_POLLER_BUSY;
+ }
+
+ return SPDK_POLLER_IDLE;
+}
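+
+/*
+ * The poller above follows the canonical libiscsi event loop: ask the
+ * library which fd and events it cares about (iscsi_get_fd() /
+ * iscsi_which_events()), poll with a zero timeout so the SPDK reactor is
+ * never blocked, and feed the returned events back through iscsi_service(),
+ * which runs any completion callbacks that became ready.
+ */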
+
+static int
+bdev_iscsi_no_master_ch_poll(void *arg)
+{
+ struct bdev_iscsi_lun *lun = arg;
+ enum spdk_thread_poller_rc rc = SPDK_POLLER_IDLE;
+
+ if (pthread_mutex_trylock(&lun->mutex)) {
+ /* Don't care about the error code here. */
+ return SPDK_POLLER_IDLE;
+ }
+
+ if (lun->ch_count == 0) {
+ rc = bdev_iscsi_poll_lun(arg);
+ }
+
+ pthread_mutex_unlock(&lun->mutex);
+ return rc;
+}
+
+static void
+bdev_iscsi_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ bdev_iscsi_readv((struct bdev_iscsi_lun *)bdev_io->bdev->ctxt,
+ (struct bdev_iscsi_io *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.offset_blocks);
+}
+
+static void
+_bdev_iscsi_submit_request(void *_bdev_io)
+{
+ struct spdk_bdev_io *bdev_io = _bdev_io;
+ struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx;
+ struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, bdev_iscsi_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ bdev_iscsi_writev(lun, iscsi_io,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.offset_blocks);
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ bdev_iscsi_flush(lun, iscsi_io,
+ bdev_io->u.bdev.num_blocks,
+ ISCSI_IMMEDIATE_DATA_NO,
+ bdev_io->u.bdev.offset_blocks);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ bdev_iscsi_reset(bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ bdev_iscsi_unmap(lun, iscsi_io,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks);
+ break;
+ default:
+ bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED);
+ break;
+ }
+}
+
+static void
+bdev_iscsi_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_thread *submit_td = spdk_io_channel_get_thread(_ch);
+ struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx;
+ struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt;
+
+ if (lun->master_td != submit_td) {
+ iscsi_io->submit_td = submit_td;
+ spdk_thread_send_msg(lun->master_td, _bdev_iscsi_submit_request, bdev_io);
+ return;
+ } else {
+ iscsi_io->submit_td = NULL;
+ }
+
+ _bdev_iscsi_submit_request(bdev_io);
+}
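+
+/* Design note: a libiscsi context may only be driven from one thread, so every
+ * request is funneled to the thread that created the first channel (the
+ * "master" thread). submit_td records where the I/O came from, presumably so
+ * that the completion can be bounced back to the submitting thread.
+ */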
+
+static bool
+bdev_iscsi_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct bdev_iscsi_lun *lun = ctx;
+
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ return true;
+
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ return lun->unmap_supported;
+ default:
+ return false;
+ }
+}
+
+static int
+bdev_iscsi_create_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_iscsi_io_channel *ch = ctx_buf;
+ struct bdev_iscsi_lun *lun = io_device;
+
+ pthread_mutex_lock(&lun->mutex);
+ if (lun->ch_count == 0) {
+ assert(lun->master_td == NULL);
+ lun->master_td = spdk_get_thread();
+ lun->poller = SPDK_POLLER_REGISTER(bdev_iscsi_poll_lun, lun, 0);
+ ch->lun = lun;
+ }
+ lun->ch_count++;
+ pthread_mutex_unlock(&lun->mutex);
+
+ return 0;
+}
+
+static void
+_iscsi_destroy_cb(void *ctx)
+{
+ struct bdev_iscsi_lun *lun = ctx;
+
+ pthread_mutex_lock(&lun->mutex);
+
+ assert(lun->master_td == spdk_get_thread());
+ assert(lun->ch_count > 0);
+
+ lun->ch_count--;
+ if (lun->ch_count > 0) {
+ pthread_mutex_unlock(&lun->mutex);
+ return;
+ }
+
+ lun->master_td = NULL;
+ spdk_poller_unregister(&lun->poller);
+
+ pthread_mutex_unlock(&lun->mutex);
+}
+
+static void
+bdev_iscsi_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_iscsi_lun *lun = io_device;
+ struct spdk_thread *thread;
+
+ pthread_mutex_lock(&lun->mutex);
+ lun->ch_count--;
+ if (lun->ch_count == 0) {
+ assert(lun->master_td != NULL);
+
+ if (lun->master_td != spdk_get_thread()) {
+ /* The final channel was destroyed on a different thread
+ * than where the first channel was created. Pass a message
+ * to the master thread to unregister the poller. */
+ lun->ch_count++;
+ thread = lun->master_td;
+ pthread_mutex_unlock(&lun->mutex);
+ spdk_thread_send_msg(thread, _iscsi_destroy_cb, lun);
+ return;
+ }
+
+ lun->master_td = NULL;
+ spdk_poller_unregister(&lun->poller);
+ }
+ pthread_mutex_unlock(&lun->mutex);
+}
+
+static struct spdk_io_channel *
+bdev_iscsi_get_io_channel(void *ctx)
+{
+ struct bdev_iscsi_lun *lun = ctx;
+
+ return spdk_get_io_channel(lun);
+}
+
+static int
+bdev_iscsi_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct bdev_iscsi_lun *lun = ctx;
+
+ spdk_json_write_named_object_begin(w, "iscsi");
+ spdk_json_write_named_string(w, "initiator_name", lun->initiator_iqn);
+ spdk_json_write_named_string(w, "url", lun->url);
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static void
+bdev_iscsi_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct bdev_iscsi_lun *lun = bdev->ctxt;
+
+ pthread_mutex_lock(&lun->mutex);
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_iscsi_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ spdk_json_write_named_string(w, "initiator_iqn", lun->initiator_iqn);
+ spdk_json_write_named_string(w, "url", lun->url);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ pthread_mutex_unlock(&lun->mutex);
+}
+
+static const struct spdk_bdev_fn_table iscsi_fn_table = {
+ .destruct = bdev_iscsi_destruct,
+ .submit_request = bdev_iscsi_submit_request,
+ .io_type_supported = bdev_iscsi_io_type_supported,
+ .get_io_channel = bdev_iscsi_get_io_channel,
+ .dump_info_json = bdev_iscsi_dump_info_json,
+ .write_config_json = bdev_iscsi_write_config_json,
+};
+
+static int
+create_iscsi_lun(struct iscsi_context *context, int lun_id, char *url, char *initiator_iqn,
+ char *name,
+ uint64_t num_blocks, uint32_t block_size, struct spdk_bdev **bdev, bool unmap_supported)
+{
+ struct bdev_iscsi_lun *lun;
+ int rc;
+
+	lun = calloc(1, sizeof(*lun));
+ if (!lun) {
+ SPDK_ERRLOG("Unable to allocate enough memory for iscsi backend\n");
+ return -ENOMEM;
+ }
+
+ lun->context = context;
+ lun->lun_id = lun_id;
+ lun->url = url;
+ lun->initiator_iqn = initiator_iqn;
+
+ pthread_mutex_init(&lun->mutex, NULL);
+
+ lun->bdev.name = name;
+ lun->bdev.product_name = "iSCSI LUN";
+ lun->bdev.module = &g_iscsi_bdev_module;
+ lun->bdev.blocklen = block_size;
+ lun->bdev.blockcnt = num_blocks;
+ lun->bdev.ctxt = lun;
+ lun->unmap_supported = unmap_supported;
+
+ lun->bdev.fn_table = &iscsi_fn_table;
+
+ spdk_io_device_register(lun, bdev_iscsi_create_cb, bdev_iscsi_destroy_cb,
+ sizeof(struct bdev_iscsi_io_channel),
+ name);
+ rc = spdk_bdev_register(&lun->bdev);
+ if (rc) {
+ spdk_io_device_unregister(lun, NULL);
+ pthread_mutex_destroy(&lun->mutex);
+ free(lun);
+ return rc;
+ }
+
+ lun->no_master_ch_poller_td = spdk_get_thread();
+ lun->no_master_ch_poller = SPDK_POLLER_REGISTER(bdev_iscsi_no_master_ch_poll, lun,
+ BDEV_ISCSI_NO_MASTER_CH_POLL_US);
+
+ *bdev = &lun->bdev;
+ return 0;
+}
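+
+/* The no_master_ch poller services the iSCSI context while no I/O channels
+ * exist (ch_count == 0), e.g. so that target NOP-IN pings are still answered
+ * and the session stays alive; once a channel is created, the per-channel
+ * poller registered in bdev_iscsi_create_cb takes over.
+ */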
+
+static void
+iscsi_readcapacity16_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *private_data)
+{
+ struct bdev_iscsi_conn_req *req = private_data;
+ struct scsi_readcapacity16 *readcap16;
+ struct spdk_bdev *bdev = NULL;
+ struct scsi_task *task = command_data;
+
+ if (status != SPDK_SCSI_STATUS_GOOD) {
+ SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(iscsi));
+ goto ret;
+ }
+
+ readcap16 = scsi_datain_unmarshall(task);
+ if (!readcap16) {
+ status = -ENOMEM;
+ goto ret;
+ }
+
+ status = create_iscsi_lun(req->context, req->lun, req->url, req->initiator_iqn, req->bdev_name,
+ readcap16->returned_lba + 1, readcap16->block_length, &bdev, req->unmap_supported);
+ if (status) {
+ SPDK_ERRLOG("Unable to create iscsi bdev: %s (%d)\n", spdk_strerror(-status), status);
+ }
+
+ret:
+ scsi_free_scsi_task(task);
+ complete_conn_req(req, bdev, status);
+}
+
+static void
+bdev_iscsi_inquiry_cb(struct iscsi_context *context, int status, void *_task, void *private_data)
+{
+ struct scsi_task *task = _task;
+ struct scsi_inquiry_logical_block_provisioning *lbp_inq = NULL;
+ struct bdev_iscsi_conn_req *req = private_data;
+
+ if (status == SPDK_SCSI_STATUS_GOOD) {
+ lbp_inq = scsi_datain_unmarshall(task);
+ if (lbp_inq != NULL && lbp_inq->lbpu) {
+ req->unmap_supported = true;
+ }
+ }
+
+ task = iscsi_readcapacity16_task(context, req->lun, iscsi_readcapacity16_cb, req);
+ if (task) {
+ return;
+ }
+
+ SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(req->context));
+ complete_conn_req(req, NULL, status);
+}
+
+static void
+iscsi_connect_cb(struct iscsi_context *iscsi, int status,
+ void *command_data, void *private_data)
+{
+ struct bdev_iscsi_conn_req *req = private_data;
+ struct scsi_task *task;
+
+ if (status != SPDK_SCSI_STATUS_GOOD) {
+ goto ret;
+ }
+
+ task = iscsi_inquiry_task(iscsi, req->lun, 1,
+ SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING,
+ 255, bdev_iscsi_inquiry_cb, req);
+ if (task) {
+ return;
+ }
+
+ret:
+ SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(req->context));
+ complete_conn_req(req, NULL, status);
+}
+
+static int
+iscsi_bdev_conn_poll(void *arg)
+{
+ struct bdev_iscsi_conn_req *req, *tmp;
+ struct pollfd pfd;
+ struct iscsi_context *context;
+
+ if (TAILQ_EMPTY(&g_iscsi_conn_req)) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ TAILQ_FOREACH_SAFE(req, &g_iscsi_conn_req, link, tmp) {
+ context = req->context;
+ pfd.fd = iscsi_get_fd(context);
+ pfd.events = iscsi_which_events(context);
+ pfd.revents = 0;
+ if (poll(&pfd, 1, 0) < 0) {
+ SPDK_ERRLOG("poll failed\n");
+ return SPDK_POLLER_BUSY;
+ }
+
+ if (pfd.revents != 0) {
+ if (iscsi_service(context, pfd.revents) < 0) {
+ SPDK_ERRLOG("iscsi_service failed: %s\n", iscsi_get_error(context));
+ }
+ }
+
+ if (req->status == 0) {
+ /*
+ * The request completed successfully.
+ */
+ free(req);
+ } else if (req->status > 0) {
+ /*
+ * An error has occurred during connecting. This req has already
+ * been removed from the g_iscsi_conn_req list, but we needed to
+ * wait until iscsi_service unwound before we could free the req.
+ */
+ _bdev_iscsi_conn_req_free(req);
+ }
+ }
+ return SPDK_POLLER_BUSY;
+}
+
+int
+create_iscsi_disk(const char *bdev_name, const char *url, const char *initiator_iqn,
+ spdk_bdev_iscsi_create_cb cb_fn, void *cb_arg)
+{
+ struct bdev_iscsi_conn_req *req;
+ struct iscsi_url *iscsi_url = NULL;
+ int rc;
+
+ if (!bdev_name || !url || !initiator_iqn || strlen(initiator_iqn) == 0 || !cb_fn) {
+ return -EINVAL;
+ }
+
+ req = calloc(1, sizeof(struct bdev_iscsi_conn_req));
+ if (!req) {
+		SPDK_ERRLOG("Cannot allocate struct bdev_iscsi_conn_req\n");
+ return -ENOMEM;
+ }
+
+ req->status = SCSI_STATUS_GOOD;
+ req->bdev_name = strdup(bdev_name);
+ req->url = strdup(url);
+ req->initiator_iqn = strdup(initiator_iqn);
+ req->context = iscsi_create_context(initiator_iqn);
+ if (!req->bdev_name || !req->url || !req->initiator_iqn || !req->context) {
+ SPDK_ERRLOG("Out of memory\n");
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ req->create_cb = cb_fn;
+ req->create_cb_arg = cb_arg;
+
+ iscsi_url = iscsi_parse_full_url(req->context, url);
+ if (iscsi_url == NULL) {
+ SPDK_ERRLOG("could not parse URL: %s\n", iscsi_get_error(req->context));
+ rc = -EINVAL;
+ goto err;
+ }
+
+ req->lun = iscsi_url->lun;
+ rc = iscsi_set_session_type(req->context, ISCSI_SESSION_NORMAL);
+ rc = rc ? rc : iscsi_set_header_digest(req->context, ISCSI_HEADER_DIGEST_NONE);
+ rc = rc ? rc : iscsi_set_targetname(req->context, iscsi_url->target);
+ rc = rc ? rc : iscsi_full_connect_async(req->context, iscsi_url->portal, iscsi_url->lun,
+ iscsi_connect_cb, req);
+ if (rc == 0 && iscsi_url->user[0] != '\0') {
+ rc = iscsi_set_initiator_username_pwd(req->context, iscsi_url->user, iscsi_url->passwd);
+ }
+
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to connect provided URL=%s: %s\n", url, iscsi_get_error(req->context));
+ goto err;
+ }
+
+ iscsi_destroy_url(iscsi_url);
+ req->status = -1;
+ TAILQ_INSERT_TAIL(&g_iscsi_conn_req, req, link);
+ if (!g_conn_poller) {
+ g_conn_poller = SPDK_POLLER_REGISTER(iscsi_bdev_conn_poll, NULL, BDEV_ISCSI_CONNECTION_POLL_US);
+ }
+
+ return 0;
+
+err:
+ /* iscsi_destroy_url() is not NULL-proof */
+ if (iscsi_url) {
+ iscsi_destroy_url(iscsi_url);
+ }
+
+ if (req->context) {
+ iscsi_destroy_context(req->context);
+ }
+
+ free(req->initiator_iqn);
+ free(req->bdev_name);
+ free(req->url);
+ free(req);
+ return rc;
+}
+
+void
+delete_iscsi_disk(struct spdk_bdev *bdev, spdk_delete_iscsi_complete cb_fn, void *cb_arg)
+{
+ if (!bdev || bdev->module != &g_iscsi_bdev_module) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
+static void
+bdev_iscsi_initialize_cb(void *cb_arg, struct spdk_bdev *bdev, int status)
+{
+ if (TAILQ_EMPTY(&g_iscsi_conn_req)) {
+ spdk_bdev_module_init_done(&g_iscsi_bdev_module);
+ }
+}
+
+static int
+bdev_iscsi_initialize(void)
+{
+ struct spdk_conf_section *sp;
+
+ const char *url, *bdev_name, *initiator_iqn;
+ int i, rc;
+
+ sp = spdk_conf_find_section(NULL, "iSCSI_Initiator");
+ if (sp == NULL) {
+ spdk_bdev_module_init_done(&g_iscsi_bdev_module);
+ return 0;
+ }
+
+ initiator_iqn = spdk_conf_section_get_val(sp, "initiator_name");
+ if (!initiator_iqn) {
+ initiator_iqn = DEFAULT_INITIATOR_NAME;
+ }
+
+ rc = 0;
+ for (i = 0; (url = spdk_conf_section_get_nmval(sp, "URL", i, 0)) != NULL; i++) {
+ bdev_name = spdk_conf_section_get_nmval(sp, "URL", i, 1);
+ if (bdev_name == NULL) {
+ SPDK_ERRLOG("no bdev name specified for URL %s\n", url);
+ rc = -EINVAL;
+ break;
+ }
+
+ rc = create_iscsi_disk(bdev_name, url, initiator_iqn, bdev_iscsi_initialize_cb, NULL);
+ if (rc) {
+ break;
+ }
+ }
+
+ if (i == 0) {
+ spdk_bdev_module_init_done(&g_iscsi_bdev_module);
+ }
+
+ return rc;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("iscsi_init", SPDK_LOG_ISCSI_INIT)
diff --git a/src/spdk/module/bdev/iscsi/bdev_iscsi.h b/src/spdk/module/bdev/iscsi/bdev_iscsi.h
new file mode 100644
index 000000000..6a343123b
--- /dev/null
+++ b/src/spdk/module/bdev/iscsi/bdev_iscsi.h
@@ -0,0 +1,75 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_ISCSI_H
+#define SPDK_BDEV_ISCSI_H
+
+#include "spdk/bdev.h"
+
+typedef void (*spdk_delete_iscsi_complete)(void *cb_arg, int bdeverrno);
+
+/**
+ * SPDK bdev iSCSI callback type.
+ *
+ * \param cb_arg Completion callback custom arguments
+ * \param bdev created bdev
+ * \param status operation status. Zero on success.
+ */
+typedef void (*spdk_bdev_iscsi_create_cb)(void *cb_arg, struct spdk_bdev *bdev, int status);
+
+/**
+ * Create new iSCSI bdev.
+ *
+ * \warning iSCSI URLs allow embedding a login and password. Be careful: they
+ * will show up in the configuration dump.
+ *
+ * \param bdev_name name for the new bdev.
+ * \param url iSCSI URL string.
+ * \param initiator_iqn IQN that the initiator identifies itself to the target as.
+ * \param cb_fn Completion callback
+ * \param cb_arg Completion callback custom arguments
+ * \return 0 on success or negative error code. On success, creation of a bdev
+ * with the provided name has been initiated; completion is reported via cb_fn.
+ */
+int create_iscsi_disk(const char *bdev_name, const char *url, const char *initiator_iqn,
+ spdk_bdev_iscsi_create_cb cb_fn, void *cb_arg);
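+
+/*
+ * A minimal usage sketch (hypothetical names and URL; creation completes
+ * asynchronously in the callback):
+ *
+ *     static void
+ *     my_create_done(void *cb_arg, struct spdk_bdev *bdev, int status)
+ *     {
+ *             if (status == 0) {
+ *                     printf("created %s\n", spdk_bdev_get_name(bdev));
+ *             }
+ *     }
+ *
+ *     create_iscsi_disk("iscsi0", "iscsi://127.0.0.1/iqn.2016-06.io.spdk:disk1/0",
+ *                       "iqn.2016-06.io.spdk:init", my_create_done, NULL);
+ */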
+
+/**
+ * Delete iSCSI bdev.
+ *
+ * \param bdev Pointer to iSCSI bdev.
+ * \param cb_fn Completion callback
+ * \param cb_arg Completion callback custom arguments
+ */
+void delete_iscsi_disk(struct spdk_bdev *bdev, spdk_delete_iscsi_complete cb_fn, void *cb_arg);
+
+#endif /* SPDK_BDEV_ISCSI_H */
diff --git a/src/spdk/module/bdev/iscsi/bdev_iscsi_rpc.c b/src/spdk/module/bdev/iscsi/bdev_iscsi_rpc.c
new file mode 100644
index 000000000..5c3bdf551
--- /dev/null
+++ b/src/spdk/module/bdev/iscsi/bdev_iscsi_rpc.c
@@ -0,0 +1,158 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_iscsi.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+struct rpc_bdev_iscsi_create {
+ char *name;
+ char *initiator_iqn;
+ char *url;
+};
+
+static const struct spdk_json_object_decoder rpc_bdev_iscsi_create_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_iscsi_create, name), spdk_json_decode_string},
+ {"initiator_iqn", offsetof(struct rpc_bdev_iscsi_create, initiator_iqn), spdk_json_decode_string},
+ {"url", offsetof(struct rpc_bdev_iscsi_create, url), spdk_json_decode_string},
+};
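+
+/* Example request params this decoder accepts (hypothetical values):
+ *
+ *     {
+ *       "name": "iscsi0",
+ *       "initiator_iqn": "iqn.2016-06.io.spdk:init",
+ *       "url": "iscsi://127.0.0.1/iqn.2016-06.io.spdk:disk1/0"
+ *     }
+ */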
+
+static void
+free_rpc_bdev_iscsi_create(struct rpc_bdev_iscsi_create *req)
+{
+ free(req->name);
+ free(req->initiator_iqn);
+ free(req->url);
+}
+
+static void
+bdev_iscsi_create_cb(void *cb_arg, struct spdk_bdev *bdev, int status)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ if (status > 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "iSCSI error (%d).", status);
+ } else if (status < 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-status));
+ } else {
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, spdk_bdev_get_name(bdev));
+ spdk_jsonrpc_end_result(request, w);
+ }
+}
+
+static void
+rpc_bdev_iscsi_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_iscsi_create req = {};
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_bdev_iscsi_create_decoders,
+ SPDK_COUNTOF(rpc_bdev_iscsi_create_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = create_iscsi_disk(req.name, req.url, req.initiator_iqn, bdev_iscsi_create_cb, request);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ }
+
+cleanup:
+ free_rpc_bdev_iscsi_create(&req);
+}
+SPDK_RPC_REGISTER("bdev_iscsi_create", rpc_bdev_iscsi_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_iscsi_create, construct_iscsi_bdev)
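+
+/* Invocation sketch via the companion rpc.py client (argument names are
+ * assumed from the stock scripts/rpc.py, values hypothetical):
+ *
+ *     ./scripts/rpc.py bdev_iscsi_create -b iscsi0 \
+ *             -i iqn.2016-06.io.spdk:init \
+ *             --url iscsi://127.0.0.1/iqn.2016-06.io.spdk:disk1/0
+ */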
+
+struct rpc_delete_iscsi {
+ char *name;
+};
+
+static void
+free_rpc_delete_iscsi(struct rpc_delete_iscsi *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_iscsi_decoders[] = {
+ {"name", offsetof(struct rpc_delete_iscsi, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_iscsi_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_iscsi_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_iscsi req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_iscsi_decoders,
+ SPDK_COUNTOF(rpc_delete_iscsi_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ delete_iscsi_disk(bdev, rpc_bdev_iscsi_delete_cb, request);
+
+cleanup:
+ free_rpc_delete_iscsi(&req);
+}
+SPDK_RPC_REGISTER("bdev_iscsi_delete", rpc_bdev_iscsi_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_iscsi_delete, delete_iscsi_bdev)
diff --git a/src/spdk/module/bdev/lvol/Makefile b/src/spdk/module/bdev/lvol/Makefile
new file mode 100644
index 000000000..37034593c
--- /dev/null
+++ b/src/spdk/module/bdev/lvol/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = vbdev_lvol.c vbdev_lvol_rpc.c
+LIBNAME = bdev_lvol
+LOCAL_SYS_LIBS = -luuid
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/lvol/vbdev_lvol.c b/src/spdk/module/bdev/lvol/vbdev_lvol.c
new file mode 100644
index 000000000..275d68e6a
--- /dev/null
+++ b/src/spdk/module/bdev/lvol/vbdev_lvol.c
@@ -0,0 +1,1354 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/blob_bdev.h"
+#include "spdk/rpc.h"
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+#include "spdk/string.h"
+#include "spdk/uuid.h"
+
+#include "vbdev_lvol.h"
+
+static TAILQ_HEAD(, lvol_store_bdev) g_spdk_lvol_pairs = TAILQ_HEAD_INITIALIZER(
+ g_spdk_lvol_pairs);
+
+static int vbdev_lvs_init(void);
+static int vbdev_lvs_get_ctx_size(void);
+static void vbdev_lvs_examine(struct spdk_bdev *bdev);
+
+static struct spdk_bdev_module g_lvol_if = {
+ .name = "lvol",
+ .module_init = vbdev_lvs_init,
+ .examine_disk = vbdev_lvs_examine,
+ .get_ctx_size = vbdev_lvs_get_ctx_size,
+
+};
+
+SPDK_BDEV_MODULE_REGISTER(lvol, &g_lvol_if)
+
+struct lvol_store_bdev *
+vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs_orig)
+{
+ struct spdk_lvol_store *lvs = NULL;
+ struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first();
+
+ while (lvs_bdev != NULL) {
+ lvs = lvs_bdev->lvs;
+ if (lvs == lvs_orig) {
+ if (lvs_bdev->req != NULL) {
+ /* We do not allow access to lvs that are being destroyed */
+ return NULL;
+ } else {
+ return lvs_bdev;
+ }
+ }
+ lvs_bdev = vbdev_lvol_store_next(lvs_bdev);
+ }
+
+ return NULL;
+}
+
+static int
+_vbdev_lvol_change_bdev_alias(struct spdk_lvol *lvol, const char *new_lvol_name)
+{
+ struct spdk_bdev_alias *tmp;
+ char *old_alias;
+ char *alias;
+ int rc;
+ int alias_number = 0;
+
+	/* bdevs representing lvols have exactly one alias. Since the name may
+	 * have been changed earlier (e.g. on lvs rename), iterate the alias
+	 * list to fetch the current alias and verify it is the only one. */
+
+ TAILQ_FOREACH(tmp, &lvol->bdev->aliases, tailq) {
+ if (++alias_number > 1) {
+ SPDK_ERRLOG("There is more than 1 alias in bdev %s\n", lvol->bdev->name);
+ return -EINVAL;
+ }
+
+ old_alias = tmp->alias;
+ }
+
+ if (alias_number == 0) {
+ SPDK_ERRLOG("There are no aliases in bdev %s\n", lvol->bdev->name);
+ return -EINVAL;
+ }
+
+ alias = spdk_sprintf_alloc("%s/%s", lvol->lvol_store->name, new_lvol_name);
+ if (alias == NULL) {
+ SPDK_ERRLOG("Cannot alloc memory for alias\n");
+ return -ENOMEM;
+ }
+
+ rc = spdk_bdev_alias_add(lvol->bdev, alias);
+ if (rc != 0) {
+ SPDK_ERRLOG("cannot add alias '%s'\n", alias);
+ free(alias);
+ return rc;
+ }
+ free(alias);
+
+ rc = spdk_bdev_alias_del(lvol->bdev, old_alias);
+ if (rc != 0) {
+ SPDK_ERRLOG("cannot remove alias '%s'\n", old_alias);
+ return rc;
+ }
+
+ return 0;
+}
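+
+/* Note: the new "<lvs_name>/<lvol_name>" alias is added before the old one is
+ * removed, so the bdev is never left without an alias if either step fails.
+ */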
+
+static struct lvol_store_bdev *
+vbdev_get_lvs_bdev_by_bdev(struct spdk_bdev *bdev_orig)
+{
+ struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first();
+
+ while (lvs_bdev != NULL) {
+ if (lvs_bdev->bdev == bdev_orig) {
+ if (lvs_bdev->req != NULL) {
+ /* We do not allow access to lvs that are being destroyed */
+ return NULL;
+ } else {
+ return lvs_bdev;
+ }
+ }
+ lvs_bdev = vbdev_lvol_store_next(lvs_bdev);
+ }
+
+ return NULL;
+}
+
+static void
+vbdev_lvs_hotremove_cb(void *ctx)
+{
+ struct spdk_bdev *bdev = ctx;
+ struct lvol_store_bdev *lvs_bdev;
+
+ lvs_bdev = vbdev_get_lvs_bdev_by_bdev(bdev);
+ if (lvs_bdev != NULL) {
+ vbdev_lvs_unload(lvs_bdev->lvs, NULL, NULL);
+ }
+}
+
+static void
+_vbdev_lvs_create_cb(void *cb_arg, struct spdk_lvol_store *lvs, int lvserrno)
+{
+ struct spdk_lvs_with_handle_req *req = cb_arg;
+ struct lvol_store_bdev *lvs_bdev;
+ struct spdk_bdev *bdev = req->base_bdev;
+ struct spdk_bs_dev *bs_dev = req->bs_dev;
+
+ if (lvserrno != 0) {
+ assert(lvs == NULL);
+ SPDK_ERRLOG("Cannot create lvol store bdev\n");
+ goto end;
+ }
+
+ lvserrno = spdk_bs_bdev_claim(bs_dev, &g_lvol_if);
+ if (lvserrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store base bdev already claimed by another bdev\n");
+ req->bs_dev->destroy(req->bs_dev);
+ goto end;
+ }
+
+ assert(lvs != NULL);
+
+ lvs_bdev = calloc(1, sizeof(*lvs_bdev));
+ if (!lvs_bdev) {
+ lvserrno = -ENOMEM;
+ goto end;
+ }
+ lvs_bdev->lvs = lvs;
+ lvs_bdev->bdev = bdev;
+ lvs_bdev->req = NULL;
+
+ TAILQ_INSERT_TAIL(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores);
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store bdev inserted\n");
+
+end:
+ req->cb_fn(req->cb_arg, lvs, lvserrno);
+ free(req);
+
+ return;
+}
+
+int
+vbdev_lvs_create(struct spdk_bdev *base_bdev, const char *name, uint32_t cluster_sz,
+ enum lvs_clear_method clear_method, spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_dev *bs_dev;
+ struct spdk_lvs_with_handle_req *lvs_req;
+ struct spdk_lvs_opts opts;
+ int rc;
+ int len;
+
+ if (base_bdev == NULL) {
+ SPDK_ERRLOG("Bdev does not exist\n");
+ return -ENODEV;
+ }
+
+ spdk_lvs_opts_init(&opts);
+ if (cluster_sz != 0) {
+ opts.cluster_sz = cluster_sz;
+ }
+
+ if (clear_method != 0) {
+ opts.clear_method = clear_method;
+ }
+
+ if (name == NULL) {
+ SPDK_ERRLOG("missing name param\n");
+ return -EINVAL;
+ }
+
+ len = strnlen(name, SPDK_LVS_NAME_MAX);
+
+ if (len == 0 || len == SPDK_LVS_NAME_MAX) {
+ SPDK_ERRLOG("name must be between 1 and %d characters\n", SPDK_LVS_NAME_MAX - 1);
+ return -EINVAL;
+ }
+ snprintf(opts.name, sizeof(opts.name), "%s", name);
+
+ lvs_req = calloc(1, sizeof(*lvs_req));
+ if (!lvs_req) {
+ SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n");
+ return -ENOMEM;
+ }
+
+ bs_dev = spdk_bdev_create_bs_dev(base_bdev, vbdev_lvs_hotremove_cb, base_bdev);
+ if (!bs_dev) {
+ SPDK_ERRLOG("Cannot create blobstore device\n");
+ free(lvs_req);
+ return -ENODEV;
+ }
+
+ lvs_req->bs_dev = bs_dev;
+ lvs_req->base_bdev = base_bdev;
+ lvs_req->cb_fn = cb_fn;
+ lvs_req->cb_arg = cb_arg;
+
+ rc = spdk_lvs_init(bs_dev, &opts, _vbdev_lvs_create_cb, lvs_req);
+ if (rc < 0) {
+ free(lvs_req);
+ bs_dev->destroy(bs_dev);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+_vbdev_lvs_rename_cb(void *cb_arg, int lvserrno)
+{
+ struct spdk_lvs_req *req = cb_arg;
+ struct spdk_lvol *tmp;
+
+ if (lvserrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store rename failed\n");
+ } else {
+ TAILQ_FOREACH(tmp, &req->lvol_store->lvols, link) {
+ /* We have to pass current lvol name, since only lvs name changed */
+ _vbdev_lvol_change_bdev_alias(tmp, tmp->name);
+ }
+ }
+
+ req->cb_fn(req->cb_arg, lvserrno);
+ free(req);
+}
+
+void
+vbdev_lvs_rename(struct spdk_lvol_store *lvs, const char *new_lvs_name,
+ spdk_lvs_op_complete cb_fn, void *cb_arg)
+{
+ struct lvol_store_bdev *lvs_bdev;
+
+ struct spdk_lvs_req *req;
+
+ lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs);
+ if (!lvs_bdev) {
+ SPDK_ERRLOG("No such lvol store found\n");
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->lvol_store = lvs;
+
+ spdk_lvs_rename(lvs, new_lvs_name, _vbdev_lvs_rename_cb, req);
+}
+
+static void
+_vbdev_lvs_remove_cb(void *cb_arg, int lvserrno)
+{
+ struct lvol_store_bdev *lvs_bdev = cb_arg;
+ struct spdk_lvs_req *req = lvs_bdev->req;
+
+ if (lvserrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store removed with error: %d.\n", lvserrno);
+ }
+
+ TAILQ_REMOVE(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores);
+ free(lvs_bdev);
+
+ if (req->cb_fn != NULL) {
+ req->cb_fn(req->cb_arg, lvserrno);
+ }
+ free(req);
+}
+
+static void
+_vbdev_lvs_remove_lvol_cb(void *cb_arg, int lvolerrno)
+{
+ struct lvol_store_bdev *lvs_bdev = cb_arg;
+ struct spdk_lvol_store *lvs = lvs_bdev->lvs;
+ struct spdk_lvol *lvol;
+
+ if (lvolerrno != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_LVOL, "Lvol removed with errno %d\n", lvolerrno);
+ }
+
+ if (TAILQ_EMPTY(&lvs->lvols)) {
+ spdk_lvs_destroy(lvs, _vbdev_lvs_remove_cb, lvs_bdev);
+ return;
+ }
+
+ lvol = TAILQ_FIRST(&lvs->lvols);
+ while (lvol != NULL) {
+ if (spdk_lvol_deletable(lvol)) {
+ vbdev_lvol_destroy(lvol, _vbdev_lvs_remove_lvol_cb, lvs_bdev);
+ return;
+ }
+ lvol = TAILQ_NEXT(lvol, link);
+ }
+
+	/* If no lvol is deletable, there must be a circular dependency. */
+	SPDK_ERRLOG("Lvols left in lvs, but unable to delete them.\n");
+ assert(false);
+}
+
+static void
+_vbdev_lvs_remove_bdev_unregistered_cb(void *cb_arg, int bdeverrno)
+{
+ struct lvol_store_bdev *lvs_bdev = cb_arg;
+ struct spdk_lvol_store *lvs = lvs_bdev->lvs;
+ struct spdk_lvol *lvol, *tmp;
+
+ if (bdeverrno != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_LVOL, "Lvol unregistered with errno %d\n", bdeverrno);
+ }
+
+ TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) {
+ if (lvol->ref_count != 0) {
+ /* An lvol is still open, don't unload whole lvol store. */
+ return;
+ }
+ }
+ spdk_lvs_unload(lvs, _vbdev_lvs_remove_cb, lvs_bdev);
+}
+
+static void
+_vbdev_lvs_remove(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg,
+ bool destroy)
+{
+ struct spdk_lvs_req *req;
+ struct lvol_store_bdev *lvs_bdev;
+ struct spdk_lvol *lvol, *tmp;
+ bool all_lvols_closed = true;
+
+ lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs);
+ if (!lvs_bdev) {
+ SPDK_ERRLOG("No such lvol store found\n");
+ if (cb_fn != NULL) {
+ cb_fn(cb_arg, -ENODEV);
+ }
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n");
+ if (cb_fn != NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ }
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ lvs_bdev->req = req;
+
+ TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) {
+ if (lvol->ref_count != 0) {
+ all_lvols_closed = false;
+ }
+ }
+
+ if (all_lvols_closed == true) {
+ if (destroy) {
+ spdk_lvs_destroy(lvs, _vbdev_lvs_remove_cb, lvs_bdev);
+ } else {
+ spdk_lvs_unload(lvs, _vbdev_lvs_remove_cb, lvs_bdev);
+ }
+ } else {
+ lvs->destruct = destroy;
+ if (destroy) {
+ _vbdev_lvs_remove_lvol_cb(lvs_bdev, 0);
+ } else {
+ TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) {
+ spdk_bdev_unregister(lvol->bdev, _vbdev_lvs_remove_bdev_unregistered_cb, lvs_bdev);
+ }
+ }
+ }
+}
+
+void
+vbdev_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg)
+{
+ _vbdev_lvs_remove(lvs, cb_fn, cb_arg, false);
+}
+
+void
+vbdev_lvs_destruct(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg)
+{
+ _vbdev_lvs_remove(lvs, cb_fn, cb_arg, true);
+}
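+
+/* Note: unload only detaches the lvolstore, leaving its metadata intact on the
+ * base bdev so it can be re-examined later, while destruct additionally
+ * deletes every lvol and the store itself from disk.
+ */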
+
+struct lvol_store_bdev *
+vbdev_lvol_store_first(void)
+{
+ struct lvol_store_bdev *lvs_bdev;
+
+ lvs_bdev = TAILQ_FIRST(&g_spdk_lvol_pairs);
+ if (lvs_bdev) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Starting lvolstore iteration at %p\n", lvs_bdev->lvs);
+ }
+
+ return lvs_bdev;
+}
+
+struct lvol_store_bdev *
+vbdev_lvol_store_next(struct lvol_store_bdev *prev)
+{
+ struct lvol_store_bdev *lvs_bdev;
+
+ if (prev == NULL) {
+ SPDK_ERRLOG("prev argument cannot be NULL\n");
+ return NULL;
+ }
+
+ lvs_bdev = TAILQ_NEXT(prev, lvol_stores);
+ if (lvs_bdev) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Continuing lvolstore iteration at %p\n", lvs_bdev->lvs);
+ }
+
+ return lvs_bdev;
+}
+
+static struct spdk_lvol_store *
+_vbdev_get_lvol_store_by_uuid(const struct spdk_uuid *uuid)
+{
+ struct spdk_lvol_store *lvs = NULL;
+ struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first();
+
+ while (lvs_bdev != NULL) {
+ lvs = lvs_bdev->lvs;
+ if (spdk_uuid_compare(&lvs->uuid, uuid) == 0) {
+ return lvs;
+ }
+ lvs_bdev = vbdev_lvol_store_next(lvs_bdev);
+ }
+ return NULL;
+}
+
+struct spdk_lvol_store *
+vbdev_get_lvol_store_by_uuid(const char *uuid_str)
+{
+ struct spdk_uuid uuid;
+
+ if (spdk_uuid_parse(&uuid, uuid_str)) {
+ return NULL;
+ }
+
+ return _vbdev_get_lvol_store_by_uuid(&uuid);
+}
+
+struct spdk_lvol_store *
+vbdev_get_lvol_store_by_name(const char *name)
+{
+ struct spdk_lvol_store *lvs = NULL;
+ struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first();
+
+ while (lvs_bdev != NULL) {
+ lvs = lvs_bdev->lvs;
+ if (strncmp(lvs->name, name, sizeof(lvs->name)) == 0) {
+ return lvs;
+ }
+ lvs_bdev = vbdev_lvol_store_next(lvs_bdev);
+ }
+ return NULL;
+}
+
+struct vbdev_lvol_destroy_ctx {
+ struct spdk_lvol *lvol;
+ spdk_lvol_op_complete cb_fn;
+ void *cb_arg;
+};
+
+static void
+_vbdev_lvol_unregister_cb(void *ctx, int lvolerrno)
+{
+ struct spdk_bdev *bdev = ctx;
+
+ spdk_bdev_destruct_done(bdev, lvolerrno);
+ free(bdev);
+}
+
+static int
+vbdev_lvol_unregister(void *ctx)
+{
+ struct spdk_lvol *lvol = ctx;
+
+ assert(lvol != NULL);
+
+ spdk_bdev_alias_del_all(lvol->bdev);
+ spdk_lvol_close(lvol, _vbdev_lvol_unregister_cb, lvol->bdev);
+
+ /* return 1 to indicate we have an operation that must finish asynchronously before the
+ * lvol is closed
+ */
+ return 1;
+}
+
+static void
+_vbdev_lvol_destroy_cb(void *cb_arg, int bdeverrno)
+{
+ struct vbdev_lvol_destroy_ctx *ctx = cb_arg;
+ struct spdk_lvol *lvol = ctx->lvol;
+
+ if (bdeverrno < 0) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Could not unregister bdev during lvol (%s) destroy\n",
+ lvol->unique_id);
+ ctx->cb_fn(ctx->cb_arg, bdeverrno);
+ free(ctx);
+ return;
+ }
+
+ spdk_lvol_destroy(lvol, ctx->cb_fn, ctx->cb_arg);
+ free(ctx);
+}
+
+void
+vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct vbdev_lvol_destroy_ctx *ctx;
+ size_t count;
+
+ assert(lvol != NULL);
+ assert(cb_fn != NULL);
+
+	/* Check whether the lvol can be deleted: a blob with more than one
+	 * clone must not be destroyed. */
+	spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count);
+	if (count > 1) {
+		SPDK_ERRLOG("Cannot delete lvol %s: it has more than one clone\n", lvol->name);
+ cb_fn(cb_arg, -EPERM);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->lvol = lvol;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ spdk_bdev_unregister(lvol->bdev, _vbdev_lvol_destroy_cb, ctx);
+}
+
+static char *
+vbdev_lvol_find_name(struct spdk_lvol *lvol, spdk_blob_id blob_id)
+{
+ struct spdk_lvol_store *lvs;
+ struct spdk_lvol *_lvol;
+
+ assert(lvol != NULL);
+
+ lvs = lvol->lvol_store;
+
+ assert(lvs);
+
+ TAILQ_FOREACH(_lvol, &lvs->lvols, link) {
+ if (_lvol->blob_id == blob_id) {
+ return _lvol->name;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+vbdev_lvol_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct spdk_lvol *lvol = ctx;
+ struct lvol_store_bdev *lvs_bdev;
+ struct spdk_bdev *bdev;
+ struct spdk_blob *blob;
+ char lvol_store_uuid[SPDK_UUID_STRING_LEN];
+ spdk_blob_id *ids = NULL;
+ size_t count, i;
+ char *name;
+ int rc = 0;
+
+ spdk_json_write_named_object_begin(w, "lvol");
+
+ lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store);
+ if (!lvs_bdev) {
+ SPDK_ERRLOG("No such lvol store found\n");
+ rc = -ENODEV;
+ goto end;
+ }
+
+ bdev = lvs_bdev->bdev;
+
+ spdk_uuid_fmt_lower(lvol_store_uuid, sizeof(lvol_store_uuid), &lvol->lvol_store->uuid);
+ spdk_json_write_named_string(w, "lvol_store_uuid", lvol_store_uuid);
+
+ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(bdev));
+
+ blob = lvol->blob;
+
+ spdk_json_write_named_bool(w, "thin_provision", spdk_blob_is_thin_provisioned(blob));
+
+ spdk_json_write_named_bool(w, "snapshot", spdk_blob_is_snapshot(blob));
+
+ spdk_json_write_named_bool(w, "clone", spdk_blob_is_clone(blob));
+
+ if (spdk_blob_is_clone(blob)) {
+ spdk_blob_id snapshotid = spdk_blob_get_parent_snapshot(lvol->lvol_store->blobstore, lvol->blob_id);
+ if (snapshotid != SPDK_BLOBID_INVALID) {
+ name = vbdev_lvol_find_name(lvol, snapshotid);
+ if (name != NULL) {
+ spdk_json_write_named_string(w, "base_snapshot", name);
+ } else {
+				SPDK_ERRLOG("Cannot obtain snapshot name\n");
+ }
+ }
+ }
+
+ if (spdk_blob_is_snapshot(blob)) {
+		/* Query the number of clones: with ids == NULL the call sets
+		 * count and returns -ENOMEM. */
+ rc = spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count);
+ if (rc == -ENOMEM && count > 0) {
+ ids = malloc(sizeof(spdk_blob_id) * count);
+ if (ids == NULL) {
+ SPDK_ERRLOG("Cannot allocate memory\n");
+ rc = -ENOMEM;
+ goto end;
+ }
+
+ rc = spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, ids, &count);
+ if (rc == 0) {
+ spdk_json_write_named_array_begin(w, "clones");
+ for (i = 0; i < count; i++) {
+ name = vbdev_lvol_find_name(lvol, ids[i]);
+ if (name != NULL) {
+ spdk_json_write_string(w, name);
+ } else {
+ SPDK_ERRLOG("Cannot obtain clone name\n");
+ }
+
+ }
+ spdk_json_write_array_end(w);
+ }
+ free(ids);
+ }
+
+ }
+
+end:
+ spdk_json_write_object_end(w);
+
+ return rc;
+}
+
+static void
+vbdev_lvol_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ /* Nothing to dump as lvol configuration is saved on physical device. */
+}
+
+static struct spdk_io_channel *
+vbdev_lvol_get_io_channel(void *ctx)
+{
+ struct spdk_lvol *lvol = ctx;
+
+ return spdk_lvol_get_io_channel(lvol);
+}
+
+static bool
+vbdev_lvol_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct spdk_lvol *lvol = ctx;
+
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ return !spdk_blob_is_read_only(lvol->blob);
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_READ:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void
+lvol_op_comp(void *cb_arg, int bserrno)
+{
+ struct spdk_bdev_io *bdev_io = cb_arg;
+ enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ if (bserrno != 0) {
+ if (bserrno == -ENOMEM) {
+ status = SPDK_BDEV_IO_STATUS_NOMEM;
+ } else {
+ status = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Vbdev processing callback on device %s with type %d\n",
+ bdev_io->bdev->name, bdev_io->type);
+ spdk_bdev_io_complete(bdev_io, status);
+}
+
+static void
+lvol_unmap(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ uint64_t start_page, num_pages;
+ struct spdk_blob *blob = lvol->blob;
+
+ start_page = bdev_io->u.bdev.offset_blocks;
+ num_pages = bdev_io->u.bdev.num_blocks;
+
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL,
+ "Vbdev doing unmap at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page,
+ num_pages, bdev_io->bdev->name);
+ spdk_blob_io_unmap(blob, ch, start_page, num_pages, lvol_op_comp, bdev_io);
+}
+
+static void
+lvol_write_zeroes(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ uint64_t start_page, num_pages;
+ struct spdk_blob *blob = lvol->blob;
+
+ start_page = bdev_io->u.bdev.offset_blocks;
+ num_pages = bdev_io->u.bdev.num_blocks;
+
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL,
+		     "Vbdev doing write zeroes at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page,
+ num_pages, bdev_io->bdev->name);
+ spdk_blob_io_write_zeroes(blob, ch, start_page, num_pages, lvol_op_comp, bdev_io);
+}
+
+static void
+lvol_read(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ uint64_t start_page, num_pages;
+ struct spdk_lvol *lvol = bdev_io->bdev->ctxt;
+ struct spdk_blob *blob = lvol->blob;
+
+ start_page = bdev_io->u.bdev.offset_blocks;
+ num_pages = bdev_io->u.bdev.num_blocks;
+
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL,
+ "Vbdev doing read at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page,
+ num_pages, bdev_io->bdev->name);
+ spdk_blob_io_readv(blob, ch, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, start_page,
+ num_pages, lvol_op_comp, bdev_io);
+}
+
+static void
+lvol_write(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ uint64_t start_page, num_pages;
+ struct spdk_blob *blob = lvol->blob;
+
+ start_page = bdev_io->u.bdev.offset_blocks;
+ num_pages = bdev_io->u.bdev.num_blocks;
+
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL,
+ "Vbdev doing write at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page,
+ num_pages, bdev_io->bdev->name);
+ spdk_blob_io_writev(blob, ch, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, start_page,
+ num_pages, lvol_op_comp, bdev_io);
+}
+
+static int
+lvol_reset(struct spdk_bdev_io *bdev_io)
+{
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+
+ return 0;
+}
+
+static void
+lvol_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ lvol_read(ch, bdev_io);
+}
+
+static void
+vbdev_lvol_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_lvol *lvol = bdev_io->bdev->ctxt;
+
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Vbdev request type %d submitted\n", bdev_io->type);
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, lvol_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ lvol_write(lvol, ch, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ lvol_reset(bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ lvol_unmap(lvol, ch, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ lvol_write_zeroes(lvol, ch, bdev_io);
+ break;
+ default:
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "lvol: unsupported I/O type %d\n", bdev_io->type);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+ return;
+}
+
+static struct spdk_bdev_fn_table vbdev_lvol_fn_table = {
+ .destruct = vbdev_lvol_unregister,
+ .io_type_supported = vbdev_lvol_io_type_supported,
+ .submit_request = vbdev_lvol_submit_request,
+ .get_io_channel = vbdev_lvol_get_io_channel,
+ .dump_info_json = vbdev_lvol_dump_info_json,
+ .write_config_json = vbdev_lvol_write_config_json,
+};
+
+static void
+lvol_destroy_cb(void *cb_arg, int bdeverrno)
+{
+}
+
+static void
+_create_lvol_disk_destroy_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_lvol *lvol = cb_arg;
+
+ if (bdeverrno < 0) {
+ SPDK_ERRLOG("Could not unregister bdev for lvol %s\n",
+ lvol->unique_id);
+ return;
+ }
+
+ spdk_lvol_destroy(lvol, lvol_destroy_cb, NULL);
+}
+
+static void
+_create_lvol_disk_unload_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_lvol *lvol = cb_arg;
+
+ if (bdeverrno < 0) {
+ SPDK_ERRLOG("Could not unregister bdev for lvol %s\n",
+ lvol->unique_id);
+ return;
+ }
+
+ TAILQ_REMOVE(&lvol->lvol_store->lvols, lvol, link);
+ free(lvol);
+}
+
+static int
+_create_lvol_disk(struct spdk_lvol *lvol, bool destroy)
+{
+ struct spdk_bdev *bdev;
+ struct lvol_store_bdev *lvs_bdev;
+ uint64_t total_size;
+	char *alias;
+ int rc;
+
+ lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store);
+ if (lvs_bdev == NULL) {
+ SPDK_ERRLOG("No spdk lvs-bdev pair found for lvol %s\n", lvol->unique_id);
+ return -ENODEV;
+ }
+
+ bdev = calloc(1, sizeof(struct spdk_bdev));
+ if (!bdev) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol bdev\n");
+ return -ENOMEM;
+ }
+
+ bdev->name = lvol->unique_id;
+ bdev->product_name = "Logical Volume";
+ bdev->blocklen = spdk_bs_get_io_unit_size(lvol->lvol_store->blobstore);
+ total_size = spdk_blob_get_num_clusters(lvol->blob) *
+ spdk_bs_get_cluster_size(lvol->lvol_store->blobstore);
+ assert((total_size % bdev->blocklen) == 0);
+ bdev->blockcnt = total_size / bdev->blocklen;
+ bdev->uuid = lvol->uuid;
+ bdev->required_alignment = lvs_bdev->bdev->required_alignment;
+ bdev->split_on_optimal_io_boundary = true;
+ bdev->optimal_io_boundary = spdk_bs_get_cluster_size(lvol->lvol_store->blobstore) / bdev->blocklen;
+
+ bdev->ctxt = lvol;
+ bdev->fn_table = &vbdev_lvol_fn_table;
+ bdev->module = &g_lvol_if;
+
+ rc = spdk_bdev_register(bdev);
+ if (rc) {
+ free(bdev);
+ return rc;
+ }
+ lvol->bdev = bdev;
+
+ alias = spdk_sprintf_alloc("%s/%s", lvs_bdev->lvs->name, lvol->name);
+ if (alias == NULL) {
+ SPDK_ERRLOG("Cannot alloc memory for alias\n");
+ spdk_bdev_unregister(lvol->bdev, (destroy ? _create_lvol_disk_destroy_cb :
+ _create_lvol_disk_unload_cb), lvol);
+ return -ENOMEM;
+ }
+
+ rc = spdk_bdev_alias_add(bdev, alias);
+ if (rc != 0) {
+ SPDK_ERRLOG("Cannot add alias to lvol bdev\n");
+ spdk_bdev_unregister(lvol->bdev, (destroy ? _create_lvol_disk_destroy_cb :
+ _create_lvol_disk_unload_cb), lvol);
+ }
+ free(alias);
+
+ return rc;
+}
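+
+/* Sizing example for the math above: with a 4 MiB cluster and a 512-byte
+ * io_unit, a blob of 10 clusters gives total_size = 10 * 4 MiB = 40 MiB,
+ * blockcnt = 40 MiB / 512 = 81920 blocks, and optimal_io_boundary =
+ * 4 MiB / 512 = 8192 blocks (one cluster).
+ */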
+
+static void
+_vbdev_lvol_create_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno)
+{
+ struct spdk_lvol_with_handle_req *req = cb_arg;
+
+ if (lvolerrno < 0) {
+ goto end;
+ }
+
+ lvolerrno = _create_lvol_disk(lvol, true);
+
+end:
+ req->cb_fn(req->cb_arg, lvol, lvolerrno);
+ free(req);
+}
+
+int
+vbdev_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz,
+ bool thin_provision, enum lvol_clear_method clear_method, spdk_lvol_op_with_handle_complete cb_fn,
+ void *cb_arg)
+{
+ struct spdk_lvol_with_handle_req *req;
+ int rc;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ rc = spdk_lvol_create(lvs, name, sz, thin_provision, clear_method,
+ _vbdev_lvol_create_cb, req);
+ if (rc != 0) {
+ free(req);
+ }
+
+ return rc;
+}
+
+void
+vbdev_lvol_create_snapshot(struct spdk_lvol *lvol, const char *snapshot_name,
+ spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_with_handle_req *req;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ spdk_lvol_create_snapshot(lvol, snapshot_name, _vbdev_lvol_create_cb, req);
+}
+
+void
+vbdev_lvol_create_clone(struct spdk_lvol *lvol, const char *clone_name,
+ spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_with_handle_req *req;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ spdk_lvol_create_clone(lvol, clone_name, _vbdev_lvol_create_cb, req);
+}
+
+static void
+_vbdev_lvol_rename_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+
+ if (lvolerrno != 0) {
+ SPDK_ERRLOG("Renaming lvol failed\n");
+ }
+
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+void
+vbdev_lvol_rename(struct spdk_lvol *lvol, const char *new_lvol_name,
+ spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_req *req;
+ int rc;
+
+ rc = _vbdev_lvol_change_bdev_alias(lvol, new_lvol_name);
+ if (rc != 0) {
+		SPDK_ERRLOG("renaming lvol to '%s' failed\n", new_lvol_name);
+ cb_fn(cb_arg, rc);
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ spdk_lvol_rename(lvol, new_lvol_name, _vbdev_lvol_rename_cb, req);
+}
+
+static void
+_vbdev_lvol_resize_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+ struct spdk_lvol *lvol = req->lvol;
+ uint64_t total_size;
+
+ /* change bdev size */
+ if (lvolerrno != 0) {
+		SPDK_ERRLOG("Callback for bdev lvol %s received error: %d.\n", lvol->name, lvolerrno);
+ goto finish;
+ }
+
+ total_size = spdk_blob_get_num_clusters(lvol->blob) *
+ spdk_bs_get_cluster_size(lvol->lvol_store->blobstore);
+ assert((total_size % lvol->bdev->blocklen) == 0);
+
+ lvolerrno = spdk_bdev_notify_blockcnt_change(lvol->bdev, total_size / lvol->bdev->blocklen);
+ if (lvolerrno != 0) {
+		SPDK_ERRLOG("Could not change the block count of bdev lvol %s: error %d.\n",
+			    lvol->name, lvolerrno);
+ }
+
+finish:
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+void
+vbdev_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_req *req;
+
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ assert(lvol->bdev != NULL);
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->sz = sz;
+ req->lvol = lvol;
+
+ spdk_lvol_resize(req->lvol, req->sz, _vbdev_lvol_resize_cb, req);
+}
+
+static void
+_vbdev_lvol_set_read_only_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+ struct spdk_lvol *lvol = req->lvol;
+
+ if (lvolerrno != 0) {
+ SPDK_ERRLOG("Could not set bdev lvol %s as read only due to error: %d.\n", lvol->name, lvolerrno);
+ }
+
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+void
+vbdev_lvol_set_read_only(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_req *req;
+
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ assert(lvol->bdev != NULL);
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->lvol = lvol;
+
+ spdk_lvol_set_read_only(lvol, _vbdev_lvol_set_read_only_cb, req);
+}
+
+static int
+vbdev_lvs_init(void)
+{
+ return 0;
+}
+
+static int
+vbdev_lvs_get_ctx_size(void)
+{
+ return 0;
+}
+
+static void
+_vbdev_lvs_examine_failed(void *cb_arg, int lvserrno)
+{
+ spdk_bdev_module_examine_done(&g_lvol_if);
+}
+
+static void
+_vbdev_lvol_examine_close_cb(struct spdk_lvol_store *lvs)
+{
+ if (lvs->lvols_opened >= lvs->lvol_count) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvols finished\n");
+ spdk_bdev_module_examine_done(&g_lvol_if);
+ }
+}
+
+static void
+_vbdev_lvs_examine_finish(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno)
+{
+ struct spdk_lvol_store *lvs = cb_arg;
+
+ if (lvolerrno != 0) {
+ SPDK_ERRLOG("Error opening lvol %s\n", lvol->unique_id);
+ TAILQ_REMOVE(&lvs->lvols, lvol, link);
+ lvs->lvol_count--;
+ free(lvol);
+ goto end;
+ }
+
+ if (_create_lvol_disk(lvol, false)) {
+ SPDK_ERRLOG("Cannot create bdev for lvol %s\n", lvol->unique_id);
+ lvs->lvol_count--;
+ _vbdev_lvol_examine_close_cb(lvs);
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvol %s failed\n", lvol->unique_id);
+ return;
+ }
+
+ lvs->lvols_opened++;
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvol %s succeeded\n", lvol->unique_id);
+
+end:
+
+ if (lvs->lvols_opened >= lvs->lvol_count) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvols finished\n");
+ spdk_bdev_module_examine_done(&g_lvol_if);
+ }
+}
+
+static void
+_vbdev_lvs_examine_cb(void *arg, struct spdk_lvol_store *lvol_store, int lvserrno)
+{
+ struct lvol_store_bdev *lvs_bdev;
+ struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)arg;
+ struct spdk_lvol *lvol, *tmp;
+
+ if (lvserrno == -EEXIST) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL,
+ "Name for lvolstore on device %s conflicts with name for already loaded lvs\n",
+ req->base_bdev->name);
+ /* On error blobstore destroys bs_dev itself */
+ spdk_bdev_module_examine_done(&g_lvol_if);
+ goto end;
+ } else if (lvserrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store not found on %s\n", req->base_bdev->name);
+ /* On error blobstore destroys bs_dev itself */
+ spdk_bdev_module_examine_done(&g_lvol_if);
+ goto end;
+ }
+
+ lvserrno = spdk_bs_bdev_claim(lvol_store->bs_dev, &g_lvol_if);
+ if (lvserrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store base bdev already claimed by another bdev\n");
+ spdk_lvs_unload(lvol_store, _vbdev_lvs_examine_failed, NULL);
+ goto end;
+ }
+
+ lvs_bdev = calloc(1, sizeof(*lvs_bdev));
+ if (!lvs_bdev) {
+ SPDK_ERRLOG("Cannot alloc memory for lvs_bdev\n");
+ spdk_lvs_unload(lvol_store, _vbdev_lvs_examine_failed, NULL);
+ goto end;
+ }
+
+ lvs_bdev->lvs = lvol_store;
+ lvs_bdev->bdev = req->base_bdev;
+
+ TAILQ_INSERT_TAIL(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores);
+
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store found on %s - begin parsing\n",
+ req->base_bdev->name);
+
+ lvol_store->lvols_opened = 0;
+
+ if (TAILQ_EMPTY(&lvol_store->lvols)) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store examination done\n");
+ spdk_bdev_module_examine_done(&g_lvol_if);
+ } else {
+ /* Open all lvols */
+ TAILQ_FOREACH_SAFE(lvol, &lvol_store->lvols, link, tmp) {
+ spdk_lvol_open(lvol, _vbdev_lvs_examine_finish, lvol_store);
+ }
+ }
+
+end:
+ free(req);
+}
+
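+/*
+ * Examine callback invoked for each newly registered bdev: attempt to load
+ * an lvolstore from it. On success _vbdev_lvs_examine_cb() claims the base
+ * bdev, then opens every lvol and creates a bdev on top of it; examine is
+ * reported done once all lvols have been processed or on any failure.
+ */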
+static void
+vbdev_lvs_examine(struct spdk_bdev *bdev)
+{
+ struct spdk_bs_dev *bs_dev;
+ struct spdk_lvs_with_handle_req *req;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ spdk_bdev_module_examine_done(&g_lvol_if);
+ SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n");
+ return;
+ }
+
+ bs_dev = spdk_bdev_create_bs_dev(bdev, vbdev_lvs_hotremove_cb, bdev);
+ if (!bs_dev) {
+ SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Cannot create bs dev on %s\n", bdev->name);
+ spdk_bdev_module_examine_done(&g_lvol_if);
+ free(req);
+ return;
+ }
+
+ req->base_bdev = bdev;
+
+ spdk_lvs_load(bs_dev, _vbdev_lvs_examine_cb, req);
+}
+
+struct spdk_lvol *
+vbdev_lvol_get_from_bdev(struct spdk_bdev *bdev)
+{
+ if (!bdev || bdev->module != &g_lvol_if) {
+ return NULL;
+ }
+
+ if (bdev->ctxt == NULL) {
+ SPDK_ERRLOG("No lvol ctx assigned to bdev %s\n", bdev->name);
+ return NULL;
+ }
+
+ return (struct spdk_lvol *)bdev->ctxt;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_lvol", SPDK_LOG_VBDEV_LVOL);
diff --git a/src/spdk/module/bdev/lvol/vbdev_lvol.h b/src/spdk/module/bdev/lvol/vbdev_lvol.h
new file mode 100644
index 000000000..ed3eb1c8e
--- /dev/null
+++ b/src/spdk/module/bdev/lvol/vbdev_lvol.h
@@ -0,0 +1,130 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_LVOL_H
+#define SPDK_VBDEV_LVOL_H
+
+#include "spdk/lvol.h"
+#include "spdk/bdev_module.h"
+
+#include "spdk_internal/lvolstore.h"
+
+struct lvol_store_bdev {
+ struct spdk_lvol_store *lvs;
+ struct spdk_bdev *bdev;
+ struct spdk_lvs_req *req;
+
+ TAILQ_ENTRY(lvol_store_bdev) lvol_stores;
+};
+
+int vbdev_lvs_create(struct spdk_bdev *base_bdev, const char *name, uint32_t cluster_sz,
+ enum lvs_clear_method clear_method, spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg);
+void vbdev_lvs_destruct(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg);
+void vbdev_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg);
+
+int vbdev_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz,
+ bool thin_provisioned, enum lvol_clear_method clear_method,
+ spdk_lvol_op_with_handle_complete cb_fn,
+ void *cb_arg);
+
+void vbdev_lvol_create_snapshot(struct spdk_lvol *lvol, const char *snapshot_name,
+ spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg);
+
+void vbdev_lvol_create_clone(struct spdk_lvol *lvol, const char *clone_name,
+ spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg);
+
+/**
+ * \brief Change the size of an lvol
+ * \param lvol Handle to lvol
+ * \param sz New size of lvol
+ * \param cb_fn Completion callback
+ * \param cb_arg Completion callback custom arguments
+ */
+void vbdev_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, spdk_lvol_op_complete cb_fn,
+ void *cb_arg);
+
+/**
+ * \brief Mark lvol as read only
+ * \param lvol Handle to lvol
+ * \param cb_fn Completion callback
+ * \param cb_arg Completion callback custom arguments
+ */
+void vbdev_lvol_set_read_only(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg);
+
+void vbdev_lvol_rename(struct spdk_lvol *lvol, const char *new_lvol_name,
+ spdk_lvol_op_complete cb_fn, void *cb_arg);
+
+/**
+ * Destroy a logical volume
+ * \param lvol Handle to lvol
+ * \param cb_fn Completion callback
+ * \param cb_arg Completion callback custom arguments
+ */
+void vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg);
+
+/**
+ * \brief Renames given lvolstore.
+ *
+ * \param lvs Pointer to lvolstore
+ * \param new_name New name of lvs
+ * \param cb_fn Completion callback
+ * \param cb_arg Completion callback custom arguments
+ */
+void vbdev_lvs_rename(struct spdk_lvol_store *lvs, const char *new_lvs_name,
+ spdk_lvs_op_complete cb_fn, void *cb_arg);
+
+/**
+ * \brief Search for handle to lvolstore
+ * \param uuid_str UUID of lvolstore
+ * \return Handle to spdk_lvol_store or NULL if not found.
+ */
+struct spdk_lvol_store *vbdev_get_lvol_store_by_uuid(const char *uuid_str);
+
+/**
+ * \brief Search for handle to lvolstore
+ * \param name name of lvolstore
+ * \return Handle to spdk_lvol_store or NULL if not found.
+ */
+struct spdk_lvol_store *vbdev_get_lvol_store_by_name(const char *name);
+
+/**
+ * \brief Search for handle to lvol_store_bdev
+ * \param lvs handle to lvolstore
+ * \return Handle to lvol_store_bdev or NULL if not found.
+ */
+struct lvol_store_bdev *vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs);
+
+struct spdk_lvol *vbdev_lvol_get_from_bdev(struct spdk_bdev *bdev);
+
+#endif /* SPDK_VBDEV_LVOL_H */
diff --git a/src/spdk/module/bdev/lvol/vbdev_lvol_rpc.c b/src/spdk/module/bdev/lvol/vbdev_lvol_rpc.c
new file mode 100644
index 000000000..79e74f6a5
--- /dev/null
+++ b/src/spdk/module/bdev/lvol/vbdev_lvol_rpc.c
@@ -0,0 +1,1098 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/bdev.h"
+#include "spdk/util.h"
+#include "vbdev_lvol.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+SPDK_LOG_REGISTER_COMPONENT("lvolrpc", SPDK_LOG_LVOL_RPC)
+
+struct rpc_bdev_lvol_create_lvstore {
+ char *lvs_name;
+ char *bdev_name;
+ uint32_t cluster_sz;
+ char *clear_method;
+};
+
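+/*
+ * Resolve an lvolstore from exactly one of its UUID or its name. Callers
+ * pass the (possibly NULL) strings straight from the decoded RPC params;
+ * supplying both or neither is rejected with -EINVAL.
+ */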
+static int
+vbdev_get_lvol_store_by_uuid_xor_name(const char *uuid, const char *lvs_name,
+ struct spdk_lvol_store **lvs)
+{
+	if (uuid == NULL && lvs_name == NULL) {
+		SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "neither lvs UUID nor lvs name specified\n");
+		return -EINVAL;
+	} else if (uuid && lvs_name) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "both lvs UUID '%s' and lvs name '%s' specified\n", uuid,
+ lvs_name);
+ return -EINVAL;
+ } else if (uuid) {
+ *lvs = vbdev_get_lvol_store_by_uuid(uuid);
+
+ if (*lvs == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "blobstore with UUID '%s' not found\n", uuid);
+ return -ENODEV;
+ }
+ } else if (lvs_name) {
+
+ *lvs = vbdev_get_lvol_store_by_name(lvs_name);
+
+ if (*lvs == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "blobstore with name '%s' not found\n", lvs_name);
+ return -ENODEV;
+ }
+ }
+ return 0;
+}
+
+static void
+free_rpc_bdev_lvol_create_lvstore(struct rpc_bdev_lvol_create_lvstore *req)
+{
+ free(req->bdev_name);
+ free(req->lvs_name);
+ free(req->clear_method);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_create_lvstore_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_bdev_lvol_create_lvstore, bdev_name), spdk_json_decode_string},
+ {"cluster_sz", offsetof(struct rpc_bdev_lvol_create_lvstore, cluster_sz), spdk_json_decode_uint32, true},
+ {"lvs_name", offsetof(struct rpc_bdev_lvol_create_lvstore, lvs_name), spdk_json_decode_string},
+ {"clear_method", offsetof(struct rpc_bdev_lvol_create_lvstore, clear_method), spdk_json_decode_string, true},
+};
+
+static void
+rpc_lvol_store_construct_cb(void *cb_arg, struct spdk_lvol_store *lvol_store, int lvserrno)
+{
+ struct spdk_json_write_ctx *w;
+ char lvol_store_uuid[SPDK_UUID_STRING_LEN];
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvserrno != 0) {
+ goto invalid;
+ }
+
+ spdk_uuid_fmt_lower(lvol_store_uuid, sizeof(lvol_store_uuid), &lvol_store->uuid);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, lvol_store_uuid);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvserrno));
+}
+
+static void
+rpc_bdev_lvol_create_lvstore(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_create_lvstore req = {};
+ struct spdk_bdev *bdev;
+ int rc = 0;
+ enum lvs_clear_method clear_method;
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_create_lvstore_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_create_lvstore_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.bdev_name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev '%s' does not exist\n", req.bdev_name);
+ spdk_jsonrpc_send_error_response_fmt(request, -ENODEV, "Bdev %s not found", req.bdev_name);
+ goto cleanup;
+ }
+
+ if (req.clear_method != NULL) {
+ if (!strcasecmp(req.clear_method, "none")) {
+ clear_method = LVS_CLEAR_WITH_NONE;
+ } else if (!strcasecmp(req.clear_method, "unmap")) {
+ clear_method = LVS_CLEAR_WITH_UNMAP;
+ } else if (!strcasecmp(req.clear_method, "write_zeroes")) {
+ clear_method = LVS_CLEAR_WITH_WRITE_ZEROES;
+ } else {
+ spdk_jsonrpc_send_error_response(request, -EINVAL, "Invalid clear_method parameter");
+ goto cleanup;
+ }
+ } else {
+ clear_method = LVS_CLEAR_WITH_UNMAP;
+ }
+
+ rc = vbdev_lvs_create(bdev, req.lvs_name, req.cluster_sz, clear_method,
+ rpc_lvol_store_construct_cb, request);
+ if (rc < 0) {
+		spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+ free_rpc_bdev_lvol_create_lvstore(&req);
+
+ return;
+
+cleanup:
+ free_rpc_bdev_lvol_create_lvstore(&req);
+}
+SPDK_RPC_REGISTER("bdev_lvol_create_lvstore", rpc_bdev_lvol_create_lvstore, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_create_lvstore, construct_lvol_store)
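+
+/*
+ * Example bdev_lvol_create_lvstore request (illustrative values only;
+ * "cluster_sz" and "clear_method" are optional, and clear_method defaults
+ * to "unmap" as handled above):
+ *
+ *   {"jsonrpc": "2.0", "method": "bdev_lvol_create_lvstore", "id": 1,
+ *    "params": {"bdev_name": "Malloc0", "lvs_name": "lvs0",
+ *               "clear_method": "unmap"}}
+ *
+ * On success the result is the UUID string of the new lvolstore.
+ */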
+
+struct rpc_bdev_lvol_rename_lvstore {
+ char *old_name;
+ char *new_name;
+};
+
+static void
+free_rpc_bdev_lvol_rename_lvstore(struct rpc_bdev_lvol_rename_lvstore *req)
+{
+ free(req->old_name);
+ free(req->new_name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_rename_lvstore_decoders[] = {
+ {"old_name", offsetof(struct rpc_bdev_lvol_rename_lvstore, old_name), spdk_json_decode_string},
+ {"new_name", offsetof(struct rpc_bdev_lvol_rename_lvstore, new_name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_lvol_rename_lvstore_cb(void *cb_arg, int lvserrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvserrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvserrno));
+}
+
+static void
+rpc_bdev_lvol_rename_lvstore(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_rename_lvstore req = {};
+ struct spdk_lvol_store *lvs;
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_rename_lvstore_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_rename_lvstore_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ lvs = vbdev_get_lvol_store_by_name(req.old_name);
+ if (lvs == NULL) {
+		SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "no lvs exists with the given name\n");
+ spdk_jsonrpc_send_error_response_fmt(request, -ENOENT, "Lvol store %s not found", req.old_name);
+ goto cleanup;
+ }
+
+ vbdev_lvs_rename(lvs, req.new_name, rpc_bdev_lvol_rename_lvstore_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_rename_lvstore(&req);
+}
+SPDK_RPC_REGISTER("bdev_lvol_rename_lvstore", rpc_bdev_lvol_rename_lvstore, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_rename_lvstore, rename_lvol_store)
+
+struct rpc_bdev_lvol_delete_lvstore {
+ char *uuid;
+ char *lvs_name;
+};
+
+static void
+free_rpc_bdev_lvol_delete_lvstore(struct rpc_bdev_lvol_delete_lvstore *req)
+{
+ free(req->uuid);
+ free(req->lvs_name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_delete_lvstore_decoders[] = {
+ {"uuid", offsetof(struct rpc_bdev_lvol_delete_lvstore, uuid), spdk_json_decode_string, true},
+ {"lvs_name", offsetof(struct rpc_bdev_lvol_delete_lvstore, lvs_name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_lvol_store_destroy_cb(void *cb_arg, int lvserrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvserrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvserrno));
+}
+
+static void
+rpc_bdev_lvol_delete_lvstore(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_delete_lvstore req = {};
+ struct spdk_lvol_store *lvs = NULL;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_delete_lvstore_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_delete_lvstore_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, req.lvs_name, &lvs);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ vbdev_lvs_destruct(lvs, rpc_lvol_store_destroy_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_delete_lvstore(&req);
+}
+SPDK_RPC_REGISTER("bdev_lvol_delete_lvstore", rpc_bdev_lvol_delete_lvstore, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_delete_lvstore, destroy_lvol_store)
+
+struct rpc_bdev_lvol_create {
+ char *uuid;
+ char *lvs_name;
+ char *lvol_name;
+ uint64_t size;
+ bool thin_provision;
+ char *clear_method;
+};
+
+static void
+free_rpc_bdev_lvol_create(struct rpc_bdev_lvol_create *req)
+{
+ free(req->uuid);
+ free(req->lvs_name);
+ free(req->lvol_name);
+ free(req->clear_method);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_create_decoders[] = {
+ {"uuid", offsetof(struct rpc_bdev_lvol_create, uuid), spdk_json_decode_string, true},
+ {"lvs_name", offsetof(struct rpc_bdev_lvol_create, lvs_name), spdk_json_decode_string, true},
+ {"lvol_name", offsetof(struct rpc_bdev_lvol_create, lvol_name), spdk_json_decode_string},
+ {"size", offsetof(struct rpc_bdev_lvol_create, size), spdk_json_decode_uint64},
+ {"thin_provision", offsetof(struct rpc_bdev_lvol_create, thin_provision), spdk_json_decode_bool, true},
+ {"clear_method", offsetof(struct rpc_bdev_lvol_create, clear_method), spdk_json_decode_string, true},
+};
+
+static void
+rpc_bdev_lvol_create_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvolerrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, lvol->unique_id);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvolerrno));
+}
+
+static void
+rpc_bdev_lvol_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_create req = {};
+ enum lvol_clear_method clear_method;
+ int rc = 0;
+ struct spdk_lvol_store *lvs = NULL;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Creating blob\n");
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_create_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_create_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, req.lvs_name, &lvs);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ if (req.clear_method != NULL) {
+ if (!strcasecmp(req.clear_method, "none")) {
+ clear_method = LVOL_CLEAR_WITH_NONE;
+ } else if (!strcasecmp(req.clear_method, "unmap")) {
+ clear_method = LVOL_CLEAR_WITH_UNMAP;
+ } else if (!strcasecmp(req.clear_method, "write_zeroes")) {
+ clear_method = LVOL_CLEAR_WITH_WRITE_ZEROES;
+ } else {
+			spdk_jsonrpc_send_error_response(request, -EINVAL, "Invalid clear_method parameter");
+ goto cleanup;
+ }
+ } else {
+ clear_method = LVOL_CLEAR_WITH_DEFAULT;
+ }
+
+ rc = vbdev_lvol_create(lvs, req.lvol_name, req.size, req.thin_provision,
+ clear_method, rpc_bdev_lvol_create_cb, request);
+ if (rc < 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+cleanup:
+ free_rpc_bdev_lvol_create(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_create", rpc_bdev_lvol_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_create, construct_lvol_bdev)
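+
+/*
+ * Example bdev_lvol_create request (illustrative values; exactly one of
+ * "uuid" or "lvs_name" must identify the lvolstore, and "size" is given
+ * in bytes):
+ *
+ *   {"jsonrpc": "2.0", "method": "bdev_lvol_create", "id": 2,
+ *    "params": {"lvs_name": "lvs0", "lvol_name": "lvol0",
+ *               "size": 1048576, "thin_provision": true}}
+ *
+ * On success the result is the unique_id of the new lvol bdev.
+ */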
+
+struct rpc_bdev_lvol_snapshot {
+ char *lvol_name;
+ char *snapshot_name;
+};
+
+static void
+free_rpc_bdev_lvol_snapshot(struct rpc_bdev_lvol_snapshot *req)
+{
+ free(req->lvol_name);
+ free(req->snapshot_name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_snapshot_decoders[] = {
+ {"lvol_name", offsetof(struct rpc_bdev_lvol_snapshot, lvol_name), spdk_json_decode_string},
+ {"snapshot_name", offsetof(struct rpc_bdev_lvol_snapshot, snapshot_name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_lvol_snapshot_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvolerrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, lvol->unique_id);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvolerrno));
+}
+
+static void
+rpc_bdev_lvol_snapshot(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_snapshot req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_lvol *lvol;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Snapshotting blob\n");
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_snapshot_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_snapshot_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.lvol_name);
+ if (bdev == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "bdev '%s' does not exist\n", req.lvol_name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ lvol = vbdev_lvol_get_from_bdev(bdev);
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ vbdev_lvol_create_snapshot(lvol, req.snapshot_name, rpc_bdev_lvol_snapshot_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_snapshot(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_snapshot", rpc_bdev_lvol_snapshot, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_snapshot, snapshot_lvol_bdev)
+
+struct rpc_bdev_lvol_clone {
+ char *snapshot_name;
+ char *clone_name;
+};
+
+static void
+free_rpc_bdev_lvol_clone(struct rpc_bdev_lvol_clone *req)
+{
+ free(req->snapshot_name);
+ free(req->clone_name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_clone_decoders[] = {
+ {"snapshot_name", offsetof(struct rpc_bdev_lvol_clone, snapshot_name), spdk_json_decode_string},
+ {"clone_name", offsetof(struct rpc_bdev_lvol_clone, clone_name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_bdev_lvol_clone_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvolerrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, lvol->unique_id);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvolerrno));
+}
+
+static void
+rpc_bdev_lvol_clone(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_clone req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_lvol *lvol;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Cloning blob\n");
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_clone_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_clone_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.snapshot_name);
+ if (bdev == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "bdev '%s' does not exist\n", req.snapshot_name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ lvol = vbdev_lvol_get_from_bdev(bdev);
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ vbdev_lvol_create_clone(lvol, req.clone_name, rpc_bdev_lvol_clone_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_clone(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_clone", rpc_bdev_lvol_clone, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_clone, clone_lvol_bdev)
+
+struct rpc_bdev_lvol_rename {
+ char *old_name;
+ char *new_name;
+};
+
+static void
+free_rpc_bdev_lvol_rename(struct rpc_bdev_lvol_rename *req)
+{
+ free(req->old_name);
+ free(req->new_name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_rename_decoders[] = {
+ {"old_name", offsetof(struct rpc_bdev_lvol_rename, old_name), spdk_json_decode_string},
+ {"new_name", offsetof(struct rpc_bdev_lvol_rename, new_name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_lvol_rename_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvolerrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvolerrno));
+}
+
+static void
+rpc_bdev_lvol_rename(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_rename req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_lvol *lvol;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Renaming lvol\n");
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_rename_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_rename_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.old_name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev '%s' does not exist\n", req.old_name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ lvol = vbdev_lvol_get_from_bdev(bdev);
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ vbdev_lvol_rename(lvol, req.new_name, rpc_bdev_lvol_rename_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_rename(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_rename", rpc_bdev_lvol_rename, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_rename, rename_lvol_bdev)
+
+struct rpc_bdev_lvol_inflate {
+ char *name;
+};
+
+static void
+free_rpc_bdev_lvol_inflate(struct rpc_bdev_lvol_inflate *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_inflate_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_lvol_inflate, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_lvol_inflate_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvolerrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvolerrno));
+}
+
+static void
+rpc_bdev_lvol_inflate(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_inflate req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_lvol *lvol;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Inflating lvol\n");
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_inflate_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_inflate_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev '%s' does not exist\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ lvol = vbdev_lvol_get_from_bdev(bdev);
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ spdk_lvol_inflate(lvol, rpc_bdev_lvol_inflate_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_inflate(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_inflate", rpc_bdev_lvol_inflate, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_inflate, inflate_lvol_bdev)
+
+static void
+rpc_bdev_lvol_decouple_parent(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_inflate req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_lvol *lvol;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Decoupling parent of lvol\n");
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_inflate_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_inflate_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev '%s' does not exist\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ lvol = vbdev_lvol_get_from_bdev(bdev);
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ spdk_lvol_decouple_parent(lvol, rpc_bdev_lvol_inflate_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_inflate(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_decouple_parent", rpc_bdev_lvol_decouple_parent, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_decouple_parent, decouple_parent_lvol_bdev)
+
+struct rpc_bdev_lvol_resize {
+ char *name;
+ uint64_t size;
+};
+
+static void
+free_rpc_bdev_lvol_resize(struct rpc_bdev_lvol_resize *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_resize_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_lvol_resize, name), spdk_json_decode_string},
+ {"size", offsetof(struct rpc_bdev_lvol_resize, size), spdk_json_decode_uint64},
+};
+
+static void
+rpc_bdev_lvol_resize_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvolerrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvolerrno));
+}
+
+static void
+rpc_bdev_lvol_resize(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_resize req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_lvol *lvol;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Resizing lvol\n");
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_resize_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_resize_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("no bdev for provided name %s\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ lvol = vbdev_lvol_get_from_bdev(bdev);
+ if (lvol == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ vbdev_lvol_resize(lvol, req.size, rpc_bdev_lvol_resize_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_resize(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_resize", rpc_bdev_lvol_resize, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_resize, resize_lvol_bdev)
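+
+/*
+ * Example bdev_lvol_resize request (illustrative values; "name" is the
+ * lvol bdev name and "size" the new size in bytes):
+ *
+ *   {"jsonrpc": "2.0", "method": "bdev_lvol_resize", "id": 3,
+ *    "params": {"name": "lvs0/lvol0", "size": 2097152}}
+ */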
+
+struct rpc_set_ro_lvol_bdev {
+ char *name;
+};
+
+static void
+free_rpc_set_ro_lvol_bdev(struct rpc_set_ro_lvol_bdev *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_set_ro_lvol_bdev_decoders[] = {
+ {"name", offsetof(struct rpc_set_ro_lvol_bdev, name), spdk_json_decode_string},
+};
+
+static void
+rpc_set_ro_lvol_bdev_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvolerrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-lvolerrno));
+}
+
+static void
+rpc_bdev_lvol_set_read_only(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_set_ro_lvol_bdev req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_lvol *lvol;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Setting lvol as read only\n");
+
+ if (spdk_json_decode_object(params, rpc_set_ro_lvol_bdev_decoders,
+ SPDK_COUNTOF(rpc_set_ro_lvol_bdev_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ if (req.name == NULL) {
+ SPDK_ERRLOG("missing name param\n");
+ spdk_jsonrpc_send_error_response(request, -EINVAL, "Missing name parameter");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("no bdev for provided name %s\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ lvol = vbdev_lvol_get_from_bdev(bdev);
+ if (lvol == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ vbdev_lvol_set_read_only(lvol, rpc_set_ro_lvol_bdev_cb, request);
+
+cleanup:
+ free_rpc_set_ro_lvol_bdev(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_set_read_only", rpc_bdev_lvol_set_read_only, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_set_read_only, set_read_only_lvol_bdev)
+
+struct rpc_bdev_lvol_delete {
+ char *name;
+};
+
+static void
+free_rpc_bdev_lvol_delete(struct rpc_bdev_lvol_delete *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_delete_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_lvol_delete, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_lvol_delete_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (lvolerrno != 0) {
+ goto invalid;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(-lvolerrno));
+}
+
+static void
+rpc_bdev_lvol_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_delete req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_lvol *lvol;
+
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_delete_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_delete_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("no bdev for provided name %s\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ lvol = vbdev_lvol_get_from_bdev(bdev);
+ if (lvol == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ vbdev_lvol_destroy(lvol, rpc_bdev_lvol_delete_cb, request);
+
+cleanup:
+ free_rpc_bdev_lvol_delete(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_delete", rpc_bdev_lvol_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_delete, destroy_lvol_bdev)
+
+struct rpc_bdev_lvol_get_lvstores {
+ char *uuid;
+ char *lvs_name;
+};
+
+static void
+free_rpc_bdev_lvol_get_lvstores(struct rpc_bdev_lvol_get_lvstores *req)
+{
+ free(req->uuid);
+ free(req->lvs_name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_lvol_get_lvstores_decoders[] = {
+ {"uuid", offsetof(struct rpc_bdev_lvol_get_lvstores, uuid), spdk_json_decode_string, true},
+ {"lvs_name", offsetof(struct rpc_bdev_lvol_get_lvstores, lvs_name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_dump_lvol_store_info(struct spdk_json_write_ctx *w, struct lvol_store_bdev *lvs_bdev)
+{
+ struct spdk_blob_store *bs;
+ uint64_t cluster_size;
+ char uuid[SPDK_UUID_STRING_LEN];
+
+ bs = lvs_bdev->lvs->blobstore;
+ cluster_size = spdk_bs_get_cluster_size(bs);
+
+ spdk_json_write_object_begin(w);
+
+ spdk_uuid_fmt_lower(uuid, sizeof(uuid), &lvs_bdev->lvs->uuid);
+ spdk_json_write_named_string(w, "uuid", uuid);
+
+ spdk_json_write_named_string(w, "name", lvs_bdev->lvs->name);
+
+ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(lvs_bdev->bdev));
+
+ spdk_json_write_named_uint64(w, "total_data_clusters", spdk_bs_total_data_cluster_count(bs));
+
+ spdk_json_write_named_uint64(w, "free_clusters", spdk_bs_free_cluster_count(bs));
+
+ spdk_json_write_named_uint64(w, "block_size", spdk_bs_get_io_unit_size(bs));
+
+ spdk_json_write_named_uint64(w, "cluster_size", cluster_size);
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+rpc_bdev_lvol_get_lvstores(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_lvol_get_lvstores req = {};
+ struct spdk_json_write_ctx *w;
+ struct lvol_store_bdev *lvs_bdev = NULL;
+ struct spdk_lvol_store *lvs = NULL;
+ int rc;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_bdev_lvol_get_lvstores_decoders,
+ SPDK_COUNTOF(rpc_bdev_lvol_get_lvstores_decoders),
+ &req)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, req.lvs_name, &lvs);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs);
+ if (lvs_bdev == NULL) {
+			spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ if (lvs_bdev != NULL) {
+ rpc_dump_lvol_store_info(w, lvs_bdev);
+ } else {
+ for (lvs_bdev = vbdev_lvol_store_first(); lvs_bdev != NULL;
+ lvs_bdev = vbdev_lvol_store_next(lvs_bdev)) {
+ rpc_dump_lvol_store_info(w, lvs_bdev);
+ }
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_lvol_get_lvstores(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_lvol_get_lvstores", rpc_bdev_lvol_get_lvstores, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_lvol_get_lvstores, get_lvol_stores)
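+
+/*
+ * Example bdev_lvol_get_lvstores result (illustrative values). Without
+ * parameters all lvolstores are listed; passing "uuid" or "lvs_name"
+ * narrows the output to a single store:
+ *
+ *   [{"uuid": "a9959197-b5e2-4f2d-8095-251ffb6985a5", "name": "lvs0",
+ *     "base_bdev": "Malloc0", "total_data_clusters": 63,
+ *     "free_clusters": 63, "block_size": 4096, "cluster_size": 1048576}]
+ */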
diff --git a/src/spdk/module/bdev/malloc/Makefile b/src/spdk/module/bdev/malloc/Makefile
new file mode 100644
index 000000000..c55db23ce
--- /dev/null
+++ b/src/spdk/module/bdev/malloc/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev_malloc.c bdev_malloc_rpc.c
+LIBNAME = bdev_malloc
+LOCAL_SYS_LIBS = -luuid
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/malloc/bdev_malloc.c b/src/spdk/module/bdev/malloc/bdev_malloc.c
new file mode 100644
index 000000000..ce0403153
--- /dev/null
+++ b/src/spdk/module/bdev/malloc/bdev_malloc.c
@@ -0,0 +1,532 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "bdev_malloc.h"
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/accel_engine.h"
+#include "spdk/json.h"
+#include "spdk/thread.h"
+#include "spdk/queue.h"
+#include "spdk/string.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+struct malloc_disk {
+ struct spdk_bdev disk;
+ void *malloc_buf;
+ TAILQ_ENTRY(malloc_disk) link;
+};
+
+struct malloc_task {
+ int num_outstanding;
+ enum spdk_bdev_io_status status;
+};
+
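+/*
+ * Completion callback for the accel-engine copy/fill operations submitted
+ * for a single bdev I/O. The first failure downgrades the task status; the
+ * bdev I/O completes once every outstanding operation has called back.
+ */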
+static void
+malloc_done(void *ref, int status)
+{
+ struct malloc_task *task = (struct malloc_task *)ref;
+
+ if (status != 0) {
+ if (status == -ENOMEM) {
+ task->status = SPDK_BDEV_IO_STATUS_NOMEM;
+ } else {
+ task->status = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+ }
+
+ if (--task->num_outstanding == 0) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status);
+ }
+}
+
+static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks);
+
+int malloc_disk_count = 0;
+
+static int bdev_malloc_initialize(void);
+static void bdev_malloc_get_spdk_running_config(FILE *fp);
+
+static int
+bdev_malloc_get_ctx_size(void)
+{
+ return sizeof(struct malloc_task);
+}
+
+static struct spdk_bdev_module malloc_if = {
+ .name = "malloc",
+ .module_init = bdev_malloc_initialize,
+ .config_text = bdev_malloc_get_spdk_running_config,
+ .get_ctx_size = bdev_malloc_get_ctx_size,
+
+};
+
+SPDK_BDEV_MODULE_REGISTER(malloc, &malloc_if)
+
+static void
+malloc_disk_free(struct malloc_disk *malloc_disk)
+{
+ if (!malloc_disk) {
+ return;
+ }
+
+ free(malloc_disk->disk.name);
+ spdk_free(malloc_disk->malloc_buf);
+ free(malloc_disk);
+}
+
+static int
+bdev_malloc_destruct(void *ctx)
+{
+ struct malloc_disk *malloc_disk = ctx;
+
+ TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link);
+ malloc_disk_free(malloc_disk);
+ return 0;
+}
+
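+/* Return non-zero if the iovec array is too short to cover nbytes of data. */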
+static int
+bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes)
+{
+ int i;
+
+ for (i = 0; i < iovcnt; i++) {
+ if (nbytes < iovs[i].iov_len) {
+ return 0;
+ }
+
+ nbytes -= iovs[i].iov_len;
+ }
+
+ return nbytes != 0;
+}
+
+static void
+bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
+ struct malloc_task *task,
+ struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
+{
+ int64_t res = 0;
+ void *src = mdisk->malloc_buf + offset;
+ int i;
+
+ if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
+ SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "read %lu bytes from offset %#lx\n",
+ len, offset);
+
+ task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ task->num_outstanding = iovcnt;
+
+ for (i = 0; i < iovcnt; i++) {
+ res = spdk_accel_submit_copy(ch, iov[i].iov_base,
+ src, iov[i].iov_len, malloc_done, task);
+
+ if (res != 0) {
+ malloc_done(task, res);
+ }
+
+ src += iov[i].iov_len;
+ len -= iov[i].iov_len;
+ }
+}
+
+static void
+bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch,
+ struct malloc_task *task,
+ struct iovec *iov, int iovcnt, size_t len, uint64_t offset)
+{
+ int64_t res = 0;
+ void *dst = mdisk->malloc_buf + offset;
+ int i;
+
+ if (bdev_malloc_check_iov_len(iov, iovcnt, len)) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task),
+ SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+	SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "write %lu bytes to offset %#lx\n",
+ len, offset);
+
+ task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ task->num_outstanding = iovcnt;
+
+ for (i = 0; i < iovcnt; i++) {
+ res = spdk_accel_submit_copy(ch, dst, iov[i].iov_base,
+ iov[i].iov_len, malloc_done, task);
+
+ if (res != 0) {
+ malloc_done(task, res);
+ }
+
+ dst += iov[i].iov_len;
+ }
+}
+
+static int
+bdev_malloc_unmap(struct malloc_disk *mdisk,
+ struct spdk_io_channel *ch,
+ struct malloc_task *task,
+ uint64_t offset,
+ uint64_t byte_count)
+{
+ task->status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ task->num_outstanding = 1;
+
+ return spdk_accel_submit_fill(ch, mdisk->malloc_buf + offset, 0,
+ byte_count, malloc_done, task);
+}
+
+static int64_t
+bdev_malloc_flush(struct malloc_disk *mdisk, struct malloc_task *task,
+ uint64_t offset, uint64_t nbytes)
+{
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), SPDK_BDEV_IO_STATUS_SUCCESS);
+
+ return 0;
+}
+
+static int
+bdev_malloc_reset(struct malloc_disk *mdisk, struct malloc_task *task)
+{
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), SPDK_BDEV_IO_STATUS_SUCCESS);
+
+ return 0;
+}
+
+static int _bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ uint32_t block_size = bdev_io->bdev->blocklen;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
+ assert(bdev_io->u.bdev.iovcnt == 1);
+ bdev_io->u.bdev.iovs[0].iov_base =
+ ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf +
+ bdev_io->u.bdev.offset_blocks * block_size;
+ bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size;
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+ }
+
+ bdev_malloc_readv((struct malloc_disk *)bdev_io->bdev->ctxt,
+ ch,
+ (struct malloc_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * block_size,
+ bdev_io->u.bdev.offset_blocks * block_size);
+ return 0;
+
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ bdev_malloc_writev((struct malloc_disk *)bdev_io->bdev->ctxt,
+ ch,
+ (struct malloc_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * block_size,
+ bdev_io->u.bdev.offset_blocks * block_size);
+ return 0;
+
+ case SPDK_BDEV_IO_TYPE_RESET:
+ return bdev_malloc_reset((struct malloc_disk *)bdev_io->bdev->ctxt,
+ (struct malloc_task *)bdev_io->driver_ctx);
+
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ return bdev_malloc_flush((struct malloc_disk *)bdev_io->bdev->ctxt,
+ (struct malloc_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.offset_blocks * block_size,
+ bdev_io->u.bdev.num_blocks * block_size);
+
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt,
+ ch,
+ (struct malloc_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.offset_blocks * block_size,
+ bdev_io->u.bdev.num_blocks * block_size);
+
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+		/* bdev_malloc_unmap() fills the requested range with zeroes via spdk_accel_submit_fill(), so it also implements write-zeroes. */
+ return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt,
+ ch,
+ (struct malloc_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.offset_blocks * block_size,
+ bdev_io->u.bdev.num_blocks * block_size);
+
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ if (bdev_io->u.bdev.zcopy.start) {
+ void *buf;
+ size_t len;
+
+ buf = ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf +
+ bdev_io->u.bdev.offset_blocks * block_size;
+ len = bdev_io->u.bdev.num_blocks * block_size;
+ spdk_bdev_io_set_buf(bdev_io, buf, len);
+
+ }
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_ABORT:
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return 0;
+ default:
+ return -1;
+ }
+ return 0;
+}
+
+static void bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ if (_bdev_malloc_submit_request(ch, bdev_io) != 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static bool
+bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ case SPDK_BDEV_IO_TYPE_ABORT:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static struct spdk_io_channel *
+bdev_malloc_get_io_channel(void *ctx)
+{
+ return spdk_accel_engine_get_io_channel();
+}
+
+static void
+bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ char uuid_str[SPDK_UUID_STRING_LEN];
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_malloc_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
+ spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
+ spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
+ spdk_json_write_named_string(w, "uuid", uuid_str);
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static const struct spdk_bdev_fn_table malloc_fn_table = {
+ .destruct = bdev_malloc_destruct,
+ .submit_request = bdev_malloc_submit_request,
+ .io_type_supported = bdev_malloc_io_type_supported,
+ .get_io_channel = bdev_malloc_get_io_channel,
+ .write_config_json = bdev_malloc_write_json_config,
+};
+
+int
+create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid,
+ uint64_t num_blocks, uint32_t block_size)
+{
+ struct malloc_disk *mdisk;
+ int rc;
+
+ if (num_blocks == 0) {
+		SPDK_ERRLOG("Disk num_blocks must be greater than 0\n");
+ return -EINVAL;
+ }
+
+ mdisk = calloc(1, sizeof(*mdisk));
+ if (!mdisk) {
+ SPDK_ERRLOG("mdisk calloc() failed\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Allocate the large backend memory buffer from pinned memory.
+ *
+ * TODO: need to pass a hint so we know which socket to allocate
+ * from on multi-socket systems.
+ */
+ mdisk->malloc_buf = spdk_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!mdisk->malloc_buf) {
+ SPDK_ERRLOG("malloc_buf spdk_zmalloc() failed\n");
+ malloc_disk_free(mdisk);
+ return -ENOMEM;
+ }
+
+ if (name) {
+ mdisk->disk.name = strdup(name);
+ } else {
+ /* Auto-generate a name */
+ mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count);
+ malloc_disk_count++;
+ }
+ if (!mdisk->disk.name) {
+ malloc_disk_free(mdisk);
+ return -ENOMEM;
+ }
+ mdisk->disk.product_name = "Malloc disk";
+
+ mdisk->disk.write_cache = 1;
+ mdisk->disk.blocklen = block_size;
+ mdisk->disk.blockcnt = num_blocks;
+ if (uuid) {
+ mdisk->disk.uuid = *uuid;
+ } else {
+ spdk_uuid_generate(&mdisk->disk.uuid);
+ }
+
+ mdisk->disk.ctxt = mdisk;
+ mdisk->disk.fn_table = &malloc_fn_table;
+ mdisk->disk.module = &malloc_if;
+
+ rc = spdk_bdev_register(&mdisk->disk);
+ if (rc) {
+ malloc_disk_free(mdisk);
+ return rc;
+ }
+
+ *bdev = &(mdisk->disk);
+
+ TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link);
+
+ return rc;
+}
+
+void
+delete_malloc_disk(struct spdk_bdev *bdev, spdk_delete_malloc_complete cb_fn, void *cb_arg)
+{
+ if (!bdev || bdev->module != &malloc_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
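+/*
+ * Create malloc disks from the legacy [Malloc] config section, e.g.
+ * (illustrative values):
+ *
+ *   [Malloc]
+ *     NumberOfLuns 4
+ *     LunSizeInMB 64
+ *     BlockSize 512
+ */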
+static int bdev_malloc_initialize(void)
+{
+ struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Malloc");
+ int NumberOfLuns, LunSizeInMB, BlockSize, i, rc = 0;
+ uint64_t size;
+ struct spdk_bdev *bdev;
+
+ malloc_disk_count = 0;
+
+ if (sp != NULL) {
+ NumberOfLuns = spdk_conf_section_get_intval(sp, "NumberOfLuns");
+ LunSizeInMB = spdk_conf_section_get_intval(sp, "LunSizeInMB");
+ BlockSize = spdk_conf_section_get_intval(sp, "BlockSize");
+ if ((NumberOfLuns < 1) || (LunSizeInMB < 1)) {
+ SPDK_ERRLOG("Malloc section present, but no devices specified\n");
+ goto end;
+ }
+ if (BlockSize < 1) {
+ /* Default is 512 bytes */
+ BlockSize = 512;
+ }
+ size = (uint64_t)LunSizeInMB * 1024 * 1024;
+ for (i = 0; i < NumberOfLuns; i++) {
+ rc = create_malloc_disk(&bdev, NULL, NULL, size / BlockSize, BlockSize);
+ if (rc) {
+ SPDK_ERRLOG("Could not create malloc disk\n");
+ goto end;
+ }
+ }
+ }
+
+end:
+ return rc;
+}
+
+static void
+bdev_malloc_get_spdk_running_config(FILE *fp)
+{
+ int num_malloc_luns = 0;
+ uint64_t malloc_lun_size = 0;
+ struct malloc_disk *mdisk;
+
+ /* count number of malloc LUNs, get LUN size */
+ TAILQ_FOREACH(mdisk, &g_malloc_disks, link) {
+ if (0 == malloc_lun_size) {
+ /* assume all malloc luns the same size */
+ malloc_lun_size = mdisk->disk.blocklen * mdisk->disk.blockcnt;
+ malloc_lun_size /= (1024 * 1024);
+ }
+ num_malloc_luns++;
+ }
+
+ if (num_malloc_luns > 0) {
+ fprintf(fp,
+ "\n"
+ "# Users may change this section to create a different number or size of\n"
+ "# malloc LUNs.\n"
+ "# This will generate %d LUNs with a malloc-allocated backend. Each LUN\n"
+ "# will be %" PRIu64 "MB in size and these will be named Malloc0 through Malloc%d.\n"
+ "# Not all LUNs defined here are necessarily used below.\n"
+ "[Malloc]\n"
+ " NumberOfLuns %d\n"
+ " LunSizeInMB %" PRIu64 "\n",
+ num_malloc_luns, malloc_lun_size,
+ num_malloc_luns - 1, num_malloc_luns,
+ malloc_lun_size);
+ }
+}
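+
+/*
+ * For example (illustrative), with two 64 MB malloc LUNs the function
+ * above would emit:
+ *
+ *     [Malloc]
+ *       NumberOfLuns 2
+ *       LunSizeInMB 64
+ */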
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_malloc", SPDK_LOG_BDEV_MALLOC)
diff --git a/src/spdk/module/bdev/malloc/bdev_malloc.h b/src/spdk/module/bdev/malloc/bdev_malloc.h
new file mode 100644
index 000000000..b683b1062
--- /dev/null
+++ b/src/spdk/module/bdev/malloc/bdev_malloc.h
@@ -0,0 +1,48 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_MALLOC_H
+#define SPDK_BDEV_MALLOC_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+
+typedef void (*spdk_delete_malloc_complete)(void *cb_arg, int bdeverrno);
+
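+/**
+ * Create a malloc-backed bdev (editorial doc comment; parameters inferred
+ * from the implementation in bdev_malloc.c).
+ *
+ * \param bdev Output pointer for the created bdev.
+ * \param name Bdev name; if NULL, a "MallocN" name is auto-generated.
+ * \param uuid Bdev UUID; if NULL, one is generated.
+ * \param num_blocks Number of blocks; must be greater than 0.
+ * \param block_size Block size in bytes.
+ * \return 0 on success, negative errno on failure.
+ */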
+int create_malloc_disk(struct spdk_bdev **bdev, const char *name, const struct spdk_uuid *uuid,
+ uint64_t num_blocks, uint32_t block_size);
+
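+/**
+ * Delete a malloc bdev (editorial doc comment, mirroring bdev_null.h).
+ *
+ * \param bdev Pointer to the malloc bdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */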
+void delete_malloc_disk(struct spdk_bdev *bdev, spdk_delete_malloc_complete cb_fn, void *cb_arg);
+
+#endif /* SPDK_BDEV_MALLOC_H */
diff --git a/src/spdk/module/bdev/malloc/bdev_malloc_rpc.c b/src/spdk/module/bdev/malloc/bdev_malloc_rpc.c
new file mode 100644
index 000000000..f151e8b1f
--- /dev/null
+++ b/src/spdk/module/bdev/malloc/bdev_malloc_rpc.c
@@ -0,0 +1,173 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_malloc.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/uuid.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+struct rpc_construct_malloc {
+ char *name;
+ char *uuid;
+ uint64_t num_blocks;
+ uint32_t block_size;
+};
+
+static void
+free_rpc_construct_malloc(struct rpc_construct_malloc *r)
+{
+ free(r->name);
+ free(r->uuid);
+}
+
+static const struct spdk_json_object_decoder rpc_construct_malloc_decoders[] = {
+ {"name", offsetof(struct rpc_construct_malloc, name), spdk_json_decode_string, true},
+ {"uuid", offsetof(struct rpc_construct_malloc, uuid), spdk_json_decode_string, true},
+ {"num_blocks", offsetof(struct rpc_construct_malloc, num_blocks), spdk_json_decode_uint64},
+ {"block_size", offsetof(struct rpc_construct_malloc, block_size), spdk_json_decode_uint32},
+};
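+
+/*
+ * Example request (illustrative): "name" and "uuid" are optional per the
+ * decoders above, so a minimal call looks like
+ *
+ *     {"jsonrpc": "2.0", "id": 1, "method": "bdev_malloc_create",
+ *      "params": {"name": "Malloc0", "num_blocks": 131072, "block_size": 512}}
+ *
+ * which creates a 64 MiB bdev.
+ */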
+
+static void
+rpc_bdev_malloc_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_malloc req = {NULL};
+ struct spdk_json_write_ctx *w;
+ struct spdk_uuid *uuid = NULL;
+ struct spdk_uuid decoded_uuid;
+ struct spdk_bdev *bdev;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_construct_malloc_decoders,
+ SPDK_COUNTOF(rpc_construct_malloc_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ if (req.num_blocks == 0) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL,
+ "Disk num_blocks must be greater than 0");
+ goto cleanup;
+ }
+
+ if (req.uuid) {
+ if (spdk_uuid_parse(&decoded_uuid, req.uuid)) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL,
+ "Failed to parse bdev UUID");
+ goto cleanup;
+ }
+ uuid = &decoded_uuid;
+ }
+
+ rc = create_malloc_disk(&bdev, req.name, uuid, req.num_blocks, req.block_size);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ free_rpc_construct_malloc(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, spdk_bdev_get_name(bdev));
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+cleanup:
+ free_rpc_construct_malloc(&req);
+}
+SPDK_RPC_REGISTER("bdev_malloc_create", rpc_bdev_malloc_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_malloc_create, construct_malloc_bdev)
+
+struct rpc_delete_malloc {
+ char *name;
+};
+
+static void
+free_rpc_delete_malloc(struct rpc_delete_malloc *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_malloc_decoders[] = {
+ {"name", offsetof(struct rpc_delete_malloc, name), spdk_json_decode_string},
+};
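+
+/*
+ * Example request (illustrative):
+ *
+ *     {"jsonrpc": "2.0", "id": 1, "method": "bdev_malloc_delete",
+ *      "params": {"name": "Malloc0"}}
+ */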
+
+static void
+rpc_bdev_malloc_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_malloc_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_malloc req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_malloc_decoders,
+ SPDK_COUNTOF(rpc_delete_malloc_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_BDEV_MALLOC, "bdev '%s' does not exist\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ delete_malloc_disk(bdev, rpc_bdev_malloc_delete_cb, request);
+
+ free_rpc_delete_malloc(&req);
+
+ return;
+
+cleanup:
+ free_rpc_delete_malloc(&req);
+}
+SPDK_RPC_REGISTER("bdev_malloc_delete", rpc_bdev_malloc_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_malloc_delete, delete_malloc_bdev)
diff --git a/src/spdk/module/bdev/null/Makefile b/src/spdk/module/bdev/null/Makefile
new file mode 100644
index 000000000..e179b01ed
--- /dev/null
+++ b/src/spdk/module/bdev/null/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev_null.c bdev_null_rpc.c
+LIBNAME = bdev_null
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/null/bdev_null.c b/src/spdk/module/bdev/null/bdev_null.c
new file mode 100644
index 000000000..97aa8b03f
--- /dev/null
+++ b/src/spdk/module/bdev/null/bdev_null.c
@@ -0,0 +1,550 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/json.h"
+#include "spdk/string.h"
+#include "spdk/likely.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+#include "bdev_null.h"
+
+struct null_bdev {
+ struct spdk_bdev bdev;
+ TAILQ_ENTRY(null_bdev) tailq;
+};
+
+struct null_io_channel {
+ struct spdk_poller *poller;
+ TAILQ_HEAD(, spdk_bdev_io) io;
+};
+
+static TAILQ_HEAD(, null_bdev) g_null_bdev_head;
+static void *g_null_read_buf;
+
+static int bdev_null_initialize(void);
+static void bdev_null_finish(void);
+static void bdev_null_get_spdk_running_config(FILE *fp);
+
+static struct spdk_bdev_module null_if = {
+ .name = "null",
+ .module_init = bdev_null_initialize,
+ .module_fini = bdev_null_finish,
+ .config_text = bdev_null_get_spdk_running_config,
+ .async_fini = true,
+};
+
+SPDK_BDEV_MODULE_REGISTER(null, &null_if)
+
+static int
+bdev_null_destruct(void *ctx)
+{
+ struct null_bdev *bdev = ctx;
+
+ TAILQ_REMOVE(&g_null_bdev_head, bdev, tailq);
+ free(bdev->bdev.name);
+ free(bdev);
+
+ return 0;
+}
+
+static bool
+bdev_null_abort_io(struct null_io_channel *ch, struct spdk_bdev_io *bio_to_abort)
+{
+ struct spdk_bdev_io *bdev_io;
+
+ TAILQ_FOREACH(bdev_io, &ch->io, module_link) {
+ if (bdev_io == bio_to_abort) {
+ TAILQ_REMOVE(&ch->io, bio_to_abort, module_link);
+ spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void
+bdev_null_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
+{
+ struct null_io_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_dif_ctx dif_ctx;
+ struct spdk_dif_error err_blk;
+ int rc;
+
+ if (SPDK_DIF_DISABLE != bdev->dif_type &&
+ (SPDK_BDEV_IO_TYPE_READ == bdev_io->type ||
+ SPDK_BDEV_IO_TYPE_WRITE == bdev_io->type)) {
+ rc = spdk_dif_ctx_init(&dif_ctx,
+ bdev->blocklen,
+ bdev->md_len,
+ bdev->md_interleave,
+ bdev->dif_is_head_of_md,
+ bdev->dif_type,
+ bdev->dif_check_flags,
+ bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF,
+ 0xFFFF, 0, 0, 0);
+ if (0 != rc) {
+ SPDK_ERRLOG("Failed to initialize DIF context, error %d\n", rc);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+ }
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ if (bdev_io->u.bdev.iovs[0].iov_base == NULL) {
+ assert(bdev_io->u.bdev.iovcnt == 1);
+ if (spdk_likely(bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen <=
+ SPDK_BDEV_LARGE_BUF_MAX_SIZE)) {
+ bdev_io->u.bdev.iovs[0].iov_base = g_null_read_buf;
+ bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+ } else {
+ SPDK_ERRLOG("Overflow occurred. Read I/O size %" PRIu64 " was larger than permitted %d\n",
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ SPDK_BDEV_LARGE_BUF_MAX_SIZE);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+ }
+ if (SPDK_DIF_DISABLE != bdev->dif_type) {
+ rc = spdk_dif_generate(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks, &dif_ctx);
+ if (0 != rc) {
+ SPDK_ERRLOG("IO DIF generation failed: lba %lu, num_block %lu\n",
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+ }
+ TAILQ_INSERT_TAIL(&ch->io, bdev_io, module_link);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ if (SPDK_DIF_DISABLE != bdev->dif_type) {
+ rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
+ if (0 != rc) {
+ SPDK_ERRLOG("IO DIF verification failed: lba %lu, num_blocks %lu, "
+ "err_type %u, expected %u, actual %u, err_offset %u\n",
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ err_blk.err_type,
+ err_blk.expected,
+ err_blk.actual,
+ err_blk.err_offset);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+ }
+ TAILQ_INSERT_TAIL(&ch->io, bdev_io, module_link);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ TAILQ_INSERT_TAIL(&ch->io, bdev_io, module_link);
+ break;
+ case SPDK_BDEV_IO_TYPE_ABORT:
+ if (bdev_null_abort_io(ch, bdev_io->u.abort.bio_to_abort)) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ default:
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ break;
+ }
+}
+
+static bool
+bdev_null_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_ABORT:
+ return true;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ default:
+ return false;
+ }
+}
+
+static struct spdk_io_channel *
+bdev_null_get_io_channel(void *ctx)
+{
+ return spdk_get_io_channel(&g_null_bdev_head);
+}
+
+static void
+bdev_null_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ char uuid_str[SPDK_UUID_STRING_LEN];
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_null_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt);
+ spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
+ spdk_json_write_named_uint32(w, "md_size", bdev->md_len);
+ spdk_json_write_named_uint32(w, "dif_type", bdev->dif_type);
+ spdk_json_write_named_bool(w, "dif_is_head_of_md", bdev->dif_is_head_of_md);
+ spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
+ spdk_json_write_named_string(w, "uuid", uuid_str);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static const struct spdk_bdev_fn_table null_fn_table = {
+ .destruct = bdev_null_destruct,
+ .submit_request = bdev_null_submit_request,
+ .io_type_supported = bdev_null_io_type_supported,
+ .get_io_channel = bdev_null_get_io_channel,
+ .write_config_json = bdev_null_write_config_json,
+};
+
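+/*
+ * Illustrative usage (editorial sketch, not upstream code): a 1 GiB null
+ * bdev with 4 KiB blocks and no metadata or DIF (the zeroed fields default
+ * to md_size 0 and SPDK_DIF_DISABLE):
+ *
+ *     struct spdk_null_bdev_opts opts = {
+ *             .name = "Null0",
+ *             .num_blocks = (1024ULL * 1024 * 1024) / 4096,
+ *             .block_size = 4096,
+ *     };
+ *     struct spdk_bdev *bdev;
+ *     int rc = bdev_null_create(&bdev, &opts);
+ */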
+int
+bdev_null_create(struct spdk_bdev **bdev, const struct spdk_null_bdev_opts *opts)
+{
+ struct null_bdev *null_disk;
+ uint32_t data_block_size;
+ int rc;
+
+ if (!opts) {
+ SPDK_ERRLOG("No options provided for Null bdev.\n");
+ return -EINVAL;
+ }
+
+ if (opts->md_interleave) {
+ if (opts->block_size < opts->md_size) {
+ SPDK_ERRLOG("Interleaved metadata size can not be greater than block size.\n");
+ return -EINVAL;
+ }
+ data_block_size = opts->block_size - opts->md_size;
+ } else {
+ if (opts->md_size != 0) {
+ SPDK_ERRLOG("Metadata in separate buffer is not supported\n");
+ return -ENOTSUP;
+ }
+ data_block_size = opts->block_size;
+ }
+
+ if (data_block_size % 512 != 0) {
+ SPDK_ERRLOG("Data block size %u is not a multiple of 512.\n", opts->block_size);
+ return -EINVAL;
+ }
+
+ if (opts->num_blocks == 0) {
+ SPDK_ERRLOG("Disk must be more than 0 blocks\n");
+ return -EINVAL;
+ }
+
+ null_disk = calloc(1, sizeof(*null_disk));
+ if (!null_disk) {
+ SPDK_ERRLOG("could not allocate null_bdev\n");
+ return -ENOMEM;
+ }
+
+ null_disk->bdev.name = strdup(opts->name);
+ if (!null_disk->bdev.name) {
+ free(null_disk);
+ return -ENOMEM;
+ }
+ null_disk->bdev.product_name = "Null disk";
+
+ null_disk->bdev.write_cache = 0;
+ null_disk->bdev.blocklen = opts->block_size;
+ null_disk->bdev.blockcnt = opts->num_blocks;
+ null_disk->bdev.md_len = opts->md_size;
+ null_disk->bdev.md_interleave = opts->md_interleave;
+ null_disk->bdev.dif_type = opts->dif_type;
+ null_disk->bdev.dif_is_head_of_md = opts->dif_is_head_of_md;
+ /* Current block device layer API does not propagate
+ * any DIF related information from user. So, we can
+ * not generate or verify Application Tag.
+ */
+ switch (opts->dif_type) {
+ case SPDK_DIF_TYPE1:
+ case SPDK_DIF_TYPE2:
+ null_disk->bdev.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK |
+ SPDK_DIF_FLAGS_REFTAG_CHECK;
+ break;
+ case SPDK_DIF_TYPE3:
+ null_disk->bdev.dif_check_flags = SPDK_DIF_FLAGS_GUARD_CHECK;
+ break;
+ case SPDK_DIF_DISABLE:
+ break;
+ }
+ if (opts->uuid) {
+ null_disk->bdev.uuid = *opts->uuid;
+ } else {
+ spdk_uuid_generate(&null_disk->bdev.uuid);
+ }
+
+ null_disk->bdev.ctxt = null_disk;
+ null_disk->bdev.fn_table = &null_fn_table;
+ null_disk->bdev.module = &null_if;
+
+ rc = spdk_bdev_register(&null_disk->bdev);
+ if (rc) {
+ free(null_disk->bdev.name);
+ free(null_disk);
+ return rc;
+ }
+
+ *bdev = &(null_disk->bdev);
+
+ TAILQ_INSERT_TAIL(&g_null_bdev_head, null_disk, tailq);
+
+ return rc;
+}
+
+void
+bdev_null_delete(struct spdk_bdev *bdev, spdk_delete_null_complete cb_fn, void *cb_arg)
+{
+ if (!bdev || bdev->module != &null_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
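+/*
+ * Channel poller (editorial note): every I/O queued on a null bdev channel
+ * is completed successfully on the next poll. TAILQ_SWAP detaches the whole
+ * pending list in O(1), so I/O submitted from within completion callbacks
+ * lands on ch->io and is drained on the following poll rather than in this
+ * iteration.
+ */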
+static int
+null_io_poll(void *arg)
+{
+ struct null_io_channel *ch = arg;
+ TAILQ_HEAD(, spdk_bdev_io) io;
+ struct spdk_bdev_io *bdev_io;
+
+ TAILQ_INIT(&io);
+ TAILQ_SWAP(&ch->io, &io, spdk_bdev_io, module_link);
+
+ if (TAILQ_EMPTY(&io)) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ while (!TAILQ_EMPTY(&io)) {
+ bdev_io = TAILQ_FIRST(&io);
+ TAILQ_REMOVE(&io, bdev_io, module_link);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+null_bdev_create_cb(void *io_device, void *ctx_buf)
+{
+ struct null_io_channel *ch = ctx_buf;
+
+ TAILQ_INIT(&ch->io);
+ ch->poller = SPDK_POLLER_REGISTER(null_io_poll, ch, 0);
+
+ return 0;
+}
+
+static void
+null_bdev_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct null_io_channel *ch = ctx_buf;
+
+ spdk_poller_unregister(&ch->poller);
+}
+
+static void
+_bdev_null_cleanup_cb(void *arg)
+{
+ spdk_free(g_null_read_buf);
+}
+
+static int
+bdev_null_initialize(void)
+{
+ struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Null");
+ uint64_t size_in_mb, num_blocks;
+ int block_size, i, rc = 0;
+ int md_size, dif_type;
+ struct spdk_bdev *bdev;
+ const char *name, *val;
+ struct spdk_null_bdev_opts opts = {};
+
+ TAILQ_INIT(&g_null_bdev_head);
+
+ /*
+ * This will be used if upper layer expects us to allocate the read buffer.
+ * Instead of using a real rbuf from the bdev pool, just always point to
+ * this same zeroed buffer.
+ */
+ g_null_read_buf = spdk_zmalloc(SPDK_BDEV_LARGE_BUF_MAX_SIZE, 0, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ /* Editorial fix: guard against allocation failure before registering the io_device. */
+ if (g_null_read_buf == NULL) {
+ SPDK_ERRLOG("Could not allocate the zeroed read buffer for null bdevs\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * We need to pick some unique address as our "io device" - so just use the
+ * address of the global tailq.
+ */
+ spdk_io_device_register(&g_null_bdev_head, null_bdev_create_cb, null_bdev_destroy_cb,
+ sizeof(struct null_io_channel),
+ "null_bdev");
+
+ if (sp == NULL) {
+ goto end;
+ }
+
+ for (i = 0; ; ++i) {
+ val = spdk_conf_section_get_nval(sp, "Dev", i);
+ if (val == NULL) {
+ break;
+ }
+
+ name = spdk_conf_section_get_nmval(sp, "Dev", i, 0);
+ if (name == NULL) {
+ SPDK_ERRLOG("Null entry %d: Name must be provided\n", i);
+ continue;
+ }
+
+ val = spdk_conf_section_get_nmval(sp, "Dev", i, 1);
+ if (val == NULL) {
+ SPDK_ERRLOG("Null entry %d: Size in MB must be provided\n", i);
+ continue;
+ }
+
+ errno = 0;
+ size_in_mb = strtoull(val, NULL, 10);
+ if (errno) {
+ SPDK_ERRLOG("Null entry %d: Invalid size in MB %s\n", i, val);
+ continue;
+ }
+
+ val = spdk_conf_section_get_nmval(sp, "Dev", i, 2);
+ if (val == NULL) {
+ block_size = 512;
+ } else {
+ block_size = (int)spdk_strtol(val, 10);
+ if (block_size <= 0) {
+ SPDK_ERRLOG("Null entry %d: Invalid block size %s\n", i, val);
+ continue;
+ }
+ }
+
+ val = spdk_conf_section_get_nmval(sp, "Dev", i, 3);
+ if (val == NULL) {
+ md_size = 0;
+ } else {
+ md_size = (int)spdk_strtol(val, 10);
+ if (md_size < 0) {
+ SPDK_ERRLOG("Null entry %d: Invalid metadata size %s\n", i, val);
+ continue;
+ }
+ }
+
+ val = spdk_conf_section_get_nmval(sp, "Dev", i, 4);
+ if (val == NULL) {
+ dif_type = SPDK_DIF_DISABLE;
+ } else {
+ dif_type = (int)spdk_strtol(val, 10);
+ if (dif_type < SPDK_DIF_DISABLE || dif_type > SPDK_DIF_TYPE3) {
+ SPDK_ERRLOG("Null entry %d: Invalid data protection type %s\n", i, val);
+ continue;
+ }
+ }
+ num_blocks = size_in_mb * (1024 * 1024) / block_size;
+
+ opts.name = name;
+ opts.num_blocks = num_blocks;
+ opts.block_size = block_size;
+ opts.md_size = md_size;
+ opts.md_interleave = true;
+ opts.dif_type = dif_type;
+ opts.dif_is_head_of_md = false;
+ rc = bdev_null_create(&bdev, &opts);
+ if (rc) {
+ SPDK_ERRLOG("Could not create null bdev\n");
+ goto end;
+ }
+ }
+end:
+ if (rc) {
+ spdk_io_device_unregister(&g_null_bdev_head, _bdev_null_cleanup_cb);
+ }
+ return rc;
+}
+
+static void
+_bdev_null_finish_cb(void *arg)
+{
+ spdk_free(g_null_read_buf);
+ spdk_bdev_module_finish_done();
+}
+
+static void
+bdev_null_finish(void)
+{
+ spdk_io_device_unregister(&g_null_bdev_head, _bdev_null_finish_cb);
+}
+
+static void
+bdev_null_get_spdk_running_config(FILE *fp)
+{
+ struct null_bdev *bdev;
+ uint64_t null_bdev_size;
+
+ fprintf(fp, "\n[Null]\n");
+
+ TAILQ_FOREACH(bdev, &g_null_bdev_head, tailq) {
+ null_bdev_size = bdev->bdev.blocklen * bdev->bdev.blockcnt;
+ null_bdev_size /= (1024 * 1024);
+ fprintf(fp, " Dev %s %" PRIu64 " %d %d %d\n",
+ bdev->bdev.name, null_bdev_size, bdev->bdev.blocklen, bdev->bdev.md_len,
+ bdev->bdev.dif_type);
+ }
+}
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_null", SPDK_LOG_BDEV_NULL)
diff --git a/src/spdk/module/bdev/null/bdev_null.h b/src/spdk/module/bdev/null/bdev_null.h
new file mode 100644
index 000000000..07db54e48
--- /dev/null
+++ b/src/spdk/module/bdev/null/bdev_null.h
@@ -0,0 +1,67 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_NULL_H
+#define SPDK_BDEV_NULL_H
+
+#include "spdk/stdinc.h"
+
+typedef void (*spdk_delete_null_complete)(void *cb_arg, int bdeverrno);
+
+struct spdk_bdev;
+struct spdk_uuid;
+
+struct spdk_null_bdev_opts {
+ const char *name;
+ const struct spdk_uuid *uuid;
+ uint64_t num_blocks;
+ uint32_t block_size;
+ uint32_t md_size;
+ bool md_interleave;
+ enum spdk_dif_type dif_type;
+ bool dif_is_head_of_md;
+};
+
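+/**
+ * Create a null bdev from the given options (editorial doc comment;
+ * behavior inferred from bdev_null.c).
+ *
+ * \param bdev Output pointer for the created bdev.
+ * \param opts Options describing the bdev.
+ * \return 0 on success, negative errno on failure.
+ */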
+int bdev_null_create(struct spdk_bdev **bdev, const struct spdk_null_bdev_opts *opts);
+
+/**
+ * Delete null bdev.
+ *
+ * \param bdev Pointer to null bdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void bdev_null_delete(struct spdk_bdev *bdev, spdk_delete_null_complete cb_fn,
+ void *cb_arg);
+
+#endif /* SPDK_BDEV_NULL_H */
diff --git a/src/spdk/module/bdev/null/bdev_null_rpc.c b/src/spdk/module/bdev/null/bdev_null_rpc.c
new file mode 100644
index 000000000..f3a433e75
--- /dev/null
+++ b/src/spdk/module/bdev/null/bdev_null_rpc.c
@@ -0,0 +1,204 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+#include "bdev_null.h"
+
+struct rpc_construct_null {
+ char *name;
+ char *uuid;
+ uint64_t num_blocks;
+ uint32_t block_size;
+ uint32_t md_size;
+ int32_t dif_type;
+ bool dif_is_head_of_md;
+};
+
+static void
+free_rpc_construct_null(struct rpc_construct_null *req)
+{
+ free(req->name);
+ free(req->uuid);
+}
+
+static const struct spdk_json_object_decoder rpc_construct_null_decoders[] = {
+ {"name", offsetof(struct rpc_construct_null, name), spdk_json_decode_string},
+ {"uuid", offsetof(struct rpc_construct_null, uuid), spdk_json_decode_string, true},
+ {"num_blocks", offsetof(struct rpc_construct_null, num_blocks), spdk_json_decode_uint64},
+ {"block_size", offsetof(struct rpc_construct_null, block_size), spdk_json_decode_uint32},
+ {"md_size", offsetof(struct rpc_construct_null, md_size), spdk_json_decode_uint32, true},
+ {"dif_type", offsetof(struct rpc_construct_null, dif_type), spdk_json_decode_int32, true},
+ {"dif_is_head_of_md", offsetof(struct rpc_construct_null, dif_is_head_of_md), spdk_json_decode_bool, true},
+};
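+
+/*
+ * Example request (illustrative): per the decoders above, only "name",
+ * "num_blocks" and "block_size" are required, e.g.
+ *
+ *     {"jsonrpc": "2.0", "id": 1, "method": "bdev_null_create",
+ *      "params": {"name": "Null0", "num_blocks": 262144, "block_size": 512}}
+ */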
+
+static void
+rpc_bdev_null_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_null req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_uuid *uuid = NULL;
+ struct spdk_uuid decoded_uuid;
+ struct spdk_bdev *bdev;
+ struct spdk_null_bdev_opts opts = {};
+ uint32_t data_block_size;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_construct_null_decoders,
+ SPDK_COUNTOF(rpc_construct_null_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NULL, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ if (req.block_size < req.md_size) {
+ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL,
+ "Interleaved metadata size can not be greater than block size");
+ goto cleanup;
+ }
+ data_block_size = req.block_size - req.md_size;
+ if (data_block_size % 512 != 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL,
+ "Data block size %u is not a multiple of 512", req.block_size);
+ goto cleanup;
+ }
+
+ if (req.num_blocks == 0) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL,
+ "Disk num_blocks must be greater than 0");
+ goto cleanup;
+ }
+
+ if (req.uuid) {
+ if (spdk_uuid_parse(&decoded_uuid, req.uuid)) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL,
+ "Failed to parse bdev UUID");
+ goto cleanup;
+ }
+ uuid = &decoded_uuid;
+ }
+
+ if (req.dif_type < SPDK_DIF_DISABLE || req.dif_type > SPDK_DIF_TYPE3) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL, "Invalid protection information type");
+ goto cleanup;
+ }
+
+ opts.name = req.name;
+ opts.uuid = uuid;
+ opts.num_blocks = req.num_blocks;
+ opts.block_size = req.block_size;
+ opts.md_size = req.md_size;
+ opts.md_interleave = true;
+ opts.dif_type = req.dif_type;
+ opts.dif_is_head_of_md = req.dif_is_head_of_md;
+ rc = bdev_null_create(&bdev, &opts);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, bdev->name);
+ spdk_jsonrpc_end_result(request, w);
+ free_rpc_construct_null(&req);
+ return;
+
+cleanup:
+ free_rpc_construct_null(&req);
+}
+SPDK_RPC_REGISTER("bdev_null_create", rpc_bdev_null_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_null_create, construct_null_bdev)
+
+struct rpc_delete_null {
+ char *name;
+};
+
+static void
+free_rpc_delete_null(struct rpc_delete_null *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_null_decoders[] = {
+ {"name", offsetof(struct rpc_delete_null, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_null_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_null_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_null req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_null_decoders,
+ SPDK_COUNTOF(rpc_delete_null_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ bdev_null_delete(bdev, rpc_bdev_null_delete_cb, request);
+
+ free_rpc_delete_null(&req);
+
+ return;
+
+cleanup:
+ free_rpc_delete_null(&req);
+}
+SPDK_RPC_REGISTER("bdev_null_delete", rpc_bdev_null_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_null_delete, delete_null_bdev)
diff --git a/src/spdk/module/bdev/nvme/Makefile b/src/spdk/module/bdev/nvme/Makefile
new file mode 100644
index 000000000..f9ddb2389
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/Makefile
@@ -0,0 +1,50 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+C_SRCS = bdev_nvme.c bdev_nvme_rpc.c nvme_rpc.c common.c bdev_ocssd.c bdev_ocssd_rpc.c
+C_SRCS-$(CONFIG_NVME_CUSE) += bdev_nvme_cuse_rpc.c
+
+ifeq ($(OS),Linux)
+C_SRCS += vbdev_opal.c vbdev_opal_rpc.c
+endif
+LIBNAME = bdev_nvme
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/nvme/bdev_nvme.c b/src/spdk/module/bdev/nvme/bdev_nvme.c
new file mode 100644
index 000000000..4a89b8eb2
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/bdev_nvme.c
@@ -0,0 +1,2924 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "bdev_nvme.h"
+#include "bdev_ocssd.h"
+
+#include "spdk/config.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/bdev.h"
+#include "spdk/json.h"
+#include "spdk/nvme.h"
+#include "spdk/nvme_ocssd.h"
+#include "spdk/thread.h"
+#include "spdk/string.h"
+#include "spdk/likely.h"
+#include "spdk/util.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
+
+static void bdev_nvme_get_spdk_running_config(FILE *fp);
+static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
+
+struct nvme_bdev_io {
+ /** Array of iovecs to transfer. */
+ struct iovec *iovs;
+
+ /** Number of iovecs in iovs array. */
+ int iovcnt;
+
+ /** Current iovec position. */
+ int iovpos;
+
+ /** Offset in current iovec. */
+ uint32_t iov_offset;
+
+ /** Array of iovecs for the fused (second) command of a compare-and-write. */
+ struct iovec *fused_iovs;
+
+ /** Number of iovecs in fused_iovs array. */
+ int fused_iovcnt;
+
+ /** Current iovec position in the fused command. */
+ int fused_iovpos;
+
+ /** Offset in current iovec of the fused command. */
+ uint32_t fused_iov_offset;
+
+ /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
+ struct spdk_nvme_cpl cpl;
+
+ /** Originating thread */
+ struct spdk_thread *orig_thread;
+
+ /** Tracks whether the first of the fused commands has been submitted. */
+ bool first_fused_submitted;
+};
+
+struct nvme_probe_ctx {
+ size_t count;
+ struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
+ struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
+ const char *names[NVME_MAX_CONTROLLERS];
+ uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
+ const char *hostnqn;
+};
+
+struct nvme_probe_skip_entry {
+ struct spdk_nvme_transport_id trid;
+ TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
+};
+/* Controllers deleted by users via RPC are skipped by the hotplug monitor. */
+static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
+ g_skipped_nvme_ctrlrs);
+
+static struct spdk_bdev_nvme_opts g_opts = {
+ .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
+ .timeout_us = 0,
+ .retry_count = 4,
+ .arbitration_burst = 0,
+ .low_priority_weight = 0,
+ .medium_priority_weight = 0,
+ .high_priority_weight = 0,
+ .nvme_adminq_poll_period_us = 10000ULL,
+ .nvme_ioq_poll_period_us = 0,
+ .io_queue_requests = 0,
+ .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
+};
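+
+/*
+ * Editorial note: these are only the compiled-in defaults; they can be
+ * overridden at runtime (e.g. via the bdev_nvme_set_options RPC in
+ * bdev_nvme_rpc.c) before controllers are attached.
+ */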
+
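+/* Hotplug poll period bounds, in microseconds (see g_nvme_hotplug_poll_period_us). */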
+#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
+#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL
+
+static int g_hot_insert_nvme_controller_index = 0;
+static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
+static bool g_nvme_hotplug_enabled = false;
+static struct spdk_thread *g_bdev_nvme_init_thread;
+static struct spdk_poller *g_hotplug_poller;
+static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
+static char *g_nvme_hostnqn = NULL;
+
+static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
+ struct nvme_async_probe_ctx *ctx);
+static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx);
+static int bdev_nvme_library_init(void);
+static void bdev_nvme_library_fini(void);
+static int bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
+static int bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
+static int bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
+static int bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
+static int bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
+ int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba);
+static int bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
+static int bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
+static int bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
+static int bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio);
+static int bdev_nvme_abort(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
+
+typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
+ struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
+static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
+ struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
+
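+/*
+ * Editorial note: the three function tables below are indexed by the
+ * namespace type (unknown, standard NVMe, OCSSD), which is why index 0
+ * is intentionally NULL in each of them.
+ */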
+static populate_namespace_fn g_populate_namespace_fn[] = {
+ NULL,
+ nvme_ctrlr_populate_standard_namespace,
+ bdev_ocssd_populate_namespace,
+};
+
+typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *ns);
+static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns);
+
+static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
+ NULL,
+ nvme_ctrlr_depopulate_standard_namespace,
+ bdev_ocssd_depopulate_namespace,
+};
+
+typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns);
+static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
+ struct nvme_bdev_ns *ns);
+
+static config_json_namespace_fn g_config_json_namespace_fn[] = {
+ NULL,
+ nvme_ctrlr_config_json_standard_namespace,
+ bdev_ocssd_namespace_config_json,
+};
+
+struct spdk_nvme_qpair *
+bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
+{
+ struct nvme_io_channel *nvme_ch;
+
+ nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
+
+ return nvme_ch->qpair;
+}
+
+static int
+bdev_nvme_get_ctx_size(void)
+{
+ return sizeof(struct nvme_bdev_io);
+}
+
+static struct spdk_bdev_module nvme_if = {
+ .name = "nvme",
+ .async_fini = true,
+ .module_init = bdev_nvme_library_init,
+ .module_fini = bdev_nvme_library_fini,
+ .config_text = bdev_nvme_get_spdk_running_config,
+ .config_json = bdev_nvme_config_json,
+ .get_ctx_size = bdev_nvme_get_ctx_size,
+
+SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
+
+static void
+bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "qpar %p is disconnected, attempting reconnect.\n", qpair);
+ /*
+ * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
+ * reconnect a qpair and we will stop getting a callback for this one.
+ */
+ spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
+}
+
+static int
+bdev_nvme_poll(void *arg)
+{
+ struct nvme_bdev_poll_group *group = arg;
+ int64_t num_completions;
+
+ if (group->collect_spin_stat && group->start_ticks == 0) {
+ group->start_ticks = spdk_get_ticks();
+ }
+
+ num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
+ bdev_nvme_disconnected_qpair_cb);
+ if (group->collect_spin_stat) {
+ if (num_completions > 0) {
+ if (group->end_ticks != 0) {
+ group->spin_ticks += (group->end_ticks - group->start_ticks);
+ group->end_ticks = 0;
+ }
+ group->start_ticks = 0;
+ } else {
+ group->end_ticks = spdk_get_ticks();
+ }
+ }
+
+ return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
+}
+
+static int
+bdev_nvme_poll_adminq(void *arg)
+{
+ int32_t rc;
+ struct spdk_nvme_ctrlr *ctrlr = arg;
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+
+ rc = spdk_nvme_ctrlr_process_admin_completions(ctrlr);
+
+ if (rc < 0) {
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
+ assert(nvme_bdev_ctrlr != NULL);
+ bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
+ }
+
+ return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
+}
+
+static int
+bdev_nvme_destruct(void *ctx)
+{
+ struct nvme_bdev *nvme_disk = ctx;
+
+ nvme_bdev_detach_bdev_from_ns(nvme_disk);
+
+ free(nvme_disk->disk.name);
+ free(nvme_disk);
+
+ return 0;
+}
+
+static int
+bdev_nvme_flush(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio,
+ uint64_t offset, uint64_t nbytes)
+{
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);
+
+ return 0;
+}
+
+static void
+_bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
+ struct spdk_bdev_io *bdev_io;
+ enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ /* A NULL ctx means success. */
+ if (spdk_io_channel_iter_get_ctx(i) != NULL) {
+ status = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+
+ while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
+ bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
+ TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
+ spdk_bdev_io_complete(bdev_io, status);
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+_bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
+{
+ /*
+ * We use the for_each_channel cb_arg like a return code: zero means the
+ * reset succeeded, non-zero means it failed.
+ */
+ void *cb_arg = NULL;
+
+ if (rc) {
+ cb_arg = (void *)0x1;
+ SPDK_ERRLOG("Resetting controller failed.\n");
+ } else {
+ SPDK_NOTICELOG("Resetting controller successful.\n");
+ }
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ nvme_bdev_ctrlr->resetting = false;
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ /* Make sure we clear any pending resets before returning. */
+ spdk_for_each_channel(nvme_bdev_ctrlr,
+ _bdev_nvme_complete_pending_resets,
+ cb_arg, NULL);
+}
+
+static void
+_bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
+ void *ctx = spdk_io_channel_iter_get_ctx(i);
+ int rc = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ if (status) {
+ rc = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+ if (ctx) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc);
+ }
+ _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
+}
+
+static void
+_bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
+ struct spdk_nvme_io_qpair_opts opts;
+
+ spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
+ opts.delay_cmd_submit = g_opts.delay_cmd_submit;
+ opts.create_only = true;
+
+ nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
+ if (!nvme_ch->qpair) {
+ spdk_for_each_channel_continue(i, -1);
+ return;
+ }
+
+ assert(nvme_ch->group != NULL);
+ if (spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair) != 0) {
+ SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
+ spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
+ spdk_for_each_channel_continue(i, -1);
+ return;
+ }
+
+ if (spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair)) {
+ SPDK_ERRLOG("Unable to connect I/O qpair.\n");
+ spdk_nvme_poll_group_remove(nvme_ch->group->group, nvme_ch->qpair);
+ spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
+ spdk_for_each_channel_continue(i, -1);
+ return;
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+_bdev_nvme_reset(struct spdk_io_channel_iter *i, int status)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i);
+ struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i);
+ int rc;
+
+ if (status) {
+ if (bio) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
+ return;
+ }
+
+ rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
+ if (rc != 0) {
+ if (bio) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ _bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
+ return;
+ }
+
+ /* Recreate all of the I/O queue pairs */
+ spdk_for_each_channel(nvme_bdev_ctrlr,
+ _bdev_nvme_reset_create_qpair,
+ bio,
+ _bdev_nvme_reset_create_qpairs_done);
+}
+
+static void
+_bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
+ if (!rc) {
+ nvme_ch->qpair = NULL;
+ }
+
+ spdk_for_each_channel_continue(i, rc);
+}
+
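+/*
+ * Reset sequence (editorial summary): destroy the I/O qpair on every
+ * channel (_bdev_nvme_reset_destroy_qpair), reset the controller and
+ * recreate/reconnect the qpairs (_bdev_nvme_reset ->
+ * _bdev_nvme_reset_create_qpair), then complete any resets that were
+ * queued while this one was in flight (_bdev_nvme_reset_complete).
+ */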
+static int
+bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_io *bio)
+{
+ struct spdk_io_channel *ch;
+ struct nvme_io_channel *nvme_ch;
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ if (nvme_bdev_ctrlr->destruct) {
+ /* Don't bother resetting if the controller is in the process of being destructed. */
+ if (bio) {
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ return 0;
+ }
+
+ if (!nvme_bdev_ctrlr->resetting) {
+ nvme_bdev_ctrlr->resetting = true;
+ } else {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
+ /*
+ * Internal reset calls are not queued. This is deliberate, so that we don't
+ * interfere with the app framework's reset strategy; i.e., we defer to the
+ * upper level. If it is already mid-reset, we won't try to schedule another one.
+ */
+ if (bio) {
+ ch = spdk_get_io_channel(nvme_bdev_ctrlr);
+ assert(ch != NULL);
+ nvme_ch = spdk_io_channel_get_ctx(ch);
+ TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, spdk_bdev_io_from_ctx(bio), module_link);
+ spdk_put_io_channel(ch);
+ }
+ return 0;
+ }
+
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ /* First, delete all NVMe I/O queue pairs. */
+ spdk_for_each_channel(nvme_bdev_ctrlr,
+ _bdev_nvme_reset_destroy_qpair,
+ bio,
+ _bdev_nvme_reset);
+
+ return 0;
+}
+
+static int
+bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ uint64_t offset_blocks,
+ uint64_t num_blocks);
+
+static void
+bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ int ret;
+
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ ret = bdev_nvme_readv((struct nvme_bdev *)bdev_io->bdev->ctxt,
+ ch,
+ (struct nvme_bdev_io *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->u.bdev.offset_blocks);
+
+ if (spdk_likely(ret == 0)) {
+ return;
+ } else if (ret == -ENOMEM) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static int
+_bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
+ struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
+ struct nvme_bdev_io *nbdev_io_to_abort;
+
+ if (nvme_ch->qpair == NULL) {
+ /* The device is currently resetting */
+ return -1;
+ }
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ return 0;
+
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return bdev_nvme_writev(nbdev,
+ ch,
+ nbdev_io,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->u.bdev.offset_blocks);
+
+ case SPDK_BDEV_IO_TYPE_COMPARE:
+ return bdev_nvme_comparev(nbdev,
+ ch,
+ nbdev_io,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->u.bdev.offset_blocks);
+
+ case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
+ return bdev_nvme_comparev_and_writev(nbdev,
+ ch,
+ nbdev_io,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.fused_iovs,
+ bdev_io->u.bdev.fused_iovcnt,
+ bdev_io->u.bdev.md_buf,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->u.bdev.offset_blocks);
+
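+ /* WRITE_ZEROES is serviced as a deallocate (unmap); it is only advertised
+ * when deallocated blocks are guaranteed to read back as zeroes, see
+ * bdev_nvme_io_type_supported().
+ */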
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ return bdev_nvme_unmap(nbdev,
+ ch,
+ nbdev_io,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks);
+
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ return bdev_nvme_unmap(nbdev,
+ ch,
+ nbdev_io,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks);
+
+ case SPDK_BDEV_IO_TYPE_RESET:
+ return bdev_nvme_reset(nbdev->nvme_bdev_ctrlr, nbdev_io);
+
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ return bdev_nvme_flush(nbdev,
+ nbdev_io,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks);
+
+ case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
+ return bdev_nvme_admin_passthru(nbdev,
+ ch,
+ nbdev_io,
+ &bdev_io->u.nvme_passthru.cmd,
+ bdev_io->u.nvme_passthru.buf,
+ bdev_io->u.nvme_passthru.nbytes);
+
+ case SPDK_BDEV_IO_TYPE_NVME_IO:
+ return bdev_nvme_io_passthru(nbdev,
+ ch,
+ nbdev_io,
+ &bdev_io->u.nvme_passthru.cmd,
+ bdev_io->u.nvme_passthru.buf,
+ bdev_io->u.nvme_passthru.nbytes);
+
+ case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
+ return bdev_nvme_io_passthru_md(nbdev,
+ ch,
+ nbdev_io,
+ &bdev_io->u.nvme_passthru.cmd,
+ bdev_io->u.nvme_passthru.buf,
+ bdev_io->u.nvme_passthru.nbytes,
+ bdev_io->u.nvme_passthru.md_buf,
+ bdev_io->u.nvme_passthru.md_len);
+
+ case SPDK_BDEV_IO_TYPE_ABORT:
+ nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
+ return bdev_nvme_abort(nbdev,
+ ch,
+ nbdev_io,
+ nbdev_io_to_abort);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+static void
+bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ int rc = _bdev_nvme_submit_request(ch, bdev_io);
+
+ if (spdk_unlikely(rc != 0)) {
+ if (rc == -ENOMEM) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+static bool
+bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct nvme_bdev *nbdev = ctx;
+ const struct spdk_nvme_ctrlr_data *cdata;
+
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
+ case SPDK_BDEV_IO_TYPE_NVME_IO:
+ case SPDK_BDEV_IO_TYPE_ABORT:
+ return true;
+
+ case SPDK_BDEV_IO_TYPE_COMPARE:
+ return spdk_nvme_ns_supports_compare(nbdev->nvme_ns->ns);
+
+ case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
+ return spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns) ? true : false;
+
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr);
+ return cdata->oncs.dsm;
+
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_bdev_ctrlr->ctrlr);
+ /*
+ * If an NVMe controller guarantees reading unallocated blocks returns zero,
+ * we can implement WRITE_ZEROES as an NVMe deallocate command.
+ */
+ if (cdata->oncs.dsm &&
+ spdk_nvme_ns_get_dealloc_logical_block_read_value(nbdev->nvme_ns->ns) ==
+ SPDK_NVME_DEALLOC_READ_00) {
+ return true;
+ }
+ /*
+ * The NVMe Write Zeroes command is currently not used by this driver: an
+ * arbitrarily large write_zeroes request submitted to the controller would
+ * fail. Until this is resolved, we only claim write_zeroes support when
+ * deallocated blocks are guaranteed to read back as zeroes.
+ */
+ return false;
+
+ case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
+ if (spdk_nvme_ctrlr_get_flags(nbdev->nvme_bdev_ctrlr->ctrlr) &
+ SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
+ return true;
+ }
+ return false;
+
+ default:
+ return false;
+ }
+}
+
+static int
+bdev_nvme_create_cb(void *io_device, void *ctx_buf)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
+ struct nvme_io_channel *ch = ctx_buf;
+ struct spdk_nvme_io_qpair_opts opts;
+ struct spdk_io_channel *pg_ch = NULL;
+ int rc;
+
+ spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
+ opts.delay_cmd_submit = g_opts.delay_cmd_submit;
+ opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
+ opts.create_only = true;
+ g_opts.io_queue_requests = opts.io_queue_requests;
+
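+ /* The qpair is created unconnected (create_only) so that it can be added to
+ * the poll group before being connected below.
+ */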
+ ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_bdev_ctrlr->ctrlr, &opts, sizeof(opts));
+
+ if (ch->qpair == NULL) {
+ return -1;
+ }
+
+ if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
+ if (bdev_ocssd_create_io_channel(ch)) {
+ goto err;
+ }
+ }
+
+ pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
+ if (!pg_ch) {
+ goto err;
+ }
+
+ ch->group = spdk_io_channel_get_ctx(pg_ch);
+ if (spdk_nvme_poll_group_add(ch->group->group, ch->qpair) != 0) {
+ goto err;
+ }
+
+ rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_bdev_ctrlr->ctrlr, ch->qpair);
+ if (rc) {
+ spdk_nvme_poll_group_remove(ch->group->group, ch->qpair);
+ goto err;
+ }
+
+#ifdef SPDK_CONFIG_VTUNE
+ ch->group->collect_spin_stat = true;
+#else
+ ch->group->collect_spin_stat = false;
+#endif
+
+ TAILQ_INIT(&ch->pending_resets);
+ return 0;
+
+err:
+ if (pg_ch) {
+ spdk_put_io_channel(pg_ch);
+ }
+ spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
+ return -1;
+}
+
+static void
+bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
+ struct nvme_io_channel *ch = ctx_buf;
+ struct nvme_bdev_poll_group *group;
+
+ group = ch->group;
+ assert(group != NULL);
+
+ if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
+ bdev_ocssd_destroy_io_channel(ch);
+ }
+
+ if (ch->qpair != NULL) {
+ spdk_nvme_poll_group_remove(group->group, ch->qpair);
+ }
+ spdk_put_io_channel(spdk_io_channel_from_ctx(group));
+
+ spdk_nvme_ctrlr_free_io_qpair(ch->qpair);
+}
+
+static int
+bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
+{
+ struct nvme_bdev_poll_group *group = ctx_buf;
+
+ group->group = spdk_nvme_poll_group_create(group);
+ if (group->group == NULL) {
+ return -1;
+ }
+
+ group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);
+
+ if (group->poller == NULL) {
+ spdk_nvme_poll_group_destroy(group->group);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct nvme_bdev_poll_group *group = ctx_buf;
+
+ spdk_poller_unregister(&group->poller);
+ if (spdk_nvme_poll_group_destroy(group->group)) {
+ SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.");
+ assert(false);
+ }
+}
+
+static struct spdk_io_channel *
+bdev_nvme_get_io_channel(void *ctx)
+{
+ struct nvme_bdev *nvme_bdev = ctx;
+
+ return spdk_get_io_channel(nvme_bdev->nvme_bdev_ctrlr);
+}
+
+static int
+bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct nvme_bdev *nvme_bdev = ctx;
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_bdev->nvme_bdev_ctrlr;
+ const struct spdk_nvme_ctrlr_data *cdata;
+ struct spdk_nvme_ns *ns;
+ union spdk_nvme_vs_register vs;
+ union spdk_nvme_csts_register csts;
+ char buf[128];
+
+ cdata = spdk_nvme_ctrlr_get_data(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
+ vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
+ csts = spdk_nvme_ctrlr_get_regs_csts(nvme_bdev->nvme_bdev_ctrlr->ctrlr);
+ ns = nvme_bdev->nvme_ns->ns;
+
+ spdk_json_write_named_object_begin(w, "nvme");
+
+ if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ spdk_json_write_named_string(w, "pci_address", nvme_bdev_ctrlr->trid->traddr);
+ }
+
+ spdk_json_write_named_object_begin(w, "trid");
+
+ nvme_bdev_dump_trid_json(nvme_bdev_ctrlr->trid, w);
+
+ spdk_json_write_object_end(w);
+
+#ifdef SPDK_CONFIG_NVME_CUSE
+ size_t cuse_name_size = 128;
+ char cuse_name[cuse_name_size];
+
+ int rc = spdk_nvme_cuse_get_ns_name(nvme_bdev->nvme_bdev_ctrlr->ctrlr, spdk_nvme_ns_get_id(ns),
+ cuse_name, &cuse_name_size);
+ if (rc == 0) {
+ spdk_json_write_named_string(w, "cuse_device", cuse_name);
+ }
+#endif
+
+ spdk_json_write_named_object_begin(w, "ctrlr_data");
+
+ spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
+
+ snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
+ spdk_str_trim(buf);
+ spdk_json_write_named_string(w, "model_number", buf);
+
+ snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
+ spdk_str_trim(buf);
+ spdk_json_write_named_string(w, "serial_number", buf);
+
+ snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
+ spdk_str_trim(buf);
+ spdk_json_write_named_string(w, "firmware_revision", buf);
+
+ spdk_json_write_named_object_begin(w, "oacs");
+
+ spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
+ spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
+ spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
+ spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_object_begin(w, "vs");
+
+ spdk_json_write_name(w, "nvme_version");
+ if (vs.bits.ter) {
+ spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
+ } else {
+ spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
+ }
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_object_begin(w, "csts");
+
+ spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
+ spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_object_begin(w, "ns_data");
+
+ spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
+
+ spdk_json_write_object_end(w);
+
+ if (cdata->oacs.security) {
+ spdk_json_write_named_object_begin(w, "security");
+
+ spdk_json_write_named_bool(w, "opal", nvme_bdev_ctrlr->opal_dev ? true : false);
+
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static void
+bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ /* No config per bdev needed */
+}
+
+static uint64_t
+bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ struct nvme_bdev_poll_group *group = nvme_ch->group;
+ uint64_t spin_time;
+
+ if (!group || !group->collect_spin_stat) {
+ return 0;
+ }
+
+ if (group->end_ticks != 0) {
+ group->spin_ticks += (group->end_ticks - group->start_ticks);
+ group->end_ticks = 0;
+ }
+
+ spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
+ group->start_ticks = 0;
+ group->spin_ticks = 0;
+
+ return spin_time;
+}
+
+static const struct spdk_bdev_fn_table nvmelib_fn_table = {
+ .destruct = bdev_nvme_destruct,
+ .submit_request = bdev_nvme_submit_request,
+ .io_type_supported = bdev_nvme_io_type_supported,
+ .get_io_channel = bdev_nvme_get_io_channel,
+ .dump_info_json = bdev_nvme_dump_info_json,
+ .write_config_json = bdev_nvme_write_config_json,
+ .get_spin_time = bdev_nvme_get_spin_time,
+};
+
+static void
+nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
+ struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
+{
+ struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr;
+ struct nvme_bdev *bdev;
+ struct spdk_nvme_ns *ns;
+ const struct spdk_uuid *uuid;
+ const struct spdk_nvme_ctrlr_data *cdata;
+ const struct spdk_nvme_ns_data *nsdata;
+ int rc;
+
+ cdata = spdk_nvme_ctrlr_get_data(ctrlr);
+
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
+ if (!ns) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Invalid NS %u\n", nvme_ns->id);
+ nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -EINVAL);
+ return;
+ }
+
+ bdev = calloc(1, sizeof(*bdev));
+ if (!bdev) {
+ SPDK_ERRLOG("bdev calloc() failed\n");
+ nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
+ return;
+ }
+
+ bdev->nvme_bdev_ctrlr = nvme_bdev_ctrlr;
+ nvme_ns->ns = ns;
+ bdev->nvme_ns = nvme_ns;
+
+ bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_bdev_ctrlr->name, spdk_nvme_ns_get_id(ns));
+ if (!bdev->disk.name) {
+ free(bdev);
+ nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, -ENOMEM);
+ return;
+ }
+ bdev->disk.product_name = "NVMe disk";
+
+ bdev->disk.write_cache = 0;
+ if (cdata->vwc.present) {
+ /* Enable if the Volatile Write Cache exists */
+ bdev->disk.write_cache = 1;
+ }
+ bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
+ bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns);
+ bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
+
+ uuid = spdk_nvme_ns_get_uuid(ns);
+ if (uuid != NULL) {
+ bdev->disk.uuid = *uuid;
+ }
+
+ nsdata = spdk_nvme_ns_get_data(ns);
+
+ bdev->disk.md_len = spdk_nvme_ns_get_md_size(ns);
+ if (bdev->disk.md_len != 0) {
+ bdev->disk.md_interleave = nsdata->flbas.extended;
+ bdev->disk.dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
+ if (bdev->disk.dif_type != SPDK_DIF_DISABLE) {
+ bdev->disk.dif_is_head_of_md = nsdata->dps.md_start;
+ bdev->disk.dif_check_flags = nvme_bdev_ctrlr->prchk_flags;
+ }
+ }
+
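+ /* Atomic compare-and-write unit: zero when fused compare-and-write is not
+ * supported, the namespace-specific NACWU when the namespace reports one,
+ * otherwise the controller-wide ACWU.
+ */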
+ if (!bdev_nvme_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
+ bdev->disk.acwu = 0;
+ } else if (nsdata->nsfeat.ns_atomic_write_unit) {
+ bdev->disk.acwu = nsdata->nacwu;
+ } else {
+ bdev->disk.acwu = cdata->acwu;
+ }
+
+ bdev->disk.ctxt = bdev;
+ bdev->disk.fn_table = &nvmelib_fn_table;
+ bdev->disk.module = &nvme_if;
+ rc = spdk_bdev_register(&bdev->disk);
+ if (rc) {
+ free(bdev->disk.name);
+ free(bdev);
+ nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
+ return;
+ }
+
+ nvme_bdev_attach_bdev_to_ns(nvme_ns, bdev);
+ nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, 0);
+}
+
+static bool
+hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr_opts *opts)
+{
+ struct nvme_probe_skip_entry *entry;
+
+ TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
+ if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
+ return false;
+ }
+ }
+
+ opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
+ opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
+ opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
+ opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attaching to %s\n", trid->traddr);
+
+ return true;
+}
+
+static bool
+probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr_opts *opts)
+{
+ struct nvme_probe_ctx *ctx = cb_ctx;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Probing device %s\n", trid->traddr);
+
+ if (nvme_bdev_ctrlr_get(trid)) {
+ SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
+ trid->traddr);
+ return false;
+ }
+
+ if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ bool claim_device = false;
+ size_t i;
+
+ for (i = 0; i < ctx->count; i++) {
+ if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
+ claim_device = true;
+ break;
+ }
+ }
+
+ if (!claim_device) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Not claiming device at %s\n", trid->traddr);
+ return false;
+ }
+ }
+
+ if (ctx->hostnqn) {
+ snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", ctx->hostnqn);
+ }
+
+ opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
+ opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
+ opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
+ opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
+
+ return true;
+}
+
+static void
+nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ctx;
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_WARNLOG("Abort failed. Resetting controller.\n");
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
+ assert(nvme_bdev_ctrlr != NULL);
+ bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
+ }
+}
+
+static void
+timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair, uint16_t cid)
+{
+ int rc;
+ union spdk_nvme_csts_register csts;
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+
+ SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);
+
+ csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
+ if (csts.bits.cfs) {
+ SPDK_ERRLOG("Controller Fatal Status, reset required\n");
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
+ assert(nvme_bdev_ctrlr != NULL);
+ bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
+ return;
+ }
+
+ switch (g_opts.action_on_timeout) {
+ case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
+ if (qpair) {
+ rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
+ nvme_abort_cpl, ctrlr);
+ if (rc == 0) {
+ return;
+ }
+
+ SPDK_ERRLOG("Unable to send abort. Resetting.\n");
+ }
+
+ /* FALLTHROUGH */
+ case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(spdk_nvme_ctrlr_get_transport_id(ctrlr));
+ assert(nvme_bdev_ctrlr != NULL);
+ bdev_nvme_reset(nvme_bdev_ctrlr, NULL);
+ break;
+ case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "No action for NVMe controller timeout.\n");
+ break;
+ default:
+ SPDK_ERRLOG("Invalid timeout action value.\n");
+ break;
+ }
+}
+
+void
+nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
+{
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ nvme_bdev_ctrlr->ref--;
+
+ if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
+ return;
+ }
+
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+}
+
+static void
+nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *ns)
+{
+ struct nvme_bdev *bdev, *tmp;
+
+ TAILQ_FOREACH_SAFE(bdev, &ns->bdevs, tailq, tmp) {
+ spdk_bdev_unregister(&bdev->disk, NULL, NULL);
+ }
+
+ ns->populated = false;
+
+ nvme_ctrlr_depopulate_namespace_done(ns->ctrlr);
+}
+
+static void
+nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns,
+ struct nvme_async_probe_ctx *ctx)
+{
+ g_populate_namespace_fn[ns->type](ctrlr, ns, ctx);
+}
+
+static void
+nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *ns)
+{
+ g_depopulate_namespace_fn[ns->type](ns);
+}
+
+void
+nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
+ struct nvme_bdev_ns *ns, int rc)
+{
+ if (rc == 0) {
+ ns->populated = true;
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ ns->ctrlr->ref++;
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ } else {
+ memset(ns, 0, sizeof(*ns));
+ }
+
+ if (ctx) {
+ ctx->populates_in_progress--;
+ if (ctx->populates_in_progress == 0) {
+ nvme_ctrlr_populate_namespaces_done(ctx);
+ }
+ }
+}
+
+static void
+nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
+ struct nvme_async_probe_ctx *ctx)
+{
+ struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr;
+ struct nvme_bdev_ns *ns;
+ struct spdk_nvme_ns *nvme_ns;
+ struct nvme_bdev *bdev;
+ uint32_t i;
+ int rc;
+ uint64_t num_sectors;
+ bool ns_is_active;
+
+ if (ctx) {
+ /* Initialize this count to 1 to handle the populate functions
+ * calling nvme_ctrlr_populate_namespace_done() immediately.
+ */
+ ctx->populates_in_progress = 1;
+ }
+
+ for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
+ uint32_t nsid = i + 1;
+
+ ns = nvme_bdev_ctrlr->namespaces[i];
+ ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
+
+ if (ns->populated && ns_is_active && ns->type == NVME_BDEV_NS_STANDARD) {
+ /* NS is still there but attributes may have changed */
+ nvme_ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ num_sectors = spdk_nvme_ns_get_num_sectors(nvme_ns);
+ bdev = TAILQ_FIRST(&ns->bdevs);
+ if (bdev->disk.blockcnt != num_sectors) {
+ SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %lu, new size %lu\n",
+ nsid,
+ bdev->disk.name,
+ bdev->disk.blockcnt,
+ num_sectors);
+ rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
+ if (rc != 0) {
+ SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
+ bdev->disk.name, rc);
+ }
+ }
+ }
+
+ if (!ns->populated && ns_is_active) {
+ ns->id = nsid;
+ ns->ctrlr = nvme_bdev_ctrlr;
+ if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
+ ns->type = NVME_BDEV_NS_OCSSD;
+ } else {
+ ns->type = NVME_BDEV_NS_STANDARD;
+ }
+
+ TAILQ_INIT(&ns->bdevs);
+
+ if (ctx) {
+ ctx->populates_in_progress++;
+ }
+ nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, ns, ctx);
+ }
+
+ if (ns->populated && !ns_is_active) {
+ nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
+ }
+ }
+
+ if (ctx) {
+ /* Decrement this count now that the loop is over to account
+ * for the one we started with. If the count is then 0, we
+ * know any populate_namespace functions completed immediately,
+ * so we'll kick the callback here.
+ */
+ ctx->populates_in_progress--;
+ if (ctx->populates_in_progress == 0) {
+ nvme_ctrlr_populate_namespaces_done(ctx);
+ }
+ }
+
+}
+
+static void
+aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
+ union spdk_nvme_async_event_completion event;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_WARNLOG("AER request execute failed");
+ return;
+ }
+
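+ /* Decode the asynchronous event: a namespace-attribute-changed notice
+ * triggers re-enumeration of namespaces; OCSSD chunk notifications are
+ * forwarded to the OCSSD layer.
+ */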
+ event.raw = cpl->cdw0;
+ if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
+ (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
+ nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
+ } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
+ (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
+ spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
+ bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
+ }
+}
+
+static int
+create_ctrlr(struct spdk_nvme_ctrlr *ctrlr,
+ const char *name,
+ const struct spdk_nvme_transport_id *trid,
+ uint32_t prchk_flags)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ uint32_t i;
+ int rc;
+
+ nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr));
+ if (nvme_bdev_ctrlr == NULL) {
+ SPDK_ERRLOG("Failed to allocate device struct\n");
+ return -ENOMEM;
+ }
+
+ nvme_bdev_ctrlr->trid = calloc(1, sizeof(*nvme_bdev_ctrlr->trid));
+ if (nvme_bdev_ctrlr->trid == NULL) {
+ SPDK_ERRLOG("Failed to allocate device trid struct\n");
+ free(nvme_bdev_ctrlr);
+ return -ENOMEM;
+ }
+
+ nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
+ nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *));
+ if (!nvme_bdev_ctrlr->namespaces) {
+ SPDK_ERRLOG("Failed to allocate block namespaces pointer\n");
+ free(nvme_bdev_ctrlr->trid);
+ free(nvme_bdev_ctrlr);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
+ nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns));
+ if (nvme_bdev_ctrlr->namespaces[i] == NULL) {
+ SPDK_ERRLOG("Failed to allocate block namespace struct\n");
+ for (; i > 0; i--) {
+ free(nvme_bdev_ctrlr->namespaces[i - 1]);
+ }
+ free(nvme_bdev_ctrlr->namespaces);
+ free(nvme_bdev_ctrlr->trid);
+ free(nvme_bdev_ctrlr);
+ return -ENOMEM;
+ }
+ }
+
+ nvme_bdev_ctrlr->thread = spdk_get_thread();
+ nvme_bdev_ctrlr->adminq_timer_poller = NULL;
+ nvme_bdev_ctrlr->ctrlr = ctrlr;
+ nvme_bdev_ctrlr->ref = 0;
+ *nvme_bdev_ctrlr->trid = *trid;
+ nvme_bdev_ctrlr->name = strdup(name);
+ if (nvme_bdev_ctrlr->name == NULL) {
+ free(nvme_bdev_ctrlr->namespaces);
+ free(nvme_bdev_ctrlr->trid);
+ free(nvme_bdev_ctrlr);
+ return -ENOMEM;
+ }
+
+ if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
+ rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to initialize OCSSD controller\n");
+ free(nvme_bdev_ctrlr->name);
+ free(nvme_bdev_ctrlr->namespaces);
+ free(nvme_bdev_ctrlr->trid);
+ free(nvme_bdev_ctrlr);
+ return rc;
+ }
+ }
+
+ nvme_bdev_ctrlr->prchk_flags = prchk_flags;
+
+ spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb,
+ sizeof(struct nvme_io_channel),
+ name);
+
+ nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, ctrlr,
+ g_opts.nvme_adminq_poll_period_us);
+
+ TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
+
+ if (g_opts.timeout_us > 0) {
+ spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
+ timeout_cb, NULL);
+ }
+
+ spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr);
+
+ if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) &
+ SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
+ nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr);
+ if (nvme_bdev_ctrlr->opal_dev == NULL) {
+ SPDK_ERRLOG("Failed to initialize Opal\n");
+ }
+ }
+ return 0;
+}
+
+static void
+attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct nvme_probe_ctx *ctx = cb_ctx;
+ char *name = NULL;
+ uint32_t prchk_flags = 0;
+ size_t i;
+
+ if (ctx) {
+ for (i = 0; i < ctx->count; i++) {
+ if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) {
+ prchk_flags = ctx->prchk_flags[i];
+ name = strdup(ctx->names[i]);
+ break;
+ }
+ }
+ } else {
+ name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
+ }
+ if (!name) {
+ SPDK_ERRLOG("Failed to assign name to NVMe device\n");
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attached to %s (%s)\n", trid->traddr, name);
+
+ create_ctrlr(ctrlr, name, trid, prchk_flags);
+
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid);
+ if (!nvme_bdev_ctrlr) {
+ SPDK_ERRLOG("Failed to find new NVMe controller\n");
+ free(name);
+ return;
+ }
+
+ nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
+
+ free(name);
+}
+
+static void
+remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint32_t i;
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct nvme_bdev_ns *ns;
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
+ if (nvme_bdev_ctrlr->ctrlr == ctrlr) {
+ /* The controller's destruction was already started */
+ if (nvme_bdev_ctrlr->destruct) {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ return;
+ }
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
+ uint32_t nsid = i + 1;
+
+ ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
+ if (ns->populated) {
+ assert(ns->id == nsid);
+ nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
+ }
+ }
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ nvme_bdev_ctrlr->destruct = true;
+ if (nvme_bdev_ctrlr->ref == 0) {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
+ } else {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ }
+ return;
+ }
+ }
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+}
+
+static int
+bdev_nvme_hotplug(void *arg)
+{
+ struct spdk_nvme_transport_id trid_pcie;
+ int done;
+
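+ /* Lazily start an async PCIe probe, poll it on each tick, and drop the
+ * context once the probe finishes so the next tick starts a new one.
+ */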
+ if (!g_hotplug_probe_ctx) {
+ memset(&trid_pcie, 0, sizeof(trid_pcie));
+ spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
+
+ g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
+ hotplug_probe_cb,
+ attach_cb, remove_cb);
+ if (!g_hotplug_probe_ctx) {
+ return SPDK_POLLER_BUSY;
+ }
+ }
+
+ done = spdk_nvme_probe_poll_async(g_hotplug_probe_ctx);
+ if (done != -EAGAIN) {
+ g_hotplug_probe_ctx = NULL;
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+void
+bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
+{
+ *opts = g_opts;
+}
+
+int
+bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
+{
+ if (g_bdev_nvme_init_thread != NULL) {
+ if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
+ return -EPERM;
+ }
+ }
+
+ g_opts = *opts;
+
+ return 0;
+}
+
+struct set_nvme_hotplug_ctx {
+ uint64_t period_us;
+ bool enabled;
+ spdk_msg_fn fn;
+ void *fn_ctx;
+};
+
+static void
+set_nvme_hotplug_period_cb(void *_ctx)
+{
+ struct set_nvme_hotplug_ctx *ctx = _ctx;
+
+ spdk_poller_unregister(&g_hotplug_poller);
+ if (ctx->enabled) {
+ g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
+ }
+
+ g_nvme_hotplug_poll_period_us = ctx->period_us;
+ g_nvme_hotplug_enabled = ctx->enabled;
+ if (ctx->fn) {
+ ctx->fn(ctx->fn_ctx);
+ }
+
+ free(ctx);
+}
+
+int
+bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
+{
+ struct set_nvme_hotplug_ctx *ctx;
+
+ if (enabled == true && !spdk_process_is_primary()) {
+ return -EPERM;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ return -ENOMEM;
+ }
+
+ period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
+ ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
+ ctx->enabled = enabled;
+ ctx->fn = cb;
+ ctx->fn_ctx = cb_ctx;
+
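+ /* Apply the change on the module's init thread, where the hotplug poller runs. */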
+ spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
+ return 0;
+}
+
+static void
+populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
+{
+ if (ctx->cb_fn) {
+ ctx->cb_fn(ctx->cb_ctx, count, rc);
+ }
+
+ free(ctx);
+}
+
+static void
+nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct nvme_bdev_ns *ns;
+ struct nvme_bdev *nvme_bdev, *tmp;
+ uint32_t i, nsid;
+ size_t j;
+
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
+ assert(nvme_bdev_ctrlr != NULL);
+
+ /*
+ * Report the new bdevs that were created in this call.
+ * There can be more than one bdev per NVMe controller.
+ */
+ j = 0;
+ for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
+ nsid = i + 1;
+ ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
+ if (!ns->populated) {
+ continue;
+ }
+ assert(ns->id == nsid);
+ TAILQ_FOREACH_SAFE(nvme_bdev, &ns->bdevs, tailq, tmp) {
+ if (j < ctx->count) {
+ ctx->names[j] = nvme_bdev->disk.name;
+ j++;
+ } else {
+ SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n",
+ ctx->count);
+ populate_namespaces_cb(ctx, 0, -ERANGE);
+ return;
+ }
+ }
+ }
+
+ populate_namespaces_cb(ctx, j, 0);
+}
+
+static void
+connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
+{
+ struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct nvme_async_probe_ctx *ctx;
+ int rc;
+
+ ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts);
+
+ spdk_poller_unregister(&ctx->poller);
+
+ rc = create_ctrlr(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags);
+ if (rc) {
+ SPDK_ERRLOG("Failed to create new device\n");
+ populate_namespaces_cb(ctx, 0, rc);
+ return;
+ }
+
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid);
+ assert(nvme_bdev_ctrlr != NULL);
+
+ nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx);
+}
+
+static int
+bdev_nvme_async_poll(void *arg)
+{
+ struct nvme_async_probe_ctx *ctx = arg;
+ int rc;
+
+ rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
+ if (spdk_unlikely(rc != -EAGAIN && rc != 0)) {
+ spdk_poller_unregister(&ctx->poller);
+ free(ctx);
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+int
+bdev_nvme_create(struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_host_id *hostid,
+ const char *base_name,
+ const char **names,
+ uint32_t count,
+ const char *hostnqn,
+ uint32_t prchk_flags,
+ spdk_bdev_create_nvme_fn cb_fn,
+ void *cb_ctx)
+{
+ struct nvme_probe_skip_entry *entry, *tmp;
+ struct nvme_async_probe_ctx *ctx;
+
+ if (nvme_bdev_ctrlr_get(trid) != NULL) {
+ SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
+ return -EEXIST;
+ }
+
+ if (nvme_bdev_ctrlr_get_by_name(base_name)) {
+ SPDK_ERRLOG("A controller with the provided name (%s) already exists.\n", base_name);
+ return -EEXIST;
+ }
+
+ if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
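+ /* If this controller was previously deleted, drop it from the hotplug
+ * skip list so that it can be attached again.
+ */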
+ TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
+ if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
+ TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
+ free(entry);
+ break;
+ }
+ }
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ return -ENOMEM;
+ }
+ ctx->base_name = base_name;
+ ctx->names = names;
+ ctx->count = count;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_ctx = cb_ctx;
+ ctx->prchk_flags = prchk_flags;
+ ctx->trid = *trid;
+
+ spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts));
+ ctx->opts.transport_retry_count = g_opts.retry_count;
+
+ if (hostnqn) {
+ snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn);
+ }
+
+ if (hostid->hostaddr[0] != '\0') {
+ snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr);
+ }
+
+ if (hostid->hostsvcid[0] != '\0') {
+ snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid);
+ }
+
+ ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb);
+ if (ctx->probe_ctx == NULL) {
+ SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
+ free(ctx);
+ return -ENODEV;
+ }
+ ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
+
+ return 0;
+}
+
+int
+bdev_nvme_delete(const char *name)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL;
+ struct nvme_probe_skip_entry *entry;
+
+ if (name == NULL) {
+ return -EINVAL;
+ }
+
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
+ if (nvme_bdev_ctrlr == NULL) {
+ SPDK_ERRLOG("Failed to find NVMe controller\n");
+ return -ENODEV;
+ }
+
+ if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
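+ /* Remember the trid so the hotplug poller does not immediately re-attach
+ * the deleted device.
+ */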
+ entry = calloc(1, sizeof(*entry));
+ if (!entry) {
+ return -ENOMEM;
+ }
+ entry->trid = *nvme_bdev_ctrlr->trid;
+ TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
+ }
+
+ remove_cb(NULL, nvme_bdev_ctrlr->ctrlr);
+ return 0;
+}
+
+static int
+bdev_nvme_library_init(void)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct spdk_conf_section *sp;
+ const char *val;
+ int rc = 0;
+ int64_t intval = 0;
+ size_t i;
+ struct nvme_probe_ctx *probe_ctx = NULL;
+ int retry_count;
+ uint32_t local_nvme_num = 0;
+ int64_t hotplug_period;
+ bool hotplug_enabled = g_nvme_hotplug_enabled;
+
+ g_bdev_nvme_init_thread = spdk_get_thread();
+
+ spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb,
+ bdev_nvme_poll_group_destroy_cb,
+ sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups");
+
+ sp = spdk_conf_find_section(NULL, "Nvme");
+ if (sp == NULL) {
+ goto end;
+ }
+
+ probe_ctx = calloc(1, sizeof(*probe_ctx));
+ if (probe_ctx == NULL) {
+ SPDK_ERRLOG("Failed to allocate probe_ctx\n");
+ rc = -1;
+ goto end;
+ }
+
+ retry_count = spdk_conf_section_get_intval(sp, "RetryCount");
+ if (retry_count >= 0) {
+ g_opts.retry_count = retry_count;
+ }
+
+ val = spdk_conf_section_get_val(sp, "TimeoutUsec");
+ if (val != NULL) {
+ intval = spdk_strtoll(val, 10);
+ if (intval < 0) {
+ SPDK_ERRLOG("Invalid TimeoutUsec value\n");
+ rc = -1;
+ goto end;
+ }
+ }
+
+ g_opts.timeout_us = intval;
+
+ if (g_opts.timeout_us > 0) {
+ val = spdk_conf_section_get_val(sp, "ActionOnTimeout");
+ if (val != NULL) {
+ if (!strcasecmp(val, "Reset")) {
+ g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET;
+ } else if (!strcasecmp(val, "Abort")) {
+ g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT;
+ }
+ }
+ }
+
+ intval = spdk_conf_section_get_intval(sp, "AdminPollRate");
+ if (intval > 0) {
+ g_opts.nvme_adminq_poll_period_us = intval;
+ }
+
+ intval = spdk_conf_section_get_intval(sp, "IOPollRate");
+ if (intval > 0) {
+ g_opts.nvme_ioq_poll_period_us = intval;
+ }
+
+ if (spdk_process_is_primary()) {
+ hotplug_enabled = spdk_conf_section_get_boolval(sp, "HotplugEnable", false);
+ }
+
+ hotplug_period = spdk_conf_section_get_intval(sp, "HotplugPollRate");
+ if (hotplug_period < 0) {
+ hotplug_period = 0;
+ }
+
+ g_nvme_hostnqn = spdk_conf_section_get_val(sp, "HostNQN");
+ probe_ctx->hostnqn = g_nvme_hostnqn;
+
+ g_opts.delay_cmd_submit = spdk_conf_section_get_boolval(sp, "DelayCmdSubmit",
+ SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT);
+
+ for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
+ val = spdk_conf_section_get_nmval(sp, "TransportID", i, 0);
+ if (val == NULL) {
+ break;
+ }
+
+ rc = spdk_nvme_transport_id_parse(&probe_ctx->trids[i], val);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to parse TransportID: %s\n", val);
+ rc = -1;
+ goto end;
+ }
+
+ rc = spdk_nvme_host_id_parse(&probe_ctx->hostids[i], val);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to parse HostID: %s\n", val);
+ rc = -1;
+ goto end;
+ }
+
+ val = spdk_conf_section_get_nmval(sp, "TransportID", i, 1);
+ if (val == NULL) {
+ SPDK_ERRLOG("No name provided for TransportID\n");
+ rc = -1;
+ goto end;
+ }
+
+ probe_ctx->names[i] = val;
+
+ val = spdk_conf_section_get_nmval(sp, "TransportID", i, 2);
+ if (val != NULL) {
+ rc = spdk_nvme_prchk_flags_parse(&probe_ctx->prchk_flags[i], val);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to parse prchk: %s\n", val);
+ rc = -1;
+ goto end;
+ }
+ }
+
+ probe_ctx->count++;
+
+ if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct spdk_nvme_ctrlr_opts opts;
+
+ if (nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
+ SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n",
+ probe_ctx->trids[i].traddr);
+ rc = -1;
+ goto end;
+ }
+
+ if (probe_ctx->trids[i].subnqn[0] == '\0') {
+ SPDK_ERRLOG("Need to provide subsystem nqn\n");
+ rc = -1;
+ goto end;
+ }
+
+ spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
+ opts.transport_retry_count = g_opts.retry_count;
+
+ if (probe_ctx->hostnqn != NULL) {
+ snprintf(opts.hostnqn, sizeof(opts.hostnqn), "%s", probe_ctx->hostnqn);
+ }
+
+ if (probe_ctx->hostids[i].hostaddr[0] != '\0') {
+ snprintf(opts.src_addr, sizeof(opts.src_addr), "%s", probe_ctx->hostids[i].hostaddr);
+ }
+
+ if (probe_ctx->hostids[i].hostsvcid[0] != '\0') {
+ snprintf(opts.src_svcid, sizeof(opts.src_svcid), "%s", probe_ctx->hostids[i].hostsvcid);
+ }
+
+ ctrlr = spdk_nvme_connect(&probe_ctx->trids[i], &opts, sizeof(opts));
+ if (ctrlr == NULL) {
+ SPDK_ERRLOG("Unable to connect to provided trid (traddr: %s)\n",
+ probe_ctx->trids[i].traddr);
+ rc = -1;
+ goto end;
+ }
+
+ rc = create_ctrlr(ctrlr, probe_ctx->names[i], &probe_ctx->trids[i], 0);
+ if (rc) {
+ goto end;
+ }
+
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&probe_ctx->trids[i]);
+ if (!nvme_bdev_ctrlr) {
+ SPDK_ERRLOG("Failed to find new NVMe controller\n");
+ rc = -ENODEV;
+ goto end;
+ }
+
+ nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
+ } else {
+ local_nvme_num++;
+ }
+ }
+
+ if (local_nvme_num > 0) {
+ /* Probe for locally attached (PCIe) NVMe devices. */
+ if (spdk_nvme_probe(NULL, probe_ctx, probe_cb, attach_cb, remove_cb)) {
+ rc = -1;
+ goto end;
+ }
+
+ for (i = 0; i < probe_ctx->count; i++) {
+ if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ continue;
+ }
+
+ if (!nvme_bdev_ctrlr_get(&probe_ctx->trids[i])) {
+ SPDK_ERRLOG("NVMe SSD \"%s\" could not be found.\n", probe_ctx->trids[i].traddr);
+ SPDK_ERRLOG("Check PCIe BDF and that it is attached to UIO/VFIO driver.\n");
+ }
+ }
+ }
+
+ rc = bdev_nvme_set_hotplug(hotplug_enabled, hotplug_period, NULL, NULL);
+ if (rc) {
+ SPDK_ERRLOG("Failed to setup hotplug (%d): %s", rc, spdk_strerror(rc));
+ rc = -1;
+ }
+end:
+ free(probe_ctx);
+ return rc;
+}
+
+static void
+bdev_nvme_library_fini(void)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp;
+ struct nvme_probe_skip_entry *entry, *entry_tmp;
+ struct nvme_bdev_ns *ns;
+ uint32_t i;
+
+ spdk_poller_unregister(&g_hotplug_poller);
+ free(g_hotplug_probe_ctx);
+
+ TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
+ TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
+ free(entry);
+ }
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) {
+ if (nvme_bdev_ctrlr->destruct) {
+ /* This controller's destruction was already started
+ * before the application started shutting down
+ */
+ continue;
+ }
+
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+
+ for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
+ uint32_t nsid = i + 1;
+
+ ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
+ if (ns->populated) {
+ assert(ns->id == nsid);
+ nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, ns);
+ }
+ }
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ nvme_bdev_ctrlr->destruct = true;
+
+ if (nvme_bdev_ctrlr->ref == 0) {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr);
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ }
+ }
+
+ g_bdev_nvme_module_finish = true;
+ if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
+ spdk_bdev_module_finish_done();
+ return;
+ }
+
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+}
+
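+/* When the controller reports a protection-information error, re-verify the
+ * data in software (DIF for interleaved metadata, DIX for separate metadata)
+ * to locate and log the offending block.
+ */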
+static void
+bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_dif_ctx dif_ctx;
+ struct spdk_dif_error err_blk = {};
+ int rc;
+
+ rc = spdk_dif_ctx_init(&dif_ctx,
+ bdev->blocklen, bdev->md_len, bdev->md_interleave,
+ bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
+ bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0);
+ if (rc != 0) {
+ SPDK_ERRLOG("Initialization of DIF context failed\n");
+ return;
+ }
+
+ if (bdev->md_interleave) {
+ rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
+ } else {
+ struct iovec md_iov = {
+ .iov_base = bdev_io->u.bdev.md_buf,
+ .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
+ };
+
+ rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
+ }
+
+ if (rc != 0) {
+ SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
+ err_blk.err_type, err_blk.err_offset);
+ } else {
+ SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
+ }
+}
+
+static void
+bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_bdev_io *bio = ref;
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+
+ if (spdk_nvme_cpl_is_success(cpl)) {
+ /* Run PI verification for read data buffer. */
+ bdev_nvme_verify_pi_error(bdev_io);
+ }
+
+ /* Return original completion status */
+ spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct,
+ bio->cpl.status.sc);
+}
+
+static void
+bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_bdev_io *bio = ref;
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+ int ret;
+
+ if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
+ SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
+ cpl->status.sct, cpl->status.sc);
+
+ /* Save completion status to use after verifying PI error. */
+ bio->cpl = *cpl;
+
+ /* Read without PI checking to verify PI error. */
+ ret = bdev_nvme_no_pi_readv((struct nvme_bdev *)bdev_io->bdev->ctxt,
+ spdk_bdev_io_get_io_channel(bdev_io),
+ bio,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->u.bdev.offset_blocks);
+ if (ret == 0) {
+ return;
+ }
+ }
+
+ spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
+}
+
+static void
+bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
+
+ if (spdk_nvme_cpl_is_pi_error(cpl)) {
+ SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
+ cpl->status.sct, cpl->status.sc);
+ /* Run PI verification for write data buffer if PI error is detected. */
+ bdev_nvme_verify_pi_error(bdev_io);
+ }
+
+ spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
+}
+
+static void
+bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
+
+ if (spdk_nvme_cpl_is_pi_error(cpl)) {
+ SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
+ cpl->status.sct, cpl->status.sc);
+ /* Run PI verification for compare data buffer if PI error is detected. */
+ bdev_nvme_verify_pi_error(bdev_io);
+ }
+
+ spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
+}
+
+static void
+bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_bdev_io *bio = ref;
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+
+ /* Compare operation completion */
+ if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) {
+ /* Save compare result for write callback */
+ bio->cpl = *cpl;
+ return;
+ }
+
+ /* Write operation completion */
+ if (spdk_nvme_cpl_is_error(&bio->cpl)) {
+ /* If bio->cpl is already an error, it means the compare operation failed. In that case,
+ * complete the IO with the compare operation's status.
+ */
+ if (!spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("Unexpected write success after compare failure.\n");
+ }
+
+ spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
+ } else {
+ spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
+ }
+}
+
+static void
+bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref);
+
+ spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
+}
+
+static void
+bdev_nvme_admin_passthru_completion(void *ctx)
+{
+ struct nvme_bdev_io *bio = ctx;
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+
+ spdk_bdev_io_complete_nvme_status(bdev_io,
+ bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc);
+}
+
+static void
+bdev_nvme_abort_completion(void *ctx)
+{
+ struct nvme_bdev_io *bio = ctx;
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+
+ if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_bdev_io *bio = ref;
+
+ bio->cpl = *cpl;
+ spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
+}
+
+static void
+bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_bdev_io *bio = ref;
+
+ bio->cpl = *cpl;
+ spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio);
+}
+
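+/* SGL callbacks for queued commands: reset_sgl positions the iovec cursor at
+ * sgl_offset; next_sge returns the current element and advances the cursor.
+ */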
+static void
+bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
+{
+ struct nvme_bdev_io *bio = ref;
+ struct iovec *iov;
+
+ bio->iov_offset = sgl_offset;
+ for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
+ iov = &bio->iovs[bio->iovpos];
+ if (bio->iov_offset < iov->iov_len) {
+ break;
+ }
+
+ bio->iov_offset -= iov->iov_len;
+ }
+}
+
+static int
+bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
+{
+ struct nvme_bdev_io *bio = ref;
+ struct iovec *iov;
+
+ assert(bio->iovpos < bio->iovcnt);
+
+ iov = &bio->iovs[bio->iovpos];
+
+ *address = iov->iov_base;
+ *length = iov->iov_len;
+
+ if (bio->iov_offset) {
+ assert(bio->iov_offset <= iov->iov_len);
+ *address += bio->iov_offset;
+ *length -= bio->iov_offset;
+ }
+
+ bio->iov_offset += *length;
+ if (bio->iov_offset == iov->iov_len) {
+ bio->iovpos++;
+ bio->iov_offset = 0;
+ }
+
+ return 0;
+}
+
+static void
+bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
+{
+ struct nvme_bdev_io *bio = ref;
+ struct iovec *iov;
+
+ bio->fused_iov_offset = sgl_offset;
+ for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
+ iov = &bio->fused_iovs[bio->fused_iovpos];
+ if (bio->fused_iov_offset < iov->iov_len) {
+ break;
+ }
+
+ bio->fused_iov_offset -= iov->iov_len;
+ }
+}
+
+static int
+bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
+{
+ struct nvme_bdev_io *bio = ref;
+ struct iovec *iov;
+
+ assert(bio->fused_iovpos < bio->fused_iovcnt);
+
+ iov = &bio->fused_iovs[bio->fused_iovpos];
+
+ *address = iov->iov_base;
+ *length = iov->iov_len;
+
+ if (bio->fused_iov_offset) {
+ assert(bio->fused_iov_offset <= iov->iov_len);
+ *address += bio->fused_iov_offset;
+ *length -= bio->fused_iov_offset;
+ }
+
+ bio->fused_iov_offset += *length;
+ if (bio->fused_iov_offset == iov->iov_len) {
+ bio->fused_iovpos++;
+ bio->fused_iov_offset = 0;
+ }
+
+ return 0;
+}
+
+static int
+bdev_nvme_no_pi_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
+ void *md, uint64_t lba_count, uint64_t lba)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %lu blocks with offset %#lx without PI check\n",
+ lba_count, lba);
+
+ bio->iovs = iov;
+ bio->iovcnt = iovcnt;
+ bio->iovpos = 0;
+ bio->iov_offset = 0;
+
+ rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
+ bdev_nvme_no_pi_readv_done, bio, 0,
+ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
+ md, 0, 0);
+
+ if (rc != 0 && rc != -ENOMEM) {
+ SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
+ }
+ return rc;
+}
+
+static int
+bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
+ void *md, uint64_t lba_count, uint64_t lba)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %lu blocks with offset %#lx\n",
+ lba_count, lba);
+
+ bio->iovs = iov;
+ bio->iovcnt = iovcnt;
+ bio->iovpos = 0;
+ bio->iov_offset = 0;
+
+ rc = spdk_nvme_ns_cmd_readv_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
+ bdev_nvme_readv_done, bio, nbdev->disk.dif_check_flags,
+ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
+ md, 0, 0);
+
+ if (rc != 0 && rc != -ENOMEM) {
+ SPDK_ERRLOG("readv failed: rc = %d\n", rc);
+ }
+ return rc;
+}
+
+static int
+bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "write %lu blocks with offset %#lx\n",
+ lba_count, lba);
+
+ bio->iovs = iov;
+ bio->iovcnt = iovcnt;
+ bio->iovpos = 0;
+ bio->iov_offset = 0;
+
+ rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
+ bdev_nvme_writev_done, bio, nbdev->disk.dif_check_flags,
+ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
+ md, 0, 0);
+
+ if (rc != 0 && rc != -ENOMEM) {
+ SPDK_ERRLOG("writev failed: rc = %d\n", rc);
+ }
+ return rc;
+}
+
+static int
+bdev_nvme_comparev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare %lu blocks with offset %#lx\n",
+ lba_count, lba);
+
+ bio->iovs = iov;
+ bio->iovcnt = iovcnt;
+ bio->iovpos = 0;
+ bio->iov_offset = 0;
+
+ rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
+ bdev_nvme_comparev_done, bio, nbdev->disk.dif_check_flags,
+ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
+ md, 0, 0);
+
+ if (rc != 0 && rc != -ENOMEM) {
+ SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
+ }
+ return rc;
+}
+
+static int
+bdev_nvme_comparev_and_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
+ int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+ uint32_t flags = nbdev->disk.dif_check_flags;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "compare and write %lu blocks with offset %#lx\n",
+ lba_count, lba);
+
+ bio->iovs = cmp_iov;
+ bio->iovcnt = cmp_iovcnt;
+ bio->iovpos = 0;
+ bio->iov_offset = 0;
+ bio->fused_iovs = write_iov;
+ bio->fused_iovcnt = write_iovcnt;
+ bio->fused_iovpos = 0;
+ bio->fused_iov_offset = 0;
+
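+ /* On the first submission (not a bdev-layer retry), clear the fused-state
+ * flag so the compare half of the fused command is issued below.
+ */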
+ if (bdev_io->num_retries == 0) {
+ bio->first_fused_submitted = false;
+ }
+
+ if (!bio->first_fused_submitted) {
+ flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
+ memset(&bio->cpl, 0, sizeof(bio->cpl));
+
+ rc = spdk_nvme_ns_cmd_comparev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
+ bdev_nvme_comparev_and_writev_done, bio, flags,
+ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
+ if (rc == 0) {
+ bio->first_fused_submitted = true;
+ flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
+ } else {
+ if (rc != -ENOMEM) {
+ SPDK_ERRLOG("compare failed: rc = %d\n", rc);
+ }
+ return rc;
+ }
+ }
+
+ flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
+
+ rc = spdk_nvme_ns_cmd_writev_with_md(nbdev->nvme_ns->ns, nvme_ch->qpair, lba, lba_count,
+ bdev_nvme_comparev_and_writev_done, bio, flags,
+ bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
+ if (rc != 0 && rc != -ENOMEM) {
+ SPDK_ERRLOG("write failed: rc = %d\n", rc);
+ rc = 0;
+ }
+
+ return rc;
+}
+
+static int
+bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ uint64_t offset_blocks,
+ uint64_t num_blocks)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
+ struct spdk_nvme_dsm_range *range;
+ uint64_t offset, remaining;
+ uint64_t num_ranges_u64;
+ uint16_t num_ranges;
+ int rc;
+
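+ /* Ceiling division: the number of max-size DSM ranges needed to cover
+ * num_blocks; the request is rejected if it exceeds the per-command limit.
+ */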
+ num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
+ SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
+ if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
+ SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
+ return -EINVAL;
+ }
+ num_ranges = (uint16_t)num_ranges_u64;
+
+ offset = offset_blocks;
+ remaining = num_blocks;
+ range = &dsm_ranges[0];
+
+ /* Fill max-size ranges until the remaining blocks fit into one range */
+ while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
+ range->attributes.raw = 0;
+ range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
+ range->starting_lba = offset;
+
+ offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
+ remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
+ range++;
+ }
+
+ /* Final range describes the remaining blocks */
+ range->attributes.raw = 0;
+ range->length = remaining;
+ range->starting_lba = offset;
+
+ rc = spdk_nvme_ns_cmd_dataset_management(nbdev->nvme_ns->ns, nvme_ch->qpair,
+ SPDK_NVME_DSM_ATTR_DEALLOCATE,
+ dsm_ranges, num_ranges,
+ bdev_nvme_queued_done, bio);
+
+ return rc;
+}
+
+static int
+bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
+{
+ uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
+
+ if (nbytes > max_xfer_size) {
+ SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
+ return -EINVAL;
+ }
+
+ bio->orig_thread = spdk_io_channel_get_thread(ch);
+
+ return spdk_nvme_ctrlr_cmd_admin_raw(nbdev->nvme_bdev_ctrlr->ctrlr, cmd, buf,
+ (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio);
+}
+
+static int
+bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
+
+ if (nbytes > max_xfer_size) {
+ SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
+ return -EINVAL;
+ }
+
+ /*
+ * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
+ * so fill it out automatically.
+ */
+ cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns);
+
+ return spdk_nvme_ctrlr_cmd_io_raw(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
+ (uint32_t)nbytes, bdev_nvme_queued_done, bio);
+}
+
+static int
+bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio,
+ struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nbdev->nvme_ns->ns);
+ uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_bdev_ctrlr->ctrlr);
+
+ if (nbytes > max_xfer_size) {
+ SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
+ return -EINVAL;
+ }
+
+ if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nbdev->nvme_ns->ns)) {
+		SPDK_ERRLOG("invalid metadata buffer size\n");
+ return -EINVAL;
+ }
+
+ /*
+	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require an nsid,
+ * so fill it out automatically.
+ */
+ cmd->nsid = spdk_nvme_ns_get_id(nbdev->nvme_ns->ns);
+
+ return spdk_nvme_ctrlr_cmd_io_raw_with_md(nbdev->nvme_bdev_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf,
+ (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
+}
+
+static void
+bdev_nvme_abort_admin_cmd(void *ctx)
+{
+ struct nvme_bdev_io *bio = ctx;
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
+ struct nvme_bdev *nbdev;
+ struct nvme_bdev_io *bio_to_abort;
+ int rc;
+
+ nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
+ bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
+
+ rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_bdev_ctrlr->ctrlr,
+ NULL,
+ bio_to_abort,
+ bdev_nvme_abort_done, bio);
+ if (rc == -ENOENT) {
+ /* If no admin command was found in admin qpair, complete the abort
+ * request with failure.
+ */
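+		/* Per the NVMe spec, bit 0 of the Abort completion's CDW0 set to 1
+		 * indicates that the command was not aborted.
+		 */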
+ bio->cpl.cdw0 |= 1U;
+ bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS;
+ bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+
+ spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio);
+ }
+}
+
+static int
+bdev_nvme_abort(struct nvme_bdev *nbdev, struct spdk_io_channel *ch,
+ struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort)
+{
+ struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ bio->orig_thread = spdk_io_channel_get_thread(ch);
+
+ rc = spdk_nvme_ctrlr_cmd_abort_ext(nbdev->nvme_bdev_ctrlr->ctrlr,
+ nvme_ch->qpair,
+ bio_to_abort,
+ bdev_nvme_abort_done, bio);
+ if (rc == -ENOENT) {
+		/* If no command was found in the I/O qpair, the target command may be
+		 * an admin command. Admin commands are aborted on the controller's
+		 * thread so that only a single thread drives the abort flow.
+ */
+ spdk_thread_send_msg(nbdev->nvme_bdev_ctrlr->thread,
+ bdev_nvme_abort_admin_cmd, bio);
+ rc = 0;
+ }
+
+ return rc;
+}
+
+static void
+bdev_nvme_get_spdk_running_config(FILE *fp)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+
+ fprintf(fp, "\n[Nvme]");
+ fprintf(fp, "\n"
+ "# NVMe Device Whitelist\n"
+ "# Users may specify which NVMe devices to claim by their transport id.\n"
+ "# See spdk_nvme_transport_id_parse() in spdk/nvme.h for the correct format.\n"
+ "# The second argument is the assigned name, which can be referenced from\n"
+ "# other sections in the configuration file. For NVMe devices, a namespace\n"
+ "# is automatically appended to each name in the format <YourName>nY, where\n"
+ "# Y is the NSID (starts at 1).\n");
+
+ TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
+ const char *trtype;
+ const char *prchk_flags;
+
+ trtype = spdk_nvme_transport_id_trtype_str(nvme_bdev_ctrlr->trid->trtype);
+ if (!trtype) {
+ continue;
+ }
+
+ if (nvme_bdev_ctrlr->trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n",
+ trtype,
+ nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->name);
+ } else {
+ const char *adrfam;
+
+ adrfam = spdk_nvme_transport_id_adrfam_str(nvme_bdev_ctrlr->trid->adrfam);
+ prchk_flags = spdk_nvme_prchk_flags_str(nvme_bdev_ctrlr->prchk_flags);
+
+ if (adrfam) {
+ fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
+ trtype, adrfam,
+ nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->trid->trsvcid,
+ nvme_bdev_ctrlr->trid->subnqn, nvme_bdev_ctrlr->name);
+ } else {
+ fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s",
+ trtype,
+ nvme_bdev_ctrlr->trid->traddr, nvme_bdev_ctrlr->trid->trsvcid,
+ nvme_bdev_ctrlr->trid->subnqn, nvme_bdev_ctrlr->name);
+ }
+
+ if (prchk_flags) {
+ fprintf(fp, " \"%s\"\n", prchk_flags);
+ } else {
+ fprintf(fp, "\n");
+ }
+ }
+ }
+
+ fprintf(fp, "\n"
+ "# The number of attempts per I/O when an I/O fails. Do not include\n"
+ "# this key to get the default behavior.\n");
+ fprintf(fp, "RetryCount %d\n", g_opts.retry_count);
+ fprintf(fp, "\n"
+ "# Timeout for each command, in microseconds. If 0, don't track timeouts.\n");
+ fprintf(fp, "TimeoutUsec %"PRIu64"\n", g_opts.timeout_us);
+
+ fprintf(fp, "\n"
+ "# Action to take on command time out. Only valid when Timeout is greater\n"
+ "# than 0. This may be 'Reset' to reset the controller, 'Abort' to abort\n"
+ "# the command, or 'None' to just print a message but do nothing.\n"
+ "# Admin command timeouts will always result in a reset.\n");
+ switch (g_opts.action_on_timeout) {
+ case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
+ fprintf(fp, "ActionOnTimeout None\n");
+ break;
+ case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
+ fprintf(fp, "ActionOnTimeout Reset\n");
+ break;
+ case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
+ fprintf(fp, "ActionOnTimeout Abort\n");
+ break;
+ }
+
+ fprintf(fp, "\n"
+ "# Set how often the admin queue is polled for asynchronous events.\n"
+ "# Units in microseconds.\n");
+ fprintf(fp, "AdminPollRate %"PRIu64"\n", g_opts.nvme_adminq_poll_period_us);
+ fprintf(fp, "IOPollRate %" PRIu64"\n", g_opts.nvme_ioq_poll_period_us);
+ fprintf(fp, "\n"
+		"# Handling of hotplug (runtime insert and remove) events is disabled\n"
+		"# by default. Set this to Yes to enable it.\n"
+		"# Default: No\n");
+ fprintf(fp, "HotplugEnable %s\n", g_nvme_hotplug_enabled ? "Yes" : "No");
+ fprintf(fp, "\n"
+		"# Set how often the hotplug is processed for insert and remove events.\n"
+ "# Units in microseconds.\n");
+ fprintf(fp, "HotplugPollRate %"PRIu64"\n", g_nvme_hotplug_poll_period_us);
+ if (g_nvme_hostnqn) {
+ fprintf(fp, "HostNQN %s\n", g_nvme_hostnqn);
+ }
+ fprintf(fp, "DelayCmdSubmit %s\n", g_opts.delay_cmd_submit ? "True" : "False");
+
+ fprintf(fp, "\n");
+}
+
+static void
+nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
+{
+ /* nop */
+}
+
+static void
+nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
+{
+ g_config_json_namespace_fn[ns->type](w, ns);
+}
+
+static int
+bdev_nvme_config_json(struct spdk_json_write_ctx *w)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct spdk_nvme_transport_id *trid;
+ const char *action;
+ uint32_t nsid;
+
+ if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
+ action = "reset";
+ } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
+ action = "abort";
+ } else {
+ action = "none";
+ }
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "action_on_timeout", action);
+ spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
+ spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count);
+ spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
+ spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
+ spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
+ spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
+ spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
+ spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
+ spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
+ spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
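+
+	/* The object written above takes the form (values illustrative):
+	 *   { "method": "bdev_nvme_set_options",
+	 *     "params": { "action_on_timeout": "none", "timeout_us": 0, ... } }
+	 */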
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
+ trid = nvme_bdev_ctrlr->trid;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
+ nvme_bdev_dump_trid_json(trid, w);
+ spdk_json_write_named_bool(w, "prchk_reftag",
+ (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
+ spdk_json_write_named_bool(w, "prchk_guard",
+ (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+
+ for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
+ if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) {
+ continue;
+ }
+
+ nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]);
+ }
+ }
+
+	/* Dump this object last so that all NVMe bdevs have a chance to be
+	 * constructed before the hotplug poller is enabled.
+ */
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
+ spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ return 0;
+}
+
+struct spdk_nvme_ctrlr *
+bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
+{
+ if (!bdev || bdev->module != &nvme_if) {
+ return NULL;
+ }
+
+ return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_bdev_ctrlr->ctrlr;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_nvme", SPDK_LOG_BDEV_NVME)
diff --git a/src/spdk/module/bdev/nvme/bdev_nvme.h b/src/spdk/module/bdev/nvme/bdev_nvme.h
new file mode 100644
index 000000000..417c21cad
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/bdev_nvme.h
@@ -0,0 +1,90 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_NVME_H
+#define SPDK_BDEV_NVME_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/queue.h"
+#include "spdk/nvme.h"
+#include "spdk/bdev_module.h"
+
+#include "common.h"
+
+enum spdk_bdev_timeout_action {
+ SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0,
+ SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET,
+ SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT,
+};
+
+struct spdk_bdev_nvme_opts {
+ enum spdk_bdev_timeout_action action_on_timeout;
+ uint64_t timeout_us;
+ uint32_t retry_count;
+ uint32_t arbitration_burst;
+ uint32_t low_priority_weight;
+ uint32_t medium_priority_weight;
+ uint32_t high_priority_weight;
+ uint64_t nvme_adminq_poll_period_us;
+ uint64_t nvme_ioq_poll_period_us;
+ uint32_t io_queue_requests;
+ bool delay_cmd_submit;
+};
+
+struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch);
+void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts);
+int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts);
+int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx);
+
+int bdev_nvme_create(struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_host_id *hostid,
+ const char *base_name,
+ const char **names,
+ uint32_t count,
+ const char *hostnqn,
+ uint32_t prchk_flags,
+ spdk_bdev_create_nvme_fn cb_fn,
+ void *cb_ctx);
+struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev);
+
+/**
+ * Delete an NVMe controller and all bdevs on top of it.
+ * The controller to delete is identified by name.
+ *
+ * \param name NVMe controller name
+ * \return zero on success, -EINVAL on invalid parameters, or -ENODEV if the controller is not found
+ */
+int bdev_nvme_delete(const char *name);
+
+#endif /* SPDK_BDEV_NVME_H */
diff --git a/src/spdk/module/bdev/nvme/bdev_nvme_cuse_rpc.c b/src/spdk/module/bdev/nvme/bdev_nvme_cuse_rpc.c
new file mode 100644
index 000000000..c116c510d
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/bdev_nvme_cuse_rpc.c
@@ -0,0 +1,152 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "bdev_nvme.h"
+
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/nvme.h"
+
+#include "spdk_internal/log.h"
+
+struct rpc_nvme_cuse_register {
+ char *name;
+};
+
+static void
+free_rpc_nvme_cuse_register(struct rpc_nvme_cuse_register *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_nvme_cuse_register_decoders[] = {
+ {"name", offsetof(struct rpc_nvme_cuse_register, name), spdk_json_decode_string},
+};
+
+static void
+rpc_nvme_cuse_register(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_nvme_cuse_register req = {};
+ struct spdk_json_write_ctx *w;
+ struct nvme_bdev_ctrlr *bdev_ctrlr = NULL;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_nvme_cuse_register_decoders,
+ SPDK_COUNTOF(rpc_nvme_cuse_register_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(req.name);
+ if (!bdev_ctrlr) {
+ SPDK_ERRLOG("No such controller\n");
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ rc = spdk_nvme_cuse_register(bdev_ctrlr->ctrlr);
+ if (rc) {
+ SPDK_ERRLOG("Failed to register CUSE devices: %s\n", spdk_strerror(-rc));
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_nvme_cuse_register(&req);
+}
+SPDK_RPC_REGISTER("bdev_nvme_cuse_register", rpc_nvme_cuse_register, SPDK_RPC_RUNTIME)
+
+struct rpc_nvme_cuse_unregister {
+ char *name;
+};
+
+static void
+free_rpc_nvme_cuse_unregister(struct rpc_nvme_cuse_unregister *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_nvme_cuse_unregister_decoders[] = {
+ {"name", offsetof(struct rpc_nvme_cuse_unregister, name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_nvme_cuse_unregister(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_nvme_cuse_unregister req = {};
+ struct spdk_json_write_ctx *w;
+ struct nvme_bdev_ctrlr *bdev_ctrlr = NULL;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_nvme_cuse_unregister_decoders,
+ SPDK_COUNTOF(rpc_nvme_cuse_unregister_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(req.name);
+ if (!bdev_ctrlr) {
+ SPDK_ERRLOG("No such controller\n");
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ rc = spdk_nvme_cuse_unregister(bdev_ctrlr->ctrlr);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_nvme_cuse_unregister(&req);
+}
+SPDK_RPC_REGISTER("bdev_nvme_cuse_unregister", rpc_nvme_cuse_unregister, SPDK_RPC_RUNTIME)
diff --git a/src/spdk/module/bdev/nvme/bdev_nvme_rpc.c b/src/spdk/module/bdev/nvme/bdev_nvme_rpc.c
new file mode 100644
index 000000000..299da4023
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/bdev_nvme_rpc.c
@@ -0,0 +1,842 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "bdev_nvme.h"
+#include "common.h"
+
+#include "spdk/config.h"
+
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+#include "spdk/bdev_module.h"
+
+struct open_descriptors {
+ void *desc;
+ struct spdk_bdev *bdev;
+ TAILQ_ENTRY(open_descriptors) tqlst;
+ struct spdk_thread *thread;
+};
+typedef TAILQ_HEAD(, open_descriptors) open_descriptors_t;
+
+static int
+rpc_decode_action_on_timeout(const struct spdk_json_val *val, void *out)
+{
+ enum spdk_bdev_timeout_action *action = out;
+
+ if (spdk_json_strequal(val, "none") == true) {
+ *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE;
+ } else if (spdk_json_strequal(val, "abort") == true) {
+ *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT;
+ } else if (spdk_json_strequal(val, "reset") == true) {
+ *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET;
+ } else {
+ SPDK_NOTICELOG("Invalid parameter value: action_on_timeout\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] = {
+ {"action_on_timeout", offsetof(struct spdk_bdev_nvme_opts, action_on_timeout), rpc_decode_action_on_timeout, true},
+ {"timeout_us", offsetof(struct spdk_bdev_nvme_opts, timeout_us), spdk_json_decode_uint64, true},
+ {"retry_count", offsetof(struct spdk_bdev_nvme_opts, retry_count), spdk_json_decode_uint32, true},
+ {"arbitration_burst", offsetof(struct spdk_bdev_nvme_opts, arbitration_burst), spdk_json_decode_uint32, true},
+ {"low_priority_weight", offsetof(struct spdk_bdev_nvme_opts, low_priority_weight), spdk_json_decode_uint32, true},
+ {"medium_priority_weight", offsetof(struct spdk_bdev_nvme_opts, medium_priority_weight), spdk_json_decode_uint32, true},
+ {"high_priority_weight", offsetof(struct spdk_bdev_nvme_opts, high_priority_weight), spdk_json_decode_uint32, true},
+ {"nvme_adminq_poll_period_us", offsetof(struct spdk_bdev_nvme_opts, nvme_adminq_poll_period_us), spdk_json_decode_uint64, true},
+ {"nvme_ioq_poll_period_us", offsetof(struct spdk_bdev_nvme_opts, nvme_ioq_poll_period_us), spdk_json_decode_uint64, true},
+ {"io_queue_requests", offsetof(struct spdk_bdev_nvme_opts, io_queue_requests), spdk_json_decode_uint32, true},
+ {"delay_cmd_submit", offsetof(struct spdk_bdev_nvme_opts, delay_cmd_submit), spdk_json_decode_bool, true},
+};
+
+static void
+rpc_bdev_nvme_set_options(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_bdev_nvme_opts opts;
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ bdev_nvme_get_opts(&opts);
+ if (params && spdk_json_decode_object(params, rpc_bdev_nvme_options_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_options_decoders),
+ &opts)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ return;
+ }
+
+ rc = bdev_nvme_set_opts(&opts);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("bdev_nvme_set_options", rpc_bdev_nvme_set_options,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_set_options, set_bdev_nvme_options)
+
+struct rpc_bdev_nvme_hotplug {
+ bool enabled;
+ uint64_t period_us;
+};
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_hotplug_decoders[] = {
+ {"enable", offsetof(struct rpc_bdev_nvme_hotplug, enabled), spdk_json_decode_bool, false},
+ {"period_us", offsetof(struct rpc_bdev_nvme_hotplug, period_us), spdk_json_decode_uint64, true},
+};
+
+static void
+rpc_bdev_nvme_set_hotplug_done(void *ctx)
+{
+ struct spdk_jsonrpc_request *request = ctx;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_nvme_set_hotplug(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_nvme_hotplug req = {false, 0};
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_nvme_hotplug_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_hotplug_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ rc = bdev_nvme_set_hotplug(req.enabled, req.period_us, rpc_bdev_nvme_set_hotplug_done,
+ request);
+ if (rc) {
+ goto invalid;
+ }
+
+ return;
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("bdev_nvme_set_hotplug", rpc_bdev_nvme_set_hotplug, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_set_hotplug, set_bdev_nvme_hotplug)
+
+struct rpc_bdev_nvme_attach_controller {
+ char *name;
+ char *trtype;
+ char *adrfam;
+ char *traddr;
+ char *trsvcid;
+ char *priority;
+ char *subnqn;
+ char *hostnqn;
+ char *hostaddr;
+ char *hostsvcid;
+ bool prchk_reftag;
+ bool prchk_guard;
+};
+
+static void
+free_rpc_bdev_nvme_attach_controller(struct rpc_bdev_nvme_attach_controller *req)
+{
+ free(req->name);
+ free(req->trtype);
+ free(req->adrfam);
+ free(req->traddr);
+ free(req->trsvcid);
+ free(req->priority);
+ free(req->subnqn);
+ free(req->hostnqn);
+ free(req->hostaddr);
+ free(req->hostsvcid);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_attach_controller_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_nvme_attach_controller, name), spdk_json_decode_string},
+ {"trtype", offsetof(struct rpc_bdev_nvme_attach_controller, trtype), spdk_json_decode_string},
+ {"traddr", offsetof(struct rpc_bdev_nvme_attach_controller, traddr), spdk_json_decode_string},
+
+ {"adrfam", offsetof(struct rpc_bdev_nvme_attach_controller, adrfam), spdk_json_decode_string, true},
+ {"trsvcid", offsetof(struct rpc_bdev_nvme_attach_controller, trsvcid), spdk_json_decode_string, true},
+ {"priority", offsetof(struct rpc_bdev_nvme_attach_controller, priority), spdk_json_decode_string, true},
+ {"subnqn", offsetof(struct rpc_bdev_nvme_attach_controller, subnqn), spdk_json_decode_string, true},
+ {"hostnqn", offsetof(struct rpc_bdev_nvme_attach_controller, hostnqn), spdk_json_decode_string, true},
+ {"hostaddr", offsetof(struct rpc_bdev_nvme_attach_controller, hostaddr), spdk_json_decode_string, true},
+ {"hostsvcid", offsetof(struct rpc_bdev_nvme_attach_controller, hostsvcid), spdk_json_decode_string, true},
+
+ {"prchk_reftag", offsetof(struct rpc_bdev_nvme_attach_controller, prchk_reftag), spdk_json_decode_bool, true},
+ {"prchk_guard", offsetof(struct rpc_bdev_nvme_attach_controller, prchk_guard), spdk_json_decode_bool, true}
+};
+
+#define NVME_MAX_BDEVS_PER_RPC 128
+
+struct rpc_bdev_nvme_attach_controller_ctx {
+ struct rpc_bdev_nvme_attach_controller req;
+ uint32_t count;
+ const char *names[NVME_MAX_BDEVS_PER_RPC];
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+rpc_bdev_nvme_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc)
+{
+ struct rpc_bdev_nvme_attach_controller_ctx *ctx = cb_ctx;
+ struct spdk_jsonrpc_request *request = ctx->request;
+ struct spdk_json_write_ctx *w;
+ size_t i;
+
+ if (rc < 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto exit;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ for (i = 0; i < bdev_count; i++) {
+ spdk_json_write_string(w, ctx->names[i]);
+ }
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+
+exit:
+ free_rpc_bdev_nvme_attach_controller(&ctx->req);
+ free(ctx);
+}
+
+static void
+rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_nvme_attach_controller_ctx *ctx;
+ struct spdk_nvme_transport_id trid = {};
+ struct spdk_nvme_host_id hostid = {};
+ uint32_t prchk_flags = 0;
+ int rc;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_bdev_nvme_attach_controller_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_attach_controller_decoders),
+ &ctx->req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ /* Parse trstring */
+ rc = spdk_nvme_transport_id_populate_trstring(&trid, ctx->req.trtype);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to parse trtype: %s\n", ctx->req.trtype);
+ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s",
+ ctx->req.trtype);
+ goto cleanup;
+ }
+
+	/* Parse trtype. A failure is not expected here because the transport string
+	 * was already validated by spdk_nvme_transport_id_populate_trstring() above,
+	 * hence the assert below.
+	 */
+ rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, ctx->req.trtype);
+ assert(rc == 0);
+
+ /* Parse traddr */
+ snprintf(trid.traddr, sizeof(trid.traddr), "%s", ctx->req.traddr);
+
+ /* Parse adrfam */
+ if (ctx->req.adrfam) {
+ rc = spdk_nvme_transport_id_parse_adrfam(&trid.adrfam, ctx->req.adrfam);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to parse adrfam: %s\n", ctx->req.adrfam);
+ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s",
+ ctx->req.adrfam);
+ goto cleanup;
+ }
+ }
+
+ /* Parse trsvcid */
+ if (ctx->req.trsvcid) {
+ snprintf(trid.trsvcid, sizeof(trid.trsvcid), "%s", ctx->req.trsvcid);
+ }
+
+ /* Parse priority for the NVMe-oF transport connection */
+ if (ctx->req.priority) {
+ trid.priority = spdk_strtol(ctx->req.priority, 10);
+ }
+
+ /* Parse subnqn */
+ if (ctx->req.subnqn) {
+ snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", ctx->req.subnqn);
+ }
+
+ if (ctx->req.hostaddr) {
+ snprintf(hostid.hostaddr, sizeof(hostid.hostaddr), "%s", ctx->req.hostaddr);
+ }
+
+ if (ctx->req.hostsvcid) {
+ snprintf(hostid.hostsvcid, sizeof(hostid.hostsvcid), "%s", ctx->req.hostsvcid);
+ }
+
+ if (ctx->req.prchk_reftag) {
+ prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
+ }
+
+ if (ctx->req.prchk_guard) {
+ prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
+ }
+
+ ctx->request = request;
+ ctx->count = NVME_MAX_BDEVS_PER_RPC;
+ rc = bdev_nvme_create(&trid, &hostid, ctx->req.name, ctx->names, ctx->count, ctx->req.hostnqn,
+ prchk_flags, rpc_bdev_nvme_attach_controller_done, ctx);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ return;
+
+cleanup:
+ free_rpc_bdev_nvme_attach_controller(&ctx->req);
+ free(ctx);
+}
+SPDK_RPC_REGISTER("bdev_nvme_attach_controller", rpc_bdev_nvme_attach_controller,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_attach_controller, construct_nvme_bdev)
+
+static void
+rpc_dump_nvme_controller_info(struct spdk_json_write_ctx *w,
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
+{
+ struct spdk_nvme_transport_id *trid;
+
+ trid = nvme_bdev_ctrlr->trid;
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name);
+
+#ifdef SPDK_CONFIG_NVME_CUSE
+ size_t cuse_name_size = 128;
+ char cuse_name[cuse_name_size];
+
+ int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_bdev_ctrlr->ctrlr, cuse_name, &cuse_name_size);
+ if (rc == 0) {
+ spdk_json_write_named_string(w, "cuse_device", cuse_name);
+ }
+#endif
+
+ spdk_json_write_named_object_begin(w, "trid");
+ nvme_bdev_dump_trid_json(trid, w);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+struct rpc_bdev_nvme_get_controllers {
+ char *name;
+};
+
+static void
+free_rpc_bdev_nvme_get_controllers(struct rpc_bdev_nvme_get_controllers *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_get_controllers_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_nvme_get_controllers, name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_bdev_nvme_get_controllers(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_nvme_get_controllers req = {};
+ struct spdk_json_write_ctx *w;
+ struct nvme_bdev_ctrlr *ctrlr = NULL;
+
+ if (params && spdk_json_decode_object(params, rpc_bdev_nvme_get_controllers_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_get_controllers_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ if (req.name) {
+ ctrlr = nvme_bdev_ctrlr_get_by_name(req.name);
+ if (ctrlr == NULL) {
+ SPDK_ERRLOG("ctrlr '%s' does not exist\n", req.name);
+			spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Controller %s does not exist", req.name);
+ goto cleanup;
+ }
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ if (ctrlr != NULL) {
+ rpc_dump_nvme_controller_info(w, ctrlr);
+ } else {
+ for (ctrlr = nvme_bdev_first_ctrlr(); ctrlr; ctrlr = nvme_bdev_next_ctrlr(ctrlr)) {
+ rpc_dump_nvme_controller_info(w, ctrlr);
+ }
+ }
+
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_nvme_get_controllers(&req);
+}
+SPDK_RPC_REGISTER("bdev_nvme_get_controllers", rpc_bdev_nvme_get_controllers, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_get_controllers, get_nvme_controllers)
+
+struct rpc_bdev_nvme_detach_controller {
+ char *name;
+};
+
+static void
+free_rpc_bdev_nvme_detach_controller(struct rpc_bdev_nvme_detach_controller *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_detach_controller_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_nvme_detach_controller, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_nvme_detach_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_nvme_detach_controller req = {NULL};
+ struct spdk_json_write_ctx *w;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_bdev_nvme_detach_controller_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_detach_controller_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = bdev_nvme_delete(req.name);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_nvme_detach_controller(&req);
+}
+SPDK_RPC_REGISTER("bdev_nvme_detach_controller", rpc_bdev_nvme_detach_controller,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_detach_controller, delete_nvme_controller)
+
+struct rpc_apply_firmware {
+ char *filename;
+ char *bdev_name;
+};
+
+static void
+free_rpc_apply_firmware(struct rpc_apply_firmware *req)
+{
+ free(req->filename);
+ free(req->bdev_name);
+}
+
+static const struct spdk_json_object_decoder rpc_apply_firmware_decoders[] = {
+ {"filename", offsetof(struct rpc_apply_firmware, filename), spdk_json_decode_string},
+ {"bdev_name", offsetof(struct rpc_apply_firmware, bdev_name), spdk_json_decode_string},
+};
+
+struct firmware_update_info {
+ void *fw_image;
+ void *p;
+ unsigned int size;
+ unsigned int size_remaining;
+ unsigned int offset;
+ unsigned int transfer;
+
+ void *desc;
+ struct spdk_io_channel *ch;
+ struct spdk_jsonrpc_request *request;
+ struct spdk_nvme_ctrlr *ctrlr;
+ open_descriptors_t desc_head;
+ struct rpc_apply_firmware *req;
+};
+
+static void
+_apply_firmware_cleanup(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+static void
+apply_firmware_cleanup(void *cb_arg)
+{
+ struct open_descriptors *opt, *tmp;
+ struct firmware_update_info *firm_ctx = cb_arg;
+
+ if (!firm_ctx) {
+ return;
+ }
+
+ if (firm_ctx->fw_image) {
+ spdk_free(firm_ctx->fw_image);
+ }
+
+ if (firm_ctx->req) {
+ free_rpc_apply_firmware(firm_ctx->req);
+ free(firm_ctx->req);
+ }
+
+ if (firm_ctx->ch) {
+ spdk_put_io_channel(firm_ctx->ch);
+ }
+
+ TAILQ_FOREACH_SAFE(opt, &firm_ctx->desc_head, tqlst, tmp) {
+ TAILQ_REMOVE(&firm_ctx->desc_head, opt, tqlst);
+		/* Close the underlying bdev on the thread it was opened on. */
+ if (opt->thread && opt->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(opt->thread, _apply_firmware_cleanup, opt->desc);
+ } else {
+ spdk_bdev_close(opt->desc);
+ }
+ free(opt);
+ }
+ free(firm_ctx);
+}
+
+static void
+apply_firmware_complete_reset(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ int rc;
+ struct spdk_json_write_ctx *w;
+ struct firmware_update_info *firm_ctx = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (!success) {
+ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "firmware commit failed.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ if ((rc = spdk_nvme_ctrlr_reset(firm_ctx->ctrlr)) != 0) {
+ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Controller reset failed.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(firm_ctx->request);
+ spdk_json_write_string(w, "firmware commit succeeded. Controller reset in progress.");
+ spdk_jsonrpc_end_result(firm_ctx->request, w);
+ apply_firmware_cleanup(firm_ctx);
+}
+
+static void
+apply_firmware_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_nvme_cmd cmd = {};
+ struct spdk_nvme_fw_commit fw_commit;
+ int slot = 0;
+ int rc;
+ struct firmware_update_info *firm_ctx = cb_arg;
+ enum spdk_nvme_fw_commit_action commit_action = SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG;
+
+ if (!success) {
+ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "firmware download failed.");
+ spdk_bdev_free_io(bdev_io);
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ firm_ctx->p += firm_ctx->transfer;
+ firm_ctx->offset += firm_ctx->transfer;
+ firm_ctx->size_remaining -= firm_ctx->transfer;
+
+ switch (firm_ctx->size_remaining) {
+ case 0:
+ /* firmware download completed. Commit firmware */
+ memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit));
+ fw_commit.fs = slot;
+ fw_commit.ca = commit_action;
+
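+		/* Firmware Commit packs the slot (FS) and commit action (CA) bit-fields
+		 * into CDW10, so the packed structure is copied in directly.
+		 */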
+ cmd.opc = SPDK_NVME_OPC_FIRMWARE_COMMIT;
+ memcpy(&cmd.cdw10, &fw_commit, sizeof(uint32_t));
+ rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, NULL, 0,
+ apply_firmware_complete_reset, firm_ctx);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "firmware commit failed.");
+ spdk_bdev_free_io(bdev_io);
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+ break;
+ default:
+ firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096);
+ cmd.opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD;
+
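+		/* Firmware Image Download takes a zero-based dword count (NUMD) in CDW10
+		 * and a dword offset (OFST) in CDW11, e.g. a 4096-byte chunk yields
+		 * NUMD = 1023.
+		 */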
+ cmd.cdw10 = (firm_ctx->transfer >> 2) - 1;
+ cmd.cdw11 = firm_ctx->offset >> 2;
+ rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, firm_ctx->p,
+ firm_ctx->transfer, apply_firmware_complete, firm_ctx);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "firmware download failed.");
+ spdk_bdev_free_io(bdev_io);
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+ break;
+ }
+}
+
+static void
+rpc_bdev_nvme_apply_firmware(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ int rc;
+ int fd = -1;
+ struct stat fw_stat;
+ struct spdk_nvme_ctrlr *ctrlr;
+ char msg[1024];
+ struct spdk_bdev *bdev;
+ struct spdk_bdev *bdev2;
+ struct open_descriptors *opt;
+ struct spdk_bdev_desc *desc;
+ struct spdk_nvme_cmd *cmd;
+ struct firmware_update_info *firm_ctx;
+
+ firm_ctx = calloc(1, sizeof(struct firmware_update_info));
+ if (!firm_ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation error.");
+ return;
+ }
+ firm_ctx->fw_image = NULL;
+ TAILQ_INIT(&firm_ctx->desc_head);
+ firm_ctx->request = request;
+
+ firm_ctx->req = calloc(1, sizeof(struct rpc_apply_firmware));
+ if (!firm_ctx->req) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation error.");
+ free(firm_ctx);
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_apply_firmware_decoders,
+ SPDK_COUNTOF(rpc_apply_firmware_decoders), firm_ctx->req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed.");
+ free(firm_ctx->req);
+ free(firm_ctx);
+ return;
+ }
+
+ if ((bdev = spdk_bdev_get_by_name(firm_ctx->req->bdev_name)) == NULL) {
+		snprintf(msg, sizeof(msg), "bdev %s was not found", firm_ctx->req->bdev_name);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg);
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ if ((ctrlr = bdev_nvme_get_ctrlr(bdev)) == NULL) {
+		snprintf(msg, sizeof(msg), "Controller information for %s was not found.",
+ firm_ctx->req->bdev_name);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg);
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+ firm_ctx->ctrlr = ctrlr;
+
+ for (bdev2 = spdk_bdev_first(); bdev2; bdev2 = spdk_bdev_next(bdev2)) {
+
+ if (bdev_nvme_get_ctrlr(bdev2) != ctrlr) {
+ continue;
+ }
+
+ if (!(opt = malloc(sizeof(struct open_descriptors)))) {
+ snprintf(msg, sizeof(msg), "Memory allocation error.");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg);
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ if ((rc = spdk_bdev_open(bdev2, true, NULL, NULL, &desc)) != 0) {
+			snprintf(msg, sizeof(msg), "Device %s is in use.", spdk_bdev_get_name(bdev2));
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg);
+ free(opt);
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ /* Save the thread where the base device is opened */
+ opt->thread = spdk_get_thread();
+
+ opt->desc = desc;
+		opt->bdev = bdev2;
+ TAILQ_INSERT_TAIL(&firm_ctx->desc_head, opt, tqlst);
+ }
+
+ /*
+ * find a descriptor associated with our bdev
+ */
+ firm_ctx->desc = NULL;
+ TAILQ_FOREACH(opt, &firm_ctx->desc_head, tqlst) {
+ if (opt->bdev == bdev) {
+ firm_ctx->desc = opt->desc;
+ break;
+ }
+ }
+
+ if (!firm_ctx->desc) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "No descriptor was found.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ firm_ctx->ch = spdk_bdev_get_io_channel(firm_ctx->desc);
+ if (!firm_ctx->ch) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "No channels were found.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ fd = open(firm_ctx->req->filename, O_RDONLY);
+ if (fd < 0) {
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Failed to open the firmware file.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ rc = fstat(fd, &fw_stat);
+ if (rc < 0) {
+ close(fd);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "fstat failed.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ firm_ctx->size = fw_stat.st_size;
+ if (fw_stat.st_size % 4) {
+ close(fd);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "Firmware image size is not a multiple of 4.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+
+ firm_ctx->fw_image = spdk_zmalloc(firm_ctx->size, 4096, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!firm_ctx->fw_image) {
+ close(fd);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation error.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+ firm_ctx->p = firm_ctx->fw_image;
+
+ if (read(fd, firm_ctx->p, firm_ctx->size) != ((ssize_t)(firm_ctx->size))) {
+ close(fd);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Read firmware image failed!");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+ close(fd);
+
+ firm_ctx->offset = 0;
+ firm_ctx->size_remaining = firm_ctx->size;
+ firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096);
+
+ cmd = malloc(sizeof(struct spdk_nvme_cmd));
+ if (!cmd) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation error.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+ memset(cmd, 0, sizeof(struct spdk_nvme_cmd));
+ cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD;
+
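+	/* Zero-based dword count (NUMD) and dword offset (OFST), as in
+	 * apply_firmware_complete().
+	 */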
+ cmd->cdw10 = (firm_ctx->transfer >> 2) - 1;
+ cmd->cdw11 = firm_ctx->offset >> 2;
+
+ rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, cmd, firm_ctx->p,
+ firm_ctx->transfer, apply_firmware_complete, firm_ctx);
+ if (rc) {
+ free(cmd);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 "firmware download failed.");
+ apply_firmware_cleanup(firm_ctx);
+ return;
+ }
+}
+SPDK_RPC_REGISTER("bdev_nvme_apply_firmware", rpc_bdev_nvme_apply_firmware, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_apply_firmware, apply_nvme_firmware)
diff --git a/src/spdk/module/bdev/nvme/bdev_ocssd.c b/src/spdk/module/bdev/nvme/bdev_ocssd.c
new file mode 100644
index 000000000..35f665f40
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/bdev_ocssd.c
@@ -0,0 +1,1498 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/bdev_module.h"
+#include "spdk/bdev_zone.h"
+#include "spdk/likely.h"
+#include "spdk/log.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/nvme_ocssd.h"
+#include "spdk/nvme_ocssd_spec.h"
+#include "spdk_internal/log.h"
+#include "spdk/nvme.h"
+#include "common.h"
+#include "bdev_ocssd.h"
+
+struct bdev_ocssd_lba_offsets {
+ uint32_t grp;
+ uint32_t pu;
+ uint32_t chk;
+ uint32_t lbk;
+};
+
+struct bdev_ocssd_zone {
+ uint64_t slba;
+ uint64_t write_pointer;
+ uint64_t capacity;
+ bool busy;
+};
+
+struct bdev_ocssd_io {
+ union {
+ struct {
+ struct bdev_ocssd_zone *zone;
+ size_t iov_pos;
+ size_t iov_off;
+ uint64_t lba[SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES];
+ } io;
+ struct {
+ size_t chunk_offset;
+ struct spdk_ocssd_chunk_information_entry chunk_info;
+ } zone_info;
+ };
+};
+
+struct ocssd_io_channel {
+ struct spdk_poller *pending_poller;
+ TAILQ_HEAD(, spdk_bdev_io) pending_requests;
+};
+
+struct ocssd_bdev {
+ struct nvme_bdev nvme_bdev;
+ struct bdev_ocssd_zone *zones;
+ struct bdev_ocssd_range range;
+};
+
+struct bdev_ocssd_ns {
+ struct spdk_ocssd_geometry_data geometry;
+ struct bdev_ocssd_lba_offsets lba_offsets;
+ bool chunk_notify_pending;
+ uint64_t chunk_notify_count;
+ uint64_t num_outstanding;
+#define CHUNK_NOTIFICATION_ENTRY_COUNT 64
+ struct spdk_ocssd_chunk_notification_entry chunk[CHUNK_NOTIFICATION_ENTRY_COUNT];
+};
+
+struct ocssd_bdev_ctrlr {
+ struct spdk_poller *mm_poller;
+};
+
+static struct bdev_ocssd_ns *
+bdev_ocssd_get_ns_from_nvme(struct nvme_bdev_ns *nvme_ns)
+{
+ return nvme_ns->type_ctx;
+}
+
+static struct bdev_ocssd_ns *
+bdev_ocssd_get_ns_from_bdev(struct ocssd_bdev *ocssd_bdev)
+{
+ return bdev_ocssd_get_ns_from_nvme(ocssd_bdev->nvme_bdev.nvme_ns);
+}
+
+static uint64_t
+bdev_ocssd_num_parallel_units(const struct ocssd_bdev *ocssd_bdev)
+{
+ return ocssd_bdev->range.end - ocssd_bdev->range.begin + 1;
+}
+
+static uint64_t
+bdev_ocssd_num_zones(const struct ocssd_bdev *ocssd_bdev)
+{
+ return ocssd_bdev->nvme_bdev.disk.blockcnt / ocssd_bdev->nvme_bdev.disk.zone_size;
+}
+
+static int
+bdev_ocssd_library_init(void)
+{
+ return 0;
+}
+
+static void
+bdev_ocssd_library_fini(void)
+{
+}
+
+static int
+bdev_ocssd_config_json(struct spdk_json_write_ctx *w)
+{
+ return 0;
+}
+
+void
+bdev_ocssd_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct nvme_bdev *nvme_bdev;
+ struct ocssd_bdev *ocssd_bdev;
+ char range_buf[128];
+ int rc;
+
+ TAILQ_FOREACH(nvme_bdev, &ns->bdevs, tailq) {
+ nvme_bdev_ctrlr = nvme_bdev->nvme_bdev_ctrlr;
+ ocssd_bdev = SPDK_CONTAINEROF(nvme_bdev, struct ocssd_bdev, nvme_bdev);
+
+ rc = snprintf(range_buf, sizeof(range_buf), "%"PRIu64"-%"PRIu64,
+ ocssd_bdev->range.begin, ocssd_bdev->range.end);
+ if (rc < 0 || rc >= (int)sizeof(range_buf)) {
+ SPDK_ERRLOG("Failed to convert parallel unit range\n");
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_ocssd_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr_name", nvme_bdev_ctrlr->name);
+ spdk_json_write_named_string(w, "bdev_name", nvme_bdev->disk.name);
+ spdk_json_write_named_uint32(w, "nsid", nvme_bdev->nvme_ns->id);
+ spdk_json_write_named_string(w, "range", range_buf);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+}
+
+static int
+bdev_ocssd_get_ctx_size(void)
+{
+ return sizeof(struct bdev_ocssd_io);
+}
+
+static struct spdk_bdev_module ocssd_if = {
+ .name = "ocssd",
+ .module_init = bdev_ocssd_library_init,
+ .module_fini = bdev_ocssd_library_fini,
+ .config_json = bdev_ocssd_config_json,
+ .get_ctx_size = bdev_ocssd_get_ctx_size,
+};
+
+SPDK_BDEV_MODULE_REGISTER(ocssd, &ocssd_if);
+
+static struct bdev_ocssd_zone *
+bdev_ocssd_get_zone_by_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba)
+{
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+ size_t zone_size = nvme_bdev->disk.zone_size;
+
+ if (lba >= nvme_bdev->disk.blockcnt) {
+ return NULL;
+ }
+
+ return &ocssd_bdev->zones[lba / zone_size];
+}
+
+static struct bdev_ocssd_zone *
+bdev_ocssd_get_zone_by_slba(struct ocssd_bdev *ocssd_bdev, uint64_t slba)
+{
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+
+ if (slba % nvme_bdev->disk.zone_size != 0) {
+ return NULL;
+ }
+
+ return bdev_ocssd_get_zone_by_lba(ocssd_bdev, slba);
+}
+
+static void
+bdev_ocssd_free_bdev(struct ocssd_bdev *ocssd_bdev)
+{
+ if (!ocssd_bdev) {
+ return;
+ }
+
+ free(ocssd_bdev->zones);
+ free(ocssd_bdev->nvme_bdev.disk.name);
+ free(ocssd_bdev);
+}
+
+static int
+bdev_ocssd_destruct(void *ctx)
+{
+ struct ocssd_bdev *ocssd_bdev = ctx;
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+
+ nvme_bdev_detach_bdev_from_ns(nvme_bdev);
+ bdev_ocssd_free_bdev(ocssd_bdev);
+
+ return 0;
+}
+
+static void
+bdev_ocssd_translate_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba, uint64_t *grp,
+ uint64_t *pu, uint64_t *chk, uint64_t *lbk)
+{
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev);
+ const struct spdk_ocssd_geometry_data *geo = &ocssd_ns->geometry;
+ const struct bdev_ocssd_range *range = &ocssd_bdev->range;
+ uint64_t addr_shift, punit;
+
+ /* To achieve best performance, we need to make sure that adjacent zones can be accessed
+ * in parallel. We accomplish this by having the following addressing scheme:
+ *
+ * [ zone id ][ zone offset ] User's LBA
+ * [ chunk ][ group ][ parallel unit ][ logical block ] Open Channel's LBA
+ *
+ * which means that neighbouring zones are placed in a different group and parallel unit.
+ */
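+	/* Illustrative example: with geo->clba == 4096, range->begin == 0 and 8
+	 * parallel units, user LBA 4096 lands on logical block 0 of the second
+	 * parallel unit, while user LBA 8 * 4096 wraps back to the first unit's
+	 * second chunk.
+	 */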
+ *lbk = lba % geo->clba;
+ addr_shift = geo->clba;
+
+ punit = range->begin + (lba / addr_shift) % bdev_ocssd_num_parallel_units(ocssd_bdev);
+
+ *pu = punit % geo->num_pu;
+ *grp = punit / geo->num_pu;
+
+ addr_shift *= bdev_ocssd_num_parallel_units(ocssd_bdev);
+
+ *chk = (lba / addr_shift) % geo->num_chk;
+}
+
+static uint64_t
+bdev_ocssd_from_disk_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba)
+{
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev);
+ const struct spdk_ocssd_geometry_data *geometry = &ocssd_ns->geometry;
+ const struct bdev_ocssd_lba_offsets *offsets = &ocssd_ns->lba_offsets;
+ const struct bdev_ocssd_range *range = &ocssd_bdev->range;
+ uint64_t lbk, chk, pu, grp, punit;
+
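+	/* Each address field is recovered by shifting to its offset and masking with
+	 * its bit length, both taken from the device-reported geometry.
+	 */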
+ lbk = (lba >> offsets->lbk) & ((1 << geometry->lbaf.lbk_len) - 1);
+ chk = (lba >> offsets->chk) & ((1 << geometry->lbaf.chk_len) - 1);
+ pu = (lba >> offsets->pu) & ((1 << geometry->lbaf.pu_len) - 1);
+ grp = (lba >> offsets->grp) & ((1 << geometry->lbaf.grp_len) - 1);
+
+ punit = grp * geometry->num_pu + pu - range->begin;
+
+ return lbk + punit * geometry->clba + chk * geometry->clba *
+ bdev_ocssd_num_parallel_units(ocssd_bdev);
+}
+
+static uint64_t
+bdev_ocssd_to_disk_lba(struct ocssd_bdev *ocssd_bdev, uint64_t lba)
+{
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev);
+ const struct bdev_ocssd_lba_offsets *offsets = &ocssd_ns->lba_offsets;
+ uint64_t lbk, chk, pu, grp;
+
+ bdev_ocssd_translate_lba(ocssd_bdev, lba, &grp, &pu, &chk, &lbk);
+
+ return (lbk << offsets->lbk) |
+ (chk << offsets->chk) |
+ (pu << offsets->pu) |
+ (grp << offsets->grp);
+}
+
+static bool
+bdev_ocssd_lba_in_range(struct ocssd_bdev *ocssd_bdev, uint64_t lba)
+{
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev);
+ const struct spdk_ocssd_geometry_data *geometry = &ocssd_ns->geometry;
+ const struct bdev_ocssd_lba_offsets *offsets = &ocssd_ns->lba_offsets;
+ const struct bdev_ocssd_range *range = &ocssd_bdev->range;
+ uint64_t pu, grp, punit;
+
+ pu = (lba >> offsets->pu) & ((1 << geometry->lbaf.pu_len) - 1);
+ grp = (lba >> offsets->grp) & ((1 << geometry->lbaf.grp_len) - 1);
+ punit = grp * geometry->num_pu + pu;
+
+ return punit >= range->begin && punit <= range->end;
+}
+
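+/* SGL callbacks used by the vectored read/write helpers below: reset_sgl
+ * positions the iovec cursor at a given byte offset, while next_sge hands back
+ * one contiguous segment at a time.
+ */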
+static void
+bdev_ocssd_reset_sgl(void *cb_arg, uint32_t offset)
+{
+ struct spdk_bdev_io *bdev_io = cb_arg;
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+ struct iovec *iov;
+
+ ocdev_io->io.iov_pos = 0;
+ ocdev_io->io.iov_off = 0;
+
+ for (; ocdev_io->io.iov_pos < (size_t)bdev_io->u.bdev.iovcnt; ++ocdev_io->io.iov_pos) {
+ iov = &bdev_io->u.bdev.iovs[ocdev_io->io.iov_pos];
+ if (offset < iov->iov_len) {
+ ocdev_io->io.iov_off = offset;
+ return;
+ }
+
+ offset -= iov->iov_len;
+ }
+
+ assert(false && "Invalid offset length");
+}
+
+static int
+bdev_ocssd_next_sge(void *cb_arg, void **address, uint32_t *length)
+{
+ struct spdk_bdev_io *bdev_io = cb_arg;
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+ struct iovec *iov;
+
+ assert(ocdev_io->io.iov_pos < (size_t)bdev_io->u.bdev.iovcnt);
+ iov = &bdev_io->u.bdev.iovs[ocdev_io->io.iov_pos];
+
+ *address = iov->iov_base;
+ *length = iov->iov_len;
+
+ if (ocdev_io->io.iov_off != 0) {
+ assert(ocdev_io->io.iov_off < iov->iov_len);
+ *address = (char *)*address + ocdev_io->io.iov_off;
+ *length -= ocdev_io->io.iov_off;
+ }
+
+ assert(ocdev_io->io.iov_off + *length == iov->iov_len);
+ ocdev_io->io.iov_off = 0;
+ ocdev_io->io.iov_pos++;
+
+ return 0;
+}
+
+static void
+bdev_ocssd_read_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+
+ spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc);
+}
+
+static int
+bdev_ocssd_read(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io)
+{
+ struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt;
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+ struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch);
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+ const size_t zone_size = nvme_bdev->disk.zone_size;
+ uint64_t lba;
+
+ if ((bdev_io->u.bdev.offset_blocks % zone_size) + bdev_io->u.bdev.num_blocks > zone_size) {
+ SPDK_ERRLOG("Tried to cross zone boundary during read command\n");
+ return -EINVAL;
+ }
+
+ ocdev_io->io.iov_pos = 0;
+ ocdev_io->io.iov_off = 0;
+
+ lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev_io->u.bdev.offset_blocks);
+
+ return spdk_nvme_ns_cmd_readv_with_md(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, lba,
+ bdev_io->u.bdev.num_blocks, bdev_ocssd_read_cb,
+ bdev_io, 0, bdev_ocssd_reset_sgl,
+ bdev_ocssd_next_sge, bdev_io->u.bdev.md_buf, 0, 0);
+}
+
+static void
+bdev_ocssd_write_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
+ bdev_io->u.bdev.offset_blocks = ocdev_io->io.zone->write_pointer;
+ }
+
+ ocdev_io->io.zone->write_pointer = bdev_io->u.bdev.offset_blocks +
+ bdev_io->u.bdev.num_blocks;
+ assert(ocdev_io->io.zone->write_pointer <= ocdev_io->io.zone->slba +
+ ocdev_io->io.zone->capacity);
+
+ __atomic_store_n(&ocdev_io->io.zone->busy, false, __ATOMIC_SEQ_CST);
+ spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc);
+}
+
+static int
+bdev_ocssd_write(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io)
+{
+ struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt;
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+ struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch);
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+ const size_t zone_size = nvme_bdev->disk.zone_size;
+ uint64_t lba;
+ int rc;
+
+ if ((bdev_io->u.bdev.offset_blocks % zone_size) + bdev_io->u.bdev.num_blocks > zone_size) {
+ SPDK_ERRLOG("Tried to cross zone boundary during write command\n");
+ return -EINVAL;
+ }
+
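+	/* Writes are serialized per zone: atomically mark the zone busy and fail if
+	 * another write to the same zone is already in flight */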
+ ocdev_io->io.zone = bdev_ocssd_get_zone_by_lba(ocssd_bdev, bdev_io->u.bdev.offset_blocks);
+ if (__atomic_exchange_n(&ocdev_io->io.zone->busy, true, __ATOMIC_SEQ_CST)) {
+ return -EINVAL;
+ }
+
+ ocdev_io->io.iov_pos = 0;
+ ocdev_io->io.iov_off = 0;
+
+ lba = bdev_ocssd_to_disk_lba(ocssd_bdev, bdev_io->u.bdev.offset_blocks);
+ rc = spdk_nvme_ns_cmd_writev_with_md(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, lba,
+ bdev_io->u.bdev.num_blocks, bdev_ocssd_write_cb,
+ bdev_io, 0, bdev_ocssd_reset_sgl,
+ bdev_ocssd_next_sge, bdev_io->u.bdev.md_buf, 0, 0);
+ if (spdk_unlikely(rc != 0)) {
+ __atomic_store_n(&ocdev_io->io.zone->busy, false, __ATOMIC_SEQ_CST);
+ }
+
+ return rc;
+}
+
+static int
+bdev_ocssd_zone_append(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io)
+{
+ struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt;
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+ struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch);
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+ struct bdev_ocssd_zone *zone;
+ uint64_t lba;
+ int rc = 0;
+
+ zone = bdev_ocssd_get_zone_by_slba(ocssd_bdev, bdev_io->u.bdev.offset_blocks);
+ if (!zone) {
+ SPDK_ERRLOG("Invalid zone SLBA: %"PRIu64"\n", bdev_io->u.bdev.offset_blocks);
+ return -EINVAL;
+ }
+
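+	/* A busy zone causes the append to be delayed (-EAGAIN) and retried from the
+	 * pending requests poller */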
+ if (__atomic_exchange_n(&zone->busy, true, __ATOMIC_SEQ_CST)) {
+ return -EAGAIN;
+ }
+
+ if (zone->slba + zone->capacity - zone->write_pointer < bdev_io->u.bdev.num_blocks) {
+ SPDK_ERRLOG("Insufficient number of blocks remaining\n");
+ rc = -ENOSPC;
+ goto out;
+ }
+
+ ocdev_io->io.zone = zone;
+ ocdev_io->io.iov_pos = 0;
+ ocdev_io->io.iov_off = 0;
+
+ lba = bdev_ocssd_to_disk_lba(ocssd_bdev, zone->write_pointer);
+ rc = spdk_nvme_ns_cmd_writev_with_md(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair, lba,
+ bdev_io->u.bdev.num_blocks, bdev_ocssd_write_cb,
+ bdev_io, 0, bdev_ocssd_reset_sgl,
+ bdev_ocssd_next_sge, bdev_io->u.bdev.md_buf, 0, 0);
+out:
+ if (spdk_unlikely(rc != 0)) {
+ __atomic_store_n(&zone->busy, false, __ATOMIC_SEQ_CST);
+ }
+
+ return rc;
+}
+
+static void
+bdev_ocssd_io_get_buf_cb(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ int rc;
+
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ return;
+ }
+
+ rc = bdev_ocssd_read(ioch, bdev_io);
+	if (spdk_unlikely(rc != 0)) {

+ if (rc == -ENOMEM) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+static void
+bdev_ocssd_reset_zone_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+
+ ocdev_io->io.zone->write_pointer = ocdev_io->io.zone->slba;
+ __atomic_store_n(&ocdev_io->io.zone->busy, false, __ATOMIC_SEQ_CST);
+ spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc);
+}
+
+static int
+bdev_ocssd_reset_zone(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io,
+ uint64_t slba, size_t num_zones)
+{
+ struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt;
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+ struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch);
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+ uint64_t offset, zone_size = nvme_bdev->disk.zone_size;
+ int rc;
+
+ if (num_zones > 1) {
+ SPDK_ERRLOG("Exceeded maximum number of zones per single reset: 1\n");
+ return -EINVAL;
+ }
+
+ ocdev_io->io.zone = bdev_ocssd_get_zone_by_slba(ocssd_bdev, slba);
+ if (__atomic_exchange_n(&ocdev_io->io.zone->busy, true, __ATOMIC_SEQ_CST)) {
+ return -EINVAL;
+ }
+
+ for (offset = 0; offset < num_zones; ++offset) {
+ ocdev_io->io.lba[offset] = bdev_ocssd_to_disk_lba(ocssd_bdev,
+ slba + offset * zone_size);
+ }
+
+ rc = spdk_nvme_ocssd_ns_cmd_vector_reset(nvme_bdev->nvme_ns->ns, nvme_ioch->qpair,
+ ocdev_io->io.lba, num_zones, NULL,
+ bdev_ocssd_reset_zone_cb, bdev_io);
+ if (spdk_unlikely(rc != 0)) {
+ __atomic_store_n(&ocdev_io->io.zone->busy, false, __ATOMIC_SEQ_CST);
+ }
+
+ return rc;
+}
+
+static int _bdev_ocssd_get_zone_info(struct spdk_bdev_io *bdev_io);
+
+static void
+bdev_ocssd_fill_zone_info(struct ocssd_bdev *ocssd_bdev, struct spdk_bdev_zone_info *zone_info,
+ const struct spdk_ocssd_chunk_information_entry *chunk_info)
+{
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+
+ zone_info->zone_id = bdev_ocssd_from_disk_lba(ocssd_bdev, chunk_info->slba);
+ zone_info->write_pointer = zone_info->zone_id;
+
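+	/* Map the OCSSD chunk state onto the generic bdev zone states */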
+ if (chunk_info->cs.free) {
+ zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
+ } else if (chunk_info->cs.closed) {
+ zone_info->state = SPDK_BDEV_ZONE_STATE_FULL;
+ } else if (chunk_info->cs.open) {
+ zone_info->state = SPDK_BDEV_ZONE_STATE_OPEN;
+ zone_info->write_pointer += chunk_info->wp % nvme_bdev->disk.zone_size;
+ } else if (chunk_info->cs.offline) {
+ zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
+ } else {
+ SPDK_ERRLOG("Unknown chunk state, assuming offline\n");
+ zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
+ }
+
+ if (chunk_info->ct.size_deviate) {
+ zone_info->capacity = chunk_info->cnlb;
+ } else {
+ zone_info->capacity = nvme_bdev->disk.zone_size;
+ }
+}
+
+static void
+bdev_ocssd_zone_info_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+ struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt;
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+ struct spdk_ocssd_chunk_information_entry *chunk_info = &ocdev_io->zone_info.chunk_info;
+ struct spdk_bdev_zone_info *zone_info;
+ int rc;
+
+ if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
+ spdk_bdev_io_complete_nvme_status(bdev_io, 0, cpl->status.sct, cpl->status.sc);
+ return;
+ }
+
+ zone_info = ((struct spdk_bdev_zone_info *)bdev_io->u.zone_mgmt.buf) +
+ ocdev_io->zone_info.chunk_offset;
+ bdev_ocssd_fill_zone_info(ocssd_bdev, zone_info, chunk_info);
+
+ if (++ocdev_io->zone_info.chunk_offset == bdev_io->u.zone_mgmt.num_zones) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ rc = _bdev_ocssd_get_zone_info(bdev_io);
+ if (spdk_unlikely(rc != 0)) {
+ if (rc == -ENOMEM) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+ }
+}
+
+static int
+_bdev_ocssd_get_zone_info(struct spdk_bdev_io *bdev_io)
+{
+ struct ocssd_bdev *ocssd_bdev = bdev_io->bdev->ctxt;
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev);
+ const struct spdk_ocssd_geometry_data *geo = &ocssd_ns->geometry;
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+ uint64_t lba, grp, pu, chk, lbk, offset;
+
+ lba = bdev_io->u.zone_mgmt.zone_id + ocdev_io->zone_info.chunk_offset *
+ nvme_bdev->disk.zone_size;
+ bdev_ocssd_translate_lba(ocssd_bdev, lba, &grp, &pu, &chk, &lbk);
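+	/* Index of the chunk's entry within the chunk information log page, which is
+	 * laid out group-major: grp -> pu -> chk */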
+ offset = grp * geo->num_pu * geo->num_chk + pu * geo->num_chk + chk;
+
+ return spdk_nvme_ctrlr_cmd_get_log_page(nvme_bdev->nvme_bdev_ctrlr->ctrlr,
+ SPDK_OCSSD_LOG_CHUNK_INFO,
+ spdk_nvme_ns_get_id(nvme_bdev->nvme_ns->ns),
+ &ocdev_io->zone_info.chunk_info,
+ sizeof(ocdev_io->zone_info.chunk_info),
+ offset * sizeof(ocdev_io->zone_info.chunk_info),
+ bdev_ocssd_zone_info_cb, (void *)bdev_io);
+}
+
+static int
+bdev_ocssd_get_zone_info(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io)
+{
+ struct bdev_ocssd_io *ocdev_io = (struct bdev_ocssd_io *)bdev_io->driver_ctx;
+
+ if (bdev_io->u.zone_mgmt.num_zones < 1) {
+ SPDK_ERRLOG("Invalid number of zones: %"PRIu32"\n", bdev_io->u.zone_mgmt.num_zones);
+ return -EINVAL;
+ }
+
+ if (bdev_io->u.zone_mgmt.zone_id % bdev_io->bdev->zone_size != 0) {
+ SPDK_ERRLOG("Unaligned zone LBA: %"PRIu64"\n", bdev_io->u.zone_mgmt.zone_id);
+ return -EINVAL;
+ }
+
+ ocdev_io->zone_info.chunk_offset = 0;
+
+ return _bdev_ocssd_get_zone_info(bdev_io);
+}
+
+static int
+bdev_ocssd_zone_management(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->u.zone_mgmt.zone_action) {
+ case SPDK_BDEV_ZONE_RESET:
+ return bdev_ocssd_reset_zone(ioch, bdev_io, bdev_io->u.zone_mgmt.zone_id,
+ bdev_io->u.zone_mgmt.num_zones);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void bdev_ocssd_submit_request(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io);
+
+static int
+bdev_ocssd_poll_pending(void *ctx)
+{
+ struct spdk_io_channel *ioch = ctx;
+ struct nvme_io_channel *nvme_ioch;
+ struct ocssd_io_channel *ocssd_ioch;
+ struct spdk_bdev_io *bdev_io;
+ TAILQ_HEAD(, spdk_bdev_io) pending_requests;
+ int num_requests = 0;
+
+ nvme_ioch = spdk_io_channel_get_ctx(ioch);
+ ocssd_ioch = nvme_ioch->ocssd_ioch;
+
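+	/* Swap the pending requests onto a local list, so that requests re-queued
+	 * during resubmission are only picked up on the next poller iteration */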
+ TAILQ_INIT(&pending_requests);
+ TAILQ_SWAP(&ocssd_ioch->pending_requests, &pending_requests, spdk_bdev_io, module_link);
+
+ while ((bdev_io = TAILQ_FIRST(&pending_requests))) {
+ TAILQ_REMOVE(&pending_requests, bdev_io, module_link);
+ bdev_ocssd_submit_request(ioch, bdev_io);
+ num_requests++;
+ }
+
+ if (TAILQ_EMPTY(&ocssd_ioch->pending_requests)) {
+ spdk_poller_pause(ocssd_ioch->pending_poller);
+ }
+
+ return num_requests;
+}
+
+static void
+bdev_ocssd_delay_request(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io)
+{
+ struct nvme_io_channel *nvme_ioch = spdk_io_channel_get_ctx(ioch);
+ struct ocssd_io_channel *ocssd_ioch = nvme_ioch->ocssd_ioch;
+
+ TAILQ_INSERT_TAIL(&ocssd_ioch->pending_requests, bdev_io, module_link);
+ spdk_poller_resume(ocssd_ioch->pending_poller);
+}
+
+static void
+bdev_ocssd_submit_request(struct spdk_io_channel *ioch, struct spdk_bdev_io *bdev_io)
+{
+ int rc = 0;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, bdev_ocssd_io_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ rc = bdev_ocssd_write(ioch, bdev_io);
+ break;
+
+ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
+ rc = bdev_ocssd_zone_management(ioch, bdev_io);
+ break;
+
+ case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
+ rc = bdev_ocssd_get_zone_info(ioch, bdev_io);
+ break;
+
+ case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
+ rc = bdev_ocssd_zone_append(ioch, bdev_io);
+ break;
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ if (spdk_unlikely(rc != 0)) {
+ switch (rc) {
+ case -ENOMEM:
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ break;
+ case -EAGAIN:
+ bdev_ocssd_delay_request(ioch, bdev_io);
+ break;
+ default:
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ break;
+ }
+ }
+}
+
+static bool
+bdev_ocssd_io_type_supported(void *ctx, enum spdk_bdev_io_type type)
+{
+ switch (type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
+ case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static struct spdk_io_channel *
+bdev_ocssd_get_io_channel(void *ctx)
+{
+ struct ocssd_bdev *ocssd_bdev = ctx;
+
+ return spdk_get_io_channel(ocssd_bdev->nvme_bdev.nvme_bdev_ctrlr);
+}
+
+static void
+bdev_ocssd_free_namespace(struct nvme_bdev_ns *nvme_ns)
+{
+ struct nvme_bdev *bdev, *tmp;
+
+ TAILQ_FOREACH_SAFE(bdev, &nvme_ns->bdevs, tailq, tmp) {
+ spdk_bdev_unregister(&bdev->disk, NULL, NULL);
+ }
+
+ free(nvme_ns->type_ctx);
+ nvme_ns->type_ctx = NULL;
+
+ nvme_ctrlr_depopulate_namespace_done(nvme_ns->ctrlr);
+}
+
+static void
+bdev_ocssd_chunk_notification_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_bdev_ns *nvme_ns = ctx;
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns);
+ struct spdk_bdev_media_event event;
+ struct spdk_ocssd_chunk_notification_entry *chunk_entry;
+ struct nvme_bdev *nvme_bdev;
+ struct ocssd_bdev *ocssd_bdev;
+ size_t chunk_id, num_blocks, lba;
+ int rc;
+
+ ocssd_ns->num_outstanding--;
+
+ /* The namespace could have been depopulated in the meantime */
+ if (!nvme_ns->populated) {
+ if (ocssd_ns->num_outstanding == 0) {
+ bdev_ocssd_free_namespace(nvme_ns);
+ }
+
+ return;
+ }
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("Failed to retrieve chunk notification log\n");
+ return;
+ }
+
+ for (chunk_id = 0; chunk_id < CHUNK_NOTIFICATION_ENTRY_COUNT; ++chunk_id) {
+ chunk_entry = &ocssd_ns->chunk[chunk_id];
+ if (chunk_entry->nc <= ocssd_ns->chunk_notify_count) {
+ break;
+ }
+
+ ocssd_ns->chunk_notify_count = chunk_entry->nc;
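+		/* The notification mask indicates the scope of the event: a single
+		 * logical block, a whole chunk, or an entire parallel unit */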
+ if (chunk_entry->mask.lblk) {
+ num_blocks = chunk_entry->nlb;
+ } else if (chunk_entry->mask.chunk) {
+ num_blocks = ocssd_ns->geometry.clba;
+ } else if (chunk_entry->mask.pu) {
+ num_blocks = ocssd_ns->geometry.clba * ocssd_ns->geometry.num_chk;
+ } else {
+ SPDK_WARNLOG("Invalid chunk notification mask\n");
+ continue;
+ }
+
+ TAILQ_FOREACH(nvme_bdev, &nvme_ns->bdevs, tailq) {
+ ocssd_bdev = SPDK_CONTAINEROF(nvme_bdev, struct ocssd_bdev, nvme_bdev);
+ if (bdev_ocssd_lba_in_range(ocssd_bdev, chunk_entry->lba)) {
+ break;
+ }
+ }
+
+ if (nvme_bdev == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_BDEV_OCSSD, "Dropping media management event\n");
+ continue;
+ }
+
+ lba = bdev_ocssd_from_disk_lba(ocssd_bdev, chunk_entry->lba);
+ while (num_blocks > 0 && lba < nvme_bdev->disk.blockcnt) {
+ event.offset = lba;
+ event.num_blocks = spdk_min(num_blocks, ocssd_ns->geometry.clba);
+
+ rc = spdk_bdev_push_media_events(&nvme_bdev->disk, &event, 1);
+ if (spdk_unlikely(rc < 0)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_OCSSD, "Failed to push media event: %s\n",
+ spdk_strerror(-rc));
+ break;
+ }
+
+ /* Jump to the next chunk on the same parallel unit */
+ lba += ocssd_ns->geometry.clba * bdev_ocssd_num_parallel_units(ocssd_bdev);
+ num_blocks -= event.num_blocks;
+ }
+ }
+
+	/* If at least one notification has been processed, send out a media management event */
+ if (chunk_id > 0) {
+ TAILQ_FOREACH(nvme_bdev, &nvme_ns->bdevs, tailq) {
+ spdk_bdev_notify_media_management(&nvme_bdev->disk);
+ }
+ }
+
+ /* If we filled the full array of events, there may be more still pending. Set the pending
+ * flag back to true so that we try to get more events again next time the poller runs.
+ */
+ if (chunk_id == CHUNK_NOTIFICATION_ENTRY_COUNT) {
+ ocssd_ns->chunk_notify_pending = true;
+ }
+}
+
+static int
+bdev_ocssd_poll_mm(void *ctx)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx;
+ struct nvme_bdev_ns *nvme_ns;
+ struct bdev_ocssd_ns *ocssd_ns;
+ uint32_t nsid;
+ int rc;
+
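+	/* For each populated namespace with a pending notification, fetch the chunk
+	 * notification log page; completions are handled asynchronously */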
+ for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
+ nvme_ns = nvme_bdev_ctrlr->namespaces[nsid];
+ if (nvme_ns == NULL || !nvme_ns->populated) {
+ continue;
+ }
+
+ ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns);
+ if (ocssd_ns->chunk_notify_pending) {
+ ocssd_ns->chunk_notify_pending = false;
+ ocssd_ns->num_outstanding++;
+
+ rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_bdev_ctrlr->ctrlr,
+ SPDK_OCSSD_LOG_CHUNK_NOTIFICATION,
+ nsid + 1, ocssd_ns->chunk,
+ sizeof(ocssd_ns->chunk[0]) *
+ CHUNK_NOTIFICATION_ENTRY_COUNT,
+ 0, bdev_ocssd_chunk_notification_cb,
+ nvme_ns);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Failed to get chunk notification log page: %s\n",
+ spdk_strerror(-rc));
+ ocssd_ns->num_outstanding--;
+ }
+ }
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+void
+bdev_ocssd_handle_chunk_notification(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
+{
+ struct bdev_ocssd_ns *ocssd_ns;
+ struct nvme_bdev_ns *nvme_ns;
+ uint32_t nsid;
+
+ for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) {
+ nvme_ns = nvme_bdev_ctrlr->namespaces[nsid];
+ if (nvme_ns == NULL || !nvme_ns->populated) {
+ continue;
+ }
+
+ ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns);
+ ocssd_ns->chunk_notify_pending = true;
+ }
+}
+
+static struct spdk_bdev_fn_table ocssdlib_fn_table = {
+ .destruct = bdev_ocssd_destruct,
+ .submit_request = bdev_ocssd_submit_request,
+ .io_type_supported = bdev_ocssd_io_type_supported,
+ .get_io_channel = bdev_ocssd_get_io_channel,
+};
+
+struct bdev_ocssd_create_ctx {
+ struct ocssd_bdev *ocssd_bdev;
+ bdev_ocssd_create_cb cb_fn;
+ void *cb_arg;
+ const struct bdev_ocssd_range *range;
+ uint64_t chunk_offset;
+ uint64_t end_chunk_offset;
+ uint64_t num_chunks;
+#define OCSSD_BDEV_CHUNK_INFO_COUNT 128
+ struct spdk_ocssd_chunk_information_entry chunk_info[OCSSD_BDEV_CHUNK_INFO_COUNT];
+};
+
+static void
+bdev_ocssd_create_complete(struct bdev_ocssd_create_ctx *create_ctx, int status)
+{
+ const char *bdev_name = create_ctx->ocssd_bdev->nvme_bdev.disk.name;
+
+ if (spdk_unlikely(status != 0)) {
+ bdev_ocssd_free_bdev(create_ctx->ocssd_bdev);
+ }
+
+ create_ctx->cb_fn(bdev_name, status, create_ctx->cb_arg);
+ free(create_ctx);
+}
+
+static int bdev_ocssd_init_zone(struct bdev_ocssd_create_ctx *create_ctx);
+
+static void
+bdev_ocssd_register_bdev(void *ctx)
+{
+ struct bdev_ocssd_create_ctx *create_ctx = ctx;
+ struct ocssd_bdev *ocssd_bdev = create_ctx->ocssd_bdev;
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+ int rc;
+
+ rc = spdk_bdev_register(&nvme_bdev->disk);
+ if (spdk_likely(rc == 0)) {
+ nvme_bdev_attach_bdev_to_ns(nvme_bdev->nvme_ns, nvme_bdev);
+ } else {
+ SPDK_ERRLOG("Failed to register bdev %s\n", nvme_bdev->disk.name);
+ }
+
+ bdev_ocssd_create_complete(create_ctx, rc);
+}
+
+static void
+bdev_ocssd_init_zone_cb(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct bdev_ocssd_create_ctx *create_ctx = ctx;
+ struct bdev_ocssd_zone *ocssd_zone;
+ struct ocssd_bdev *ocssd_bdev = create_ctx->ocssd_bdev;
+ struct spdk_bdev_zone_info zone_info = {};
+ uint64_t offset;
+ int rc = 0;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("Chunk information log page failed\n");
+ bdev_ocssd_create_complete(create_ctx, -EIO);
+ return;
+ }
+
+ for (offset = 0; offset < create_ctx->num_chunks; ++offset) {
+ bdev_ocssd_fill_zone_info(ocssd_bdev, &zone_info, &create_ctx->chunk_info[offset]);
+
+ ocssd_zone = bdev_ocssd_get_zone_by_slba(ocssd_bdev, zone_info.zone_id);
+ if (!ocssd_zone) {
+ SPDK_ERRLOG("Received invalid zone starting LBA: %"PRIu64"\n",
+ zone_info.zone_id);
+ bdev_ocssd_create_complete(create_ctx, -EINVAL);
+ return;
+ }
+
+ /* Make sure we're not filling the same zone twice */
+ assert(ocssd_zone->busy);
+
+ ocssd_zone->busy = false;
+ ocssd_zone->slba = zone_info.zone_id;
+ ocssd_zone->capacity = zone_info.capacity;
+ ocssd_zone->write_pointer = zone_info.write_pointer;
+ }
+
+ create_ctx->chunk_offset += create_ctx->num_chunks;
+ if (create_ctx->chunk_offset < create_ctx->end_chunk_offset) {
+ rc = bdev_ocssd_init_zone(create_ctx);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Failed to send chunk info log page\n");
+ bdev_ocssd_create_complete(create_ctx, rc);
+ }
+ } else {
+ /* Make sure all zones have been processed */
+ for (offset = 0; offset < bdev_ocssd_num_zones(ocssd_bdev); ++offset) {
+ assert(!ocssd_bdev->zones[offset].busy);
+ }
+
+ /* Schedule the last bit of work (io_device, bdev registration) to be done in a
+		 * context that is not tied to the admin command's completion callback.
+ */
+ spdk_thread_send_msg(spdk_get_thread(), bdev_ocssd_register_bdev, create_ctx);
+ }
+}
+
+static int
+bdev_ocssd_init_zone(struct bdev_ocssd_create_ctx *create_ctx)
+{
+ struct ocssd_bdev *ocssd_bdev = create_ctx->ocssd_bdev;
+ struct nvme_bdev *nvme_bdev = &ocssd_bdev->nvme_bdev;
+
+ create_ctx->num_chunks = spdk_min(create_ctx->end_chunk_offset - create_ctx->chunk_offset,
+ OCSSD_BDEV_CHUNK_INFO_COUNT);
+ assert(create_ctx->num_chunks > 0);
+
+ return spdk_nvme_ctrlr_cmd_get_log_page(nvme_bdev->nvme_bdev_ctrlr->ctrlr,
+ SPDK_OCSSD_LOG_CHUNK_INFO,
+ spdk_nvme_ns_get_id(nvme_bdev->nvme_ns->ns),
+ &create_ctx->chunk_info,
+ sizeof(create_ctx->chunk_info[0]) *
+ create_ctx->num_chunks,
+ sizeof(create_ctx->chunk_info[0]) *
+ create_ctx->chunk_offset,
+					bdev_ocssd_init_zone_cb, create_ctx);
+}
+
+static int
+bdev_ocssd_init_zones(struct bdev_ocssd_create_ctx *create_ctx)
+{
+ struct ocssd_bdev *ocssd_bdev = create_ctx->ocssd_bdev;
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_bdev(ocssd_bdev);
+ struct spdk_bdev *bdev = &ocssd_bdev->nvme_bdev.disk;
+ uint64_t offset;
+
+ ocssd_bdev->zones = calloc(bdev_ocssd_num_zones(ocssd_bdev), sizeof(*ocssd_bdev->zones));
+ if (!ocssd_bdev->zones) {
+ return -ENOMEM;
+ }
+
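+	/* Chunk information entries are laid out per parallel unit, so the first entry
+	 * belonging to this bdev is at range.begin * num_chk */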
+ create_ctx->chunk_offset = ocssd_bdev->range.begin * ocssd_ns->geometry.num_chk;
+ create_ctx->end_chunk_offset = create_ctx->chunk_offset + bdev->blockcnt / bdev->zone_size;
+
+	/* Mark all zones as busy and clear the flag as each zone's info is filled in */
+ for (offset = 0; offset < bdev_ocssd_num_zones(ocssd_bdev); ++offset) {
+ ocssd_bdev->zones[offset].busy = true;
+ }
+
+ return bdev_ocssd_init_zone(create_ctx);
+}
+
+static bool
+bdev_ocssd_verify_range(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, uint32_t nsid,
+ const struct bdev_ocssd_range *range)
+{
+ struct nvme_bdev_ns *nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns);
+ const struct spdk_ocssd_geometry_data *geometry = &ocssd_ns->geometry;
+ struct ocssd_bdev *ocssd_bdev;
+ struct nvme_bdev *nvme_bdev;
+ size_t num_punits = geometry->num_pu * geometry->num_grp;
+
+ /* First verify the range is within the geometry */
+ if (range != NULL && (range->begin > range->end || range->end >= num_punits)) {
+ return false;
+ }
+
+ TAILQ_FOREACH(nvme_bdev, &nvme_ns->bdevs, tailq) {
+ ocssd_bdev = SPDK_CONTAINEROF(nvme_bdev, struct ocssd_bdev, nvme_bdev);
+
+ /* Only verify bdevs created on the same namespace */
+ if (spdk_nvme_ns_get_id(nvme_bdev->nvme_ns->ns) != nsid) {
+ continue;
+ }
+
+ /* Empty range means whole namespace should be used */
+ if (range == NULL) {
+ return false;
+ }
+
+ /* Make sure the range doesn't overlap with any other range */
+ if (range->begin <= ocssd_bdev->range.end &&
+ range->end >= ocssd_bdev->range.begin) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void
+bdev_ocssd_create_bdev(const char *ctrlr_name, const char *bdev_name, uint32_t nsid,
+ const struct bdev_ocssd_range *range, bdev_ocssd_create_cb cb_fn,
+ void *cb_arg)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct bdev_ocssd_create_ctx *create_ctx = NULL;
+ struct nvme_bdev *nvme_bdev = NULL;
+ struct ocssd_bdev *ocssd_bdev = NULL;
+ struct spdk_nvme_ns *ns;
+ struct nvme_bdev_ns *nvme_ns;
+ struct bdev_ocssd_ns *ocssd_ns;
+ struct spdk_ocssd_geometry_data *geometry;
+ int rc = 0;
+
+ nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctrlr_name);
+ if (!nvme_bdev_ctrlr) {
+ SPDK_ERRLOG("Unable to find controller %s\n", ctrlr_name);
+ rc = -ENODEV;
+ goto error;
+ }
+
+ ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, nsid);
+ if (!ns) {
+ SPDK_ERRLOG("Unable to retrieve namespace %"PRIu32"\n", nsid);
+ rc = -ENODEV;
+ goto error;
+ }
+
+ if (!spdk_nvme_ns_is_active(ns)) {
+ SPDK_ERRLOG("Namespace %"PRIu32" is inactive\n", nsid);
+ rc = -EACCES;
+ goto error;
+ }
+
+ assert(nsid <= nvme_bdev_ctrlr->num_ns);
+ nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
+ if (nvme_ns == NULL) {
+ SPDK_ERRLOG("Namespace %"PRIu32" is not initialized\n", nsid);
+ rc = -EINVAL;
+ goto error;
+ }
+
+ ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns);
+ if (ocssd_ns == NULL) {
+ SPDK_ERRLOG("Namespace %"PRIu32" is not an OCSSD namespace\n", nsid);
+ rc = -EINVAL;
+ goto error;
+ }
+
+ if (spdk_bdev_get_by_name(bdev_name) != NULL) {
+ SPDK_ERRLOG("Device with provided name (%s) already exists\n", bdev_name);
+ rc = -EEXIST;
+ goto error;
+ }
+
+ if (!bdev_ocssd_verify_range(nvme_bdev_ctrlr, nsid, range)) {
+ SPDK_ERRLOG("Invalid parallel unit range\n");
+ rc = -EINVAL;
+ goto error;
+ }
+
+ ocssd_bdev = calloc(1, sizeof(*ocssd_bdev));
+ if (!ocssd_bdev) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ create_ctx = calloc(1, sizeof(*create_ctx));
+ if (!create_ctx) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ create_ctx->ocssd_bdev = ocssd_bdev;
+ create_ctx->cb_fn = cb_fn;
+ create_ctx->cb_arg = cb_arg;
+ create_ctx->range = range;
+
+ nvme_bdev = &ocssd_bdev->nvme_bdev;
+ nvme_bdev->nvme_ns = nvme_ns;
+ nvme_bdev->nvme_bdev_ctrlr = nvme_bdev_ctrlr;
+ geometry = &ocssd_ns->geometry;
+
+ if (range != NULL) {
+ ocssd_bdev->range = *range;
+ } else {
+ ocssd_bdev->range.begin = 0;
+ ocssd_bdev->range.end = geometry->num_grp * geometry->num_pu - 1;
+ }
+
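+	/* Expose the selected parallel units as a zoned bdev: one zone per chunk,
+	 * i.e. num_punits * num_chk zones of clba blocks each */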
+ nvme_bdev->disk.name = strdup(bdev_name);
+ if (!nvme_bdev->disk.name) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ nvme_bdev->disk.product_name = "Open Channel SSD";
+ nvme_bdev->disk.ctxt = ocssd_bdev;
+ nvme_bdev->disk.fn_table = &ocssdlib_fn_table;
+ nvme_bdev->disk.module = &ocssd_if;
+ nvme_bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
+ nvme_bdev->disk.zoned = true;
+ nvme_bdev->disk.blockcnt = bdev_ocssd_num_parallel_units(ocssd_bdev) *
+ geometry->num_chk * geometry->clba;
+ nvme_bdev->disk.zone_size = geometry->clba;
+ nvme_bdev->disk.max_open_zones = geometry->maxoc;
+ nvme_bdev->disk.optimal_open_zones = bdev_ocssd_num_parallel_units(ocssd_bdev);
+ nvme_bdev->disk.write_unit_size = geometry->ws_opt;
+ nvme_bdev->disk.media_events = true;
+
+ if (geometry->maxocpu != 0 && geometry->maxocpu != geometry->maxoc) {
+ SPDK_WARNLOG("Maximum open chunks per PU is not zero. Reducing the maximum "
+ "number of open zones: %"PRIu32" -> %"PRIu32"\n",
+ geometry->maxoc, geometry->maxocpu);
+ nvme_bdev->disk.max_open_zones = geometry->maxocpu;
+ }
+
+ rc = bdev_ocssd_init_zones(create_ctx);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Failed to initialize zones on bdev %s\n", nvme_bdev->disk.name);
+ goto error;
+ }
+
+ return;
+error:
+ bdev_ocssd_free_bdev(ocssd_bdev);
+ cb_fn(NULL, rc, cb_arg);
+ free(create_ctx);
+}
+
+struct bdev_ocssd_delete_ctx {
+ bdev_ocssd_delete_cb cb_fn;
+ void *cb_arg;
+};
+
+static void
+bdev_ocssd_unregister_cb(void *cb_arg, int status)
+{
+ struct bdev_ocssd_delete_ctx *delete_ctx = cb_arg;
+
+ delete_ctx->cb_fn(status, delete_ctx->cb_arg);
+ free(delete_ctx);
+}
+
+void
+bdev_ocssd_delete_bdev(const char *bdev_name, bdev_ocssd_delete_cb cb_fn, void *cb_arg)
+{
+ struct spdk_bdev *bdev;
+ struct bdev_ocssd_delete_ctx *delete_ctx;
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (!bdev) {
+ SPDK_ERRLOG("Unable to find bdev %s\n", bdev_name);
+ cb_fn(-ENODEV, cb_arg);
+ return;
+ }
+
+ if (bdev->module != &ocssd_if) {
+ SPDK_ERRLOG("Specified bdev %s is not an OCSSD bdev\n", bdev_name);
+ cb_fn(-EINVAL, cb_arg);
+ return;
+ }
+
+ delete_ctx = calloc(1, sizeof(*delete_ctx));
+ if (!delete_ctx) {
+ SPDK_ERRLOG("Unable to allocate deletion context\n");
+ cb_fn(-ENOMEM, cb_arg);
+ return;
+ }
+
+ delete_ctx->cb_fn = cb_fn;
+ delete_ctx->cb_arg = cb_arg;
+
+ spdk_bdev_unregister(bdev, bdev_ocssd_unregister_cb, delete_ctx);
+}
+
+struct bdev_ocssd_populate_ns_ctx {
+ struct nvme_async_probe_ctx *nvme_ctx;
+ struct nvme_bdev_ns *nvme_ns;
+};
+
+static void
+bdev_ocssd_geometry_cb(void *_ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct bdev_ocssd_populate_ns_ctx *ctx = _ctx;
+ struct nvme_bdev_ns *nvme_ns = ctx->nvme_ns;
+ struct bdev_ocssd_ns *ocssd_ns = bdev_ocssd_get_ns_from_nvme(nvme_ns);
+ int rc = 0;
+
+ if (spdk_unlikely(spdk_nvme_cpl_is_error(cpl))) {
+ SPDK_ERRLOG("Failed to retrieve geometry for namespace %"PRIu32"\n", nvme_ns->id);
+ free(nvme_ns->type_ctx);
+ nvme_ns->type_ctx = NULL;
+ rc = -EIO;
+ } else {
+ ocssd_ns->lba_offsets.lbk = 0;
+ ocssd_ns->lba_offsets.chk = ocssd_ns->lba_offsets.lbk +
+ ocssd_ns->geometry.lbaf.lbk_len;
+ ocssd_ns->lba_offsets.pu = ocssd_ns->lba_offsets.chk +
+ ocssd_ns->geometry.lbaf.chk_len;
+ ocssd_ns->lba_offsets.grp = ocssd_ns->lba_offsets.pu +
+ ocssd_ns->geometry.lbaf.pu_len;
+ ocssd_ns->chunk_notify_pending = true;
+ }
+
+ nvme_ctrlr_populate_namespace_done(ctx->nvme_ctx, nvme_ns, rc);
+ free(ctx);
+}
+
+void
+bdev_ocssd_populate_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
+ struct nvme_bdev_ns *nvme_ns,
+ struct nvme_async_probe_ctx *nvme_ctx)
+{
+ struct bdev_ocssd_ns *ocssd_ns;
+ struct bdev_ocssd_populate_ns_ctx *ctx;
+ struct spdk_nvme_ns *ns;
+ int rc;
+
+ ns = spdk_nvme_ctrlr_get_ns(nvme_bdev_ctrlr->ctrlr, nvme_ns->id);
+ if (ns == NULL) {
+ nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, -EINVAL);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, -ENOMEM);
+ return;
+ }
+
+ ocssd_ns = calloc(1, sizeof(*ocssd_ns));
+ if (ocssd_ns == NULL) {
+ nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, -ENOMEM);
+ free(ctx);
+ return;
+ }
+
+ nvme_ns->type_ctx = ocssd_ns;
+ nvme_ns->ns = ns;
+ ctx->nvme_ctx = nvme_ctx;
+ ctx->nvme_ns = nvme_ns;
+
+ rc = spdk_nvme_ocssd_ctrlr_cmd_geometry(nvme_bdev_ctrlr->ctrlr, nvme_ns->id,
+ &ocssd_ns->geometry,
+ sizeof(ocssd_ns->geometry),
+ bdev_ocssd_geometry_cb, ctx);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Failed to retrieve OC geometry: %s\n", spdk_strerror(-rc));
+ nvme_ns->type_ctx = NULL;
+ nvme_ctrlr_populate_namespace_done(nvme_ctx, nvme_ns, rc);
+ free(ocssd_ns);
+ free(ctx);
+ }
+}
+
+void
+bdev_ocssd_depopulate_namespace(struct nvme_bdev_ns *ns)
+{
+ struct bdev_ocssd_ns *ocssd_ns;
+
+ ocssd_ns = bdev_ocssd_get_ns_from_nvme(ns);
+
+ /* If there are outstanding admin requests, we cannot free the context
+ * here, as they'd write over deallocated memory. Clear the populated
+	 * flag, so that the completion callback knows that the namespace is
+	 * being depopulated and can finish its deallocation once all requests
+	 * have completed.
+ */
+ ns->populated = false;
+ if (ocssd_ns->num_outstanding == 0) {
+ bdev_ocssd_free_namespace(ns);
+ }
+}
+
+int
+bdev_ocssd_create_io_channel(struct nvme_io_channel *ioch)
+{
+ struct ocssd_io_channel *ocssd_ioch;
+
+ ocssd_ioch = calloc(1, sizeof(*ocssd_ioch));
+ if (ocssd_ioch == NULL) {
+ return -ENOMEM;
+ }
+
+ ocssd_ioch->pending_poller = SPDK_POLLER_REGISTER(bdev_ocssd_poll_pending,
+ spdk_io_channel_from_ctx(ioch), 0);
+ if (ocssd_ioch->pending_poller == NULL) {
+ SPDK_ERRLOG("Failed to register pending requests poller\n");
+ free(ocssd_ioch);
+ return -ENOMEM;
+ }
+
+ /* Start the poller paused and only resume it once there are pending requests */
+ spdk_poller_pause(ocssd_ioch->pending_poller);
+
+ TAILQ_INIT(&ocssd_ioch->pending_requests);
+ ioch->ocssd_ioch = ocssd_ioch;
+
+ return 0;
+}
+
+void
+bdev_ocssd_destroy_io_channel(struct nvme_io_channel *ioch)
+{
+ spdk_poller_unregister(&ioch->ocssd_ioch->pending_poller);
+ free(ioch->ocssd_ioch);
+}
+
+int
+bdev_ocssd_init_ctrlr(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
+{
+ struct ocssd_bdev_ctrlr *ocssd_ctrlr;
+
+ ocssd_ctrlr = calloc(1, sizeof(*ocssd_ctrlr));
+ if (!ocssd_ctrlr) {
+ return -ENOMEM;
+ }
+
+ ocssd_ctrlr->mm_poller = SPDK_POLLER_REGISTER(bdev_ocssd_poll_mm, nvme_bdev_ctrlr,
+ 10000ULL);
+ if (!ocssd_ctrlr->mm_poller) {
+ free(ocssd_ctrlr);
+ return -ENOMEM;
+ }
+
+ nvme_bdev_ctrlr->ocssd_ctrlr = ocssd_ctrlr;
+
+ return 0;
+}
+
+void
+bdev_ocssd_fini_ctrlr(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
+{
+ spdk_poller_unregister(&nvme_bdev_ctrlr->ocssd_ctrlr->mm_poller);
+ free(nvme_bdev_ctrlr->ocssd_ctrlr);
+ nvme_bdev_ctrlr->ocssd_ctrlr = NULL;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_ocssd", SPDK_LOG_BDEV_OCSSD)
diff --git a/src/spdk/module/bdev/nvme/bdev_ocssd.h b/src/spdk/module/bdev/nvme/bdev_ocssd.h
new file mode 100644
index 000000000..89e5a3058
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/bdev_ocssd.h
@@ -0,0 +1,67 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_OCSSD_H
+#define SPDK_BDEV_OCSSD_H
+
+#include "spdk/stdinc.h"
+#include "common.h"
+
+struct bdev_ocssd_range {
+ uint64_t begin;
+ uint64_t end;
+};
+
+typedef void (*bdev_ocssd_create_cb)(const char *bdev_name, int status, void *ctx);
+typedef void (*bdev_ocssd_delete_cb)(int status, void *ctx);
+
+void bdev_ocssd_create_bdev(const char *ctrlr_name, const char *bdev_name, uint32_t nsid,
+ const struct bdev_ocssd_range *range,
+ bdev_ocssd_create_cb cb_fn, void *cb_arg);
+void bdev_ocssd_delete_bdev(const char *bdev_name, bdev_ocssd_delete_cb cb_fn, void *cb_arg);
+
+void bdev_ocssd_populate_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
+ struct nvme_bdev_ns *nvme_ns,
+ struct nvme_async_probe_ctx *ctx);
+void bdev_ocssd_depopulate_namespace(struct nvme_bdev_ns *ns);
+void bdev_ocssd_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *ns);
+
+int bdev_ocssd_create_io_channel(struct nvme_io_channel *ioch);
+void bdev_ocssd_destroy_io_channel(struct nvme_io_channel *ioch);
+
+int bdev_ocssd_init_ctrlr(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr);
+void bdev_ocssd_fini_ctrlr(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr);
+
+void bdev_ocssd_handle_chunk_notification(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr);
+
+#endif /* SPDK_BDEV_OCSSD_H */
diff --git a/src/spdk/module/bdev/nvme/bdev_ocssd_rpc.c b/src/spdk/module/bdev/nvme/bdev_ocssd_rpc.c
new file mode 100644
index 000000000..47c5acdb3
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/bdev_ocssd_rpc.c
@@ -0,0 +1,197 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/rpc.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/log.h"
+#include "spdk/likely.h"
+#include "bdev_ocssd.h"
+
+#define BDEV_OCSSD_DEFAULT_NSID 1
+
+struct rpc_create_ocssd_bdev {
+ char *ctrlr_name;
+ char *bdev_name;
+ uint32_t nsid;
+ char *range;
+};
+
+static const struct spdk_json_object_decoder rpc_create_ocssd_bdev_decoders[] = {
+ {"ctrlr_name", offsetof(struct rpc_create_ocssd_bdev, ctrlr_name), spdk_json_decode_string},
+ {"bdev_name", offsetof(struct rpc_create_ocssd_bdev, bdev_name), spdk_json_decode_string},
+ {"nsid", offsetof(struct rpc_create_ocssd_bdev, nsid), spdk_json_decode_uint32, true},
+ {"range", offsetof(struct rpc_create_ocssd_bdev, range), spdk_json_decode_string, true},
+};
+
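+/* Example parameters accepted by this RPC (illustrative values only):
+ *   {"ctrlr_name": "nvme0", "bdev_name": "nvme0n1_ocssd", "nsid": 1, "range": "0-15"}
+ * "nsid" defaults to 1 and "range" selects an inclusive range of parallel units.
+ */
+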
+static void
+free_rpc_create_ocssd_bdev(struct rpc_create_ocssd_bdev *rpc)
+{
+ free(rpc->ctrlr_name);
+ free(rpc->bdev_name);
+ free(rpc->range);
+}
+
+struct rpc_bdev_ocssd_create_ctx {
+ struct spdk_jsonrpc_request *request;
+ struct rpc_create_ocssd_bdev rpc;
+ struct bdev_ocssd_range range;
+};
+
+static void
+rpc_bdev_ocssd_create_done(const char *bdev_name, int status, void *_ctx)
+{
+ struct rpc_bdev_ocssd_create_ctx *ctx = _ctx;
+ struct spdk_json_write_ctx *w;
+
+ if (status != 0) {
+ spdk_jsonrpc_send_error_response(ctx->request, status, spdk_strerror(-status));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(ctx->request);
+ spdk_json_write_string(w, bdev_name);
+ spdk_jsonrpc_end_result(ctx->request, w);
+out:
+ free_rpc_create_ocssd_bdev(&ctx->rpc);
+ free(ctx);
+}
+
+static void
+rpc_bdev_ocssd_create(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
+{
+ struct rpc_bdev_ocssd_create_ctx *ctx;
+ struct bdev_ocssd_range *range = NULL;
+ int rc;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
+ return;
+ }
+
+ ctx->rpc.nsid = BDEV_OCSSD_DEFAULT_NSID;
+ ctx->request = request;
+
+ if (spdk_json_decode_object(params, rpc_create_ocssd_bdev_decoders,
+ SPDK_COUNTOF(rpc_create_ocssd_bdev_decoders),
+ &ctx->rpc)) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL, "Failed to parse the request");
+ goto out;
+ }
+
+ if (ctx->rpc.range != NULL) {
+ rc = sscanf(ctx->rpc.range, "%"PRIu64"-%"PRIu64,
+ &ctx->range.begin, &ctx->range.end);
+ if (rc != 2) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL, "Failed to parse range");
+ goto out;
+ }
+
+ range = &ctx->range;
+ }
+
+ bdev_ocssd_create_bdev(ctx->rpc.ctrlr_name, ctx->rpc.bdev_name, ctx->rpc.nsid,
+ range, rpc_bdev_ocssd_create_done, ctx);
+ return;
+out:
+ free_rpc_create_ocssd_bdev(&ctx->rpc);
+ free(ctx);
+}
+
+SPDK_RPC_REGISTER("bdev_ocssd_create", rpc_bdev_ocssd_create, SPDK_RPC_RUNTIME)
+
+struct rpc_delete_ocssd_bdev {
+ char *name;
+};
+
+static const struct spdk_json_object_decoder rpc_delete_ocssd_bdev_decoders[] = {
+ {"name", offsetof(struct rpc_delete_ocssd_bdev, name), spdk_json_decode_string},
+};
+
+static void
+free_rpc_delete_ocssd_bdev(struct rpc_delete_ocssd_bdev *rpc)
+{
+ free(rpc->name);
+}
+
+struct rpc_bdev_ocssd_delete_ctx {
+ struct spdk_jsonrpc_request *request;
+ struct rpc_delete_ocssd_bdev rpc;
+};
+
+static void
+rpc_bdev_ocssd_delete_done(int status, void *_ctx)
+{
+ struct rpc_bdev_ocssd_delete_ctx *ctx = _ctx;
+ struct spdk_json_write_ctx *w;
+
+ if (status != 0) {
+ spdk_jsonrpc_send_error_response(ctx->request, status, spdk_strerror(-status));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(ctx->request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(ctx->request, w);
+out:
+ free_rpc_delete_ocssd_bdev(&ctx->rpc);
+ free(ctx);
+}
+
+static void
+rpc_bdev_ocssd_delete(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
+{
+ struct rpc_bdev_ocssd_delete_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
+ return;
+ }
+
+ ctx->request = request;
+ if (spdk_json_decode_object(params, rpc_delete_ocssd_bdev_decoders,
+ SPDK_COUNTOF(rpc_delete_ocssd_bdev_decoders),
+ &ctx->rpc)) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL, "Failed to parse the request");
+ free_rpc_delete_ocssd_bdev(&ctx->rpc);
+ free(ctx);
+ return;
+ }
+
+ bdev_ocssd_delete_bdev(ctx->rpc.name, rpc_bdev_ocssd_delete_done, ctx);
+}
+
+SPDK_RPC_REGISTER("bdev_ocssd_delete", rpc_bdev_ocssd_delete, SPDK_RPC_RUNTIME)
diff --git a/src/spdk/module/bdev/nvme/common.c b/src/spdk/module/bdev/nvme/common.c
new file mode 100644
index 000000000..c895f1102
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/common.c
@@ -0,0 +1,204 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/env.h"
+#include "bdev_ocssd.h"
+#include "common.h"
+
+struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
+pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
+bool g_bdev_nvme_module_finish;
+
+struct nvme_bdev_ctrlr *
+nvme_bdev_ctrlr_get(const struct spdk_nvme_transport_id *trid)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+
+ TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
+ if (spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->trid) == 0) {
+ return nvme_bdev_ctrlr;
+ }
+ }
+
+ return NULL;
+}
+
+struct nvme_bdev_ctrlr *
+nvme_bdev_ctrlr_get_by_name(const char *name)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+
+ if (name == NULL) {
+ return NULL;
+ }
+
+ TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
+ if (strcmp(name, nvme_bdev_ctrlr->name) == 0) {
+ return nvme_bdev_ctrlr;
+ }
+ }
+
+ return NULL;
+}
+
+struct nvme_bdev_ctrlr *
+nvme_bdev_first_ctrlr(void)
+{
+ return TAILQ_FIRST(&g_nvme_bdev_ctrlrs);
+}
+
+struct nvme_bdev_ctrlr *
+nvme_bdev_next_ctrlr(struct nvme_bdev_ctrlr *prev)
+{
+ return TAILQ_NEXT(prev, tailq);
+}
+
+void
+nvme_bdev_dump_trid_json(struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
+{
+ const char *trtype_str;
+ const char *adrfam_str;
+
+ trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
+ if (trtype_str) {
+ spdk_json_write_named_string(w, "trtype", trtype_str);
+ }
+
+ adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
+ if (adrfam_str) {
+ spdk_json_write_named_string(w, "adrfam", adrfam_str);
+ }
+
+ if (trid->traddr[0] != '\0') {
+ spdk_json_write_named_string(w, "traddr", trid->traddr);
+ }
+
+ if (trid->trsvcid[0] != '\0') {
+ spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
+ }
+
+ if (trid->subnqn[0] != '\0') {
+ spdk_json_write_named_string(w, "subnqn", trid->subnqn);
+ }
+}
+
+static void
+nvme_bdev_unregister_cb(void *io_device)
+{
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
+ uint32_t i;
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq);
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ spdk_nvme_detach(nvme_bdev_ctrlr->ctrlr);
+ spdk_poller_unregister(&nvme_bdev_ctrlr->adminq_timer_poller);
+ free(nvme_bdev_ctrlr->name);
+ for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
+ free(nvme_bdev_ctrlr->namespaces[i]);
+ }
+ free(nvme_bdev_ctrlr->namespaces);
+ free(nvme_bdev_ctrlr->trid);
+ free(nvme_bdev_ctrlr);
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
+ spdk_bdev_module_finish_done();
+ return;
+ }
+
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+}
+
+int
+nvme_bdev_ctrlr_destruct(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
+{
+ assert(nvme_bdev_ctrlr->destruct);
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+
+ /* If we have already registered a poller, let that one take care of it. */
+ if (nvme_bdev_ctrlr->destruct_poller != NULL) {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ return SPDK_POLLER_IDLE;
+ }
+
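+	/* If a reset is in progress, retry the destruction from a poller until it finishes */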
+ if (nvme_bdev_ctrlr->resetting) {
+ nvme_bdev_ctrlr->destruct_poller =
+ SPDK_POLLER_REGISTER((spdk_poller_fn)nvme_bdev_ctrlr_destruct, nvme_bdev_ctrlr, 1000);
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ return SPDK_POLLER_BUSY;
+ }
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+
+ spdk_poller_unregister(&nvme_bdev_ctrlr->destruct_poller);
+ if (nvme_bdev_ctrlr->opal_dev) {
+ spdk_opal_dev_destruct(nvme_bdev_ctrlr->opal_dev);
+ nvme_bdev_ctrlr->opal_dev = NULL;
+ }
+
+ if (nvme_bdev_ctrlr->ocssd_ctrlr) {
+ bdev_ocssd_fini_ctrlr(nvme_bdev_ctrlr);
+ }
+
+ spdk_io_device_unregister(nvme_bdev_ctrlr, nvme_bdev_unregister_cb);
+ return SPDK_POLLER_BUSY;
+}
+
+void
+nvme_bdev_attach_bdev_to_ns(struct nvme_bdev_ns *nvme_ns, struct nvme_bdev *nvme_disk)
+{
+ nvme_ns->ctrlr->ref++;
+
+ TAILQ_INSERT_TAIL(&nvme_ns->bdevs, nvme_disk, tailq);
+}
+
+void
+nvme_bdev_detach_bdev_from_ns(struct nvme_bdev *nvme_disk)
+{
+ struct nvme_bdev_ctrlr *ctrlr = nvme_disk->nvme_ns->ctrlr;
+
+ pthread_mutex_lock(&g_bdev_nvme_mutex);
+ ctrlr->ref--;
+
+ TAILQ_REMOVE(&nvme_disk->nvme_ns->bdevs, nvme_disk, tailq);
+
+ if (ctrlr->ref == 0 && ctrlr->destruct) {
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+ nvme_bdev_ctrlr_destruct(ctrlr);
+ return;
+ }
+
+ pthread_mutex_unlock(&g_bdev_nvme_mutex);
+}
diff --git a/src/spdk/module/bdev/nvme/common.h b/src/spdk/module/bdev/nvme/common.h
new file mode 100644
index 000000000..c710507a1
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/common.h
@@ -0,0 +1,163 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_COMMON_BDEV_NVME_H
+#define SPDK_COMMON_BDEV_NVME_H
+
+#include "spdk/nvme.h"
+#include "spdk/bdev_module.h"
+#include "spdk/opal.h"
+
+TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr);
+extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs;
+extern pthread_mutex_t g_bdev_nvme_mutex;
+extern bool g_bdev_nvme_module_finish;
+
+#define NVME_MAX_CONTROLLERS 1024
+
+enum nvme_bdev_ns_type {
+ NVME_BDEV_NS_UNKNOWN = 0,
+ NVME_BDEV_NS_STANDARD = 1,
+ NVME_BDEV_NS_OCSSD = 2,
+};
+
+struct nvme_bdev_ns {
+ uint32_t id;
+ enum nvme_bdev_ns_type type;
+ /** Marks whether this data structure has its bdevs
+	 * populated for the associated namespace. It is used
+	 * to keep track of whether the populated resources need
+	 * to be managed when a newly active namespace is found,
+ * or when a namespace becomes inactive.
+ */
+ bool populated;
+ struct spdk_nvme_ns *ns;
+ struct nvme_bdev_ctrlr *ctrlr;
+ TAILQ_HEAD(, nvme_bdev) bdevs;
+ void *type_ctx;
+};
+
+struct ocssd_bdev_ctrlr;
+
+struct nvme_bdev_ctrlr {
+ /**
+	 * points to a pinned, physically contiguous memory region;
+	 * contains the 4KB IDENTIFY structure for the controller, which is
+	 * the target of the CONTROLLER IDENTIFY command during initialization
+ */
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct spdk_nvme_transport_id *trid;
+ char *name;
+ int ref;
+ bool resetting;
+ bool destruct;
+ /**
+	 * PI check flags. These flags are set only for NVMe controllers created
+	 * through the bdev_nvme_attach_controller RPC or the .INI config file.
+	 * Hot-added NVMe controllers are not included.
+ */
+ uint32_t prchk_flags;
+ uint32_t num_ns;
+ /** Array of pointers to namespaces indexed by nsid - 1 */
+ struct nvme_bdev_ns **namespaces;
+
+ struct spdk_opal_dev *opal_dev;
+
+ struct spdk_poller *adminq_timer_poller;
+ struct spdk_poller *destruct_poller;
+ struct spdk_thread *thread;
+
+ struct ocssd_bdev_ctrlr *ocssd_ctrlr;
+
+ /** linked list pointer for device list */
+ TAILQ_ENTRY(nvme_bdev_ctrlr) tailq;
+};
+
+struct nvme_bdev {
+ struct spdk_bdev disk;
+ struct nvme_bdev_ns *nvme_ns;
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ TAILQ_ENTRY(nvme_bdev) tailq;
+};
+
+struct nvme_bdev_poll_group {
+ struct spdk_nvme_poll_group *group;
+ struct spdk_poller *poller;
+ bool collect_spin_stat;
+ uint64_t spin_ticks;
+ uint64_t start_ticks;
+ uint64_t end_ticks;
+};
+
+typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc);
+
+struct nvme_async_probe_ctx {
+ struct spdk_nvme_probe_ctx *probe_ctx;
+ const char *base_name;
+ const char **names;
+ uint32_t count;
+ uint32_t prchk_flags;
+ struct spdk_poller *poller;
+ struct spdk_nvme_transport_id trid;
+ struct spdk_nvme_ctrlr_opts opts;
+ spdk_bdev_create_nvme_fn cb_fn;
+ void *cb_ctx;
+ uint32_t populates_in_progress;
+};
+
+struct ocssd_io_channel;
+
+struct nvme_io_channel {
+ struct spdk_nvme_qpair *qpair;
+ struct nvme_bdev_poll_group *group;
+ TAILQ_HEAD(, spdk_bdev_io) pending_resets;
+ struct ocssd_io_channel *ocssd_ioch;
+};
+
+void nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
+ struct nvme_bdev_ns *ns, int rc);
+void nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr);
+
+struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get(const struct spdk_nvme_transport_id *trid);
+struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name);
+struct nvme_bdev_ctrlr *nvme_bdev_first_ctrlr(void);
+struct nvme_bdev_ctrlr *nvme_bdev_next_ctrlr(struct nvme_bdev_ctrlr *prev);
+
+void nvme_bdev_dump_trid_json(struct spdk_nvme_transport_id *trid,
+ struct spdk_json_write_ctx *w);
+
+int nvme_bdev_ctrlr_destruct(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr);
+void nvme_bdev_attach_bdev_to_ns(struct nvme_bdev_ns *nvme_ns, struct nvme_bdev *nvme_disk);
+void nvme_bdev_detach_bdev_from_ns(struct nvme_bdev *nvme_disk);
+
+#endif /* SPDK_COMMON_BDEV_NVME_H */
diff --git a/src/spdk/module/bdev/nvme/nvme_rpc.c b/src/spdk/module/bdev/nvme/nvme_rpc.c
new file mode 100644
index 000000000..e6a938384
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/nvme_rpc.c
@@ -0,0 +1,492 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+#include "bdev_nvme.h"
+#include "common.h"
+#include "spdk/base64.h"
+
+enum spdk_nvme_rpc_type {
+ NVME_ADMIN_CMD = 1,
+ NVME_IO_CMD,
+};
+
+struct rpc_bdev_nvme_send_cmd_req {
+ char *name;
+ int cmd_type;
+ int data_direction;
+ uint32_t timeout_ms;
+ uint32_t data_len;
+ uint32_t md_len;
+
+ struct spdk_nvme_cmd *cmdbuf;
+ char *data;
+ char *md;
+};
+
+struct rpc_bdev_nvme_send_cmd_resp {
+ char *cpl_text;
+ char *data_text;
+ char *md_text;
+};
+
+struct rpc_bdev_nvme_send_cmd_ctx {
+ struct spdk_jsonrpc_request *jsonrpc_request;
+ struct rpc_bdev_nvme_send_cmd_req req;
+ struct rpc_bdev_nvme_send_cmd_resp resp;
+ struct nvme_bdev_ctrlr *nvme_bdev_ctrlr;
+ struct spdk_io_channel *ctrlr_io_ch;
+};
+
+static void
+free_rpc_bdev_nvme_send_cmd_ctx(struct rpc_bdev_nvme_send_cmd_ctx *ctx)
+{
+ assert(ctx != NULL);
+
+ free(ctx->req.name);
+ free(ctx->req.cmdbuf);
+ spdk_free(ctx->req.data);
+ spdk_free(ctx->req.md);
+ free(ctx->resp.cpl_text);
+ free(ctx->resp.data_text);
+ free(ctx->resp.md_text);
+ free(ctx);
+}
+
+static int
+rpc_bdev_nvme_send_cmd_resp_construct(struct rpc_bdev_nvme_send_cmd_resp *resp,
+ struct rpc_bdev_nvme_send_cmd_req *req,
+ const struct spdk_nvme_cpl *cpl)
+{
+ resp->cpl_text = malloc(spdk_base64_get_encoded_strlen(sizeof(*cpl)) + 1);
+ if (!resp->cpl_text) {
+ return -ENOMEM;
+ }
+ spdk_base64_urlsafe_encode(resp->cpl_text, cpl, sizeof(*cpl));
+
+ if (req->data_direction == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ if (req->data_len) {
+ resp->data_text = malloc(spdk_base64_get_encoded_strlen(req->data_len) + 1);
+ if (!resp->data_text) {
+ return -ENOMEM;
+ }
+ spdk_base64_urlsafe_encode(resp->data_text, req->data, req->data_len);
+ }
+ if (req->md_len) {
+ resp->md_text = malloc(spdk_base64_get_encoded_strlen(req->md_len) + 1);
+ if (!resp->md_text) {
+ return -ENOMEM;
+ }
+ spdk_base64_urlsafe_encode(resp->md_text, req->md, req->md_len);
+ }
+ }
+
+ return 0;
+}
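+/*
+ * Illustrative counterpart (not part of this module): a client decodes the
+ * "cpl" string returned by this RPC by reversing the urlsafe base64 encoding
+ * performed above, mirroring the pattern used in rpc_decode_cmdbuf() below:
+ *
+ *   size_t raw_len = spdk_base64_get_decoded_len(strlen(cpl_text));
+ *   struct spdk_nvme_cpl *cpl = malloc(raw_len);
+ *
+ *   if (cpl && spdk_base64_urlsafe_decode(cpl, &raw_len, cpl_text) == 0 &&
+ *       raw_len == sizeof(*cpl)) {
+ *       ... cpl now holds the 16-byte NVMe completion entry ...
+ *   }
+ */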
+
+static void
+rpc_bdev_nvme_send_cmd_complete(struct rpc_bdev_nvme_send_cmd_ctx *ctx,
+ const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_jsonrpc_request *request = ctx->jsonrpc_request;
+ struct spdk_json_write_ctx *w;
+ int ret;
+
+ ret = rpc_bdev_nvme_send_cmd_resp_construct(&ctx->resp, &ctx->req, cpl);
+ if (ret) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(-ret));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "cpl", ctx->resp.cpl_text);
+
+ if (ctx->resp.data_text) {
+ spdk_json_write_named_string(w, "data", ctx->resp.data_text);
+ }
+
+ if (ctx->resp.md_text) {
+ spdk_json_write_named_string(w, "metadata", ctx->resp.md_text);
+ }
+
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+
+out:
+ free_rpc_bdev_nvme_send_cmd_ctx(ctx);
+ return;
+}
+
+static void
+nvme_rpc_bdev_nvme_cb(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct rpc_bdev_nvme_send_cmd_ctx *ctx = (struct rpc_bdev_nvme_send_cmd_ctx *)ref;
+
+ if (ctx->ctrlr_io_ch) {
+ spdk_put_io_channel(ctx->ctrlr_io_ch);
+ ctx->ctrlr_io_ch = NULL;
+ }
+
+ rpc_bdev_nvme_send_cmd_complete(ctx, cpl);
+}
+
+static int
+nvme_rpc_admin_cmd_bdev_nvme(struct rpc_bdev_nvme_send_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd,
+ void *buf, uint32_t nbytes, uint32_t timeout_ms)
+{
+ struct nvme_bdev_ctrlr *_nvme_ctrlr = ctx->nvme_bdev_ctrlr;
+ int ret;
+
+ ret = spdk_nvme_ctrlr_cmd_admin_raw(_nvme_ctrlr->ctrlr, cmd, buf,
+ nbytes, nvme_rpc_bdev_nvme_cb, ctx);
+
+ return ret;
+}
+
+static int
+nvme_rpc_io_cmd_bdev_nvme(struct rpc_bdev_nvme_send_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd,
+ void *buf, uint32_t nbytes, void *md_buf, uint32_t md_len,
+ uint32_t timeout_ms)
+{
+ struct nvme_bdev_ctrlr *_nvme_ctrlr = ctx->nvme_bdev_ctrlr;
+ struct spdk_nvme_qpair *io_qpair;
+ int ret;
+
+ ctx->ctrlr_io_ch = spdk_get_io_channel(_nvme_ctrlr->ctrlr);
+ io_qpair = bdev_nvme_get_io_qpair(ctx->ctrlr_io_ch);
+
+ ret = spdk_nvme_ctrlr_cmd_io_raw_with_md(_nvme_ctrlr->ctrlr, io_qpair,
+ cmd, buf, nbytes, md_buf, nvme_rpc_bdev_nvme_cb, ctx);
+ if (ret) {
+ spdk_put_io_channel(ctx->ctrlr_io_ch);
+ }
+
+	return ret;
+}
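+/*
+ * Note: timeout_ms is carried through from the RPC request, but neither
+ * helper above applies it; the raw admin/IO command paths rely on the
+ * controller's own timeout handling.
+ */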
+
+static int
+rpc_bdev_nvme_send_cmd_exec(struct rpc_bdev_nvme_send_cmd_ctx *ctx)
+{
+ struct rpc_bdev_nvme_send_cmd_req *req = &ctx->req;
+ int ret = -EINVAL;
+
+ switch (req->cmd_type) {
+ case NVME_ADMIN_CMD:
+ ret = nvme_rpc_admin_cmd_bdev_nvme(ctx, req->cmdbuf, req->data,
+ req->data_len, req->timeout_ms);
+ break;
+ case NVME_IO_CMD:
+ ret = nvme_rpc_io_cmd_bdev_nvme(ctx, req->cmdbuf, req->data,
+ req->data_len, req->md, req->md_len, req->timeout_ms);
+ break;
+ }
+
+ return ret;
+}
+
+static int
+rpc_decode_cmd_type(const struct spdk_json_val *val, void *out)
+{
+ int *cmd_type = out;
+
+ if (spdk_json_strequal(val, "admin") == true) {
+ *cmd_type = NVME_ADMIN_CMD;
+ } else if (spdk_json_strequal(val, "io") == true) {
+ *cmd_type = NVME_IO_CMD;
+ } else {
+ SPDK_NOTICELOG("Invalid parameter value: cmd_type\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+rpc_decode_data_direction(const struct spdk_json_val *val, void *out)
+{
+ int *data_direction = out;
+
+ if (spdk_json_strequal(val, "h2c") == true) {
+ *data_direction = SPDK_NVME_DATA_HOST_TO_CONTROLLER;
+ } else if (spdk_json_strequal(val, "c2h") == true) {
+ *data_direction = SPDK_NVME_DATA_CONTROLLER_TO_HOST;
+ } else {
+ SPDK_NOTICELOG("Invalid parameter value: data_direction\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+rpc_decode_cmdbuf(const struct spdk_json_val *val, void *out)
+{
+ char *text = NULL;
+ size_t text_strlen, raw_len;
+ struct spdk_nvme_cmd *cmdbuf, **_cmdbuf = out;
+ int rc;
+
+ rc = spdk_json_decode_string(val, &text);
+ if (rc) {
+ return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL;
+ }
+
+ text_strlen = strlen(text);
+ raw_len = spdk_base64_get_decoded_len(text_strlen);
+ cmdbuf = malloc(raw_len);
+ if (!cmdbuf) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = spdk_base64_urlsafe_decode(cmdbuf, &raw_len, text);
+ if (rc) {
+ free(cmdbuf);
+ goto out;
+ }
+ if (raw_len != sizeof(*cmdbuf)) {
+ rc = -EINVAL;
+ free(cmdbuf);
+ goto out;
+ }
+
+ *_cmdbuf = cmdbuf;
+
+out:
+ free(text);
+ return rc;
+}
+
+static int
+rpc_decode_data(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out;
+ char *text = NULL;
+ size_t text_strlen;
+ int rc;
+
+ rc = spdk_json_decode_string(val, &text);
+ if (rc) {
+ return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL;
+ }
+ text_strlen = strlen(text);
+
+ if (req->data_len) {
+		/* data_len was already set while decoding the "data_len" param */
+ if (req->data_len != spdk_base64_get_decoded_len(text_strlen)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ } else {
+ req->data_len = spdk_base64_get_decoded_len(text_strlen);
+ req->data = spdk_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000,
+ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!req->data) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ rc = spdk_base64_urlsafe_decode(req->data, (size_t *)&req->data_len, text);
+
+out:
+ free(text);
+ return rc;
+}
+
+static int
+rpc_decode_data_len(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out;
+ uint32_t data_len;
+ int rc;
+
+ rc = spdk_json_decode_uint32(val, &data_len);
+ if (rc) {
+ return rc;
+ }
+
+ if (req->data_len) {
+		/* data_len was already set while decoding the "data" param */
+ if (req->data_len != data_len) {
+ rc = -EINVAL;
+ }
+ } else {
+ req->data_len = data_len;
+ req->data = spdk_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000,
+ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!req->data) {
+ rc = -ENOMEM;
+ }
+ }
+
+ return rc;
+}
+
+static int
+rpc_decode_metadata(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out;
+ char *text = NULL;
+ size_t text_strlen;
+ int rc;
+
+ rc = spdk_json_decode_string(val, &text);
+ if (rc) {
+		return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL;
+ }
+ text_strlen = strlen(text);
+
+ if (req->md_len) {
+		/* md_len was already set while decoding the "metadata_len" param */
+ if (req->md_len != spdk_base64_get_decoded_len(text_strlen)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ } else {
+ req->md_len = spdk_base64_get_decoded_len(text_strlen);
+ req->md = spdk_malloc(req->md_len, 0x1000, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!req->md) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ rc = spdk_base64_urlsafe_decode(req->md, (size_t *)&req->md_len, text);
+
+out:
+ free(text);
+ return rc;
+}
+
+static int
+rpc_decode_metadata_len(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out;
+ uint32_t md_len;
+ int rc;
+
+ rc = spdk_json_decode_uint32(val, &md_len);
+ if (rc) {
+ return rc;
+ }
+
+ if (req->md_len) {
+		/* md_len was already set while decoding the "metadata" param */
+ if (req->md_len != md_len) {
+ rc = -EINVAL;
+ }
+ } else {
+ req->md_len = md_len;
+ req->md = spdk_malloc(req->md_len, 0x1000, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!req->md) {
+ rc = -ENOMEM;
+ }
+ }
+
+ return rc;
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_send_cmd_req_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_nvme_send_cmd_req, name), spdk_json_decode_string},
+ {"cmd_type", offsetof(struct rpc_bdev_nvme_send_cmd_req, cmd_type), rpc_decode_cmd_type},
+ {"data_direction", offsetof(struct rpc_bdev_nvme_send_cmd_req, data_direction), rpc_decode_data_direction},
+ {"cmdbuf", offsetof(struct rpc_bdev_nvme_send_cmd_req, cmdbuf), rpc_decode_cmdbuf},
+ {"timeout_ms", offsetof(struct rpc_bdev_nvme_send_cmd_req, timeout_ms), spdk_json_decode_uint32, true},
+ {"data_len", 0, rpc_decode_data_len, true},
+ {"metadata_len", 0, rpc_decode_metadata_len, true},
+ {"data", 0, rpc_decode_data, true},
+ {"metadata", 0, rpc_decode_metadata, true},
+};
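+/*
+ * "data"/"data_len" (and "metadata"/"metadata_len") may appear in either
+ * order in the incoming JSON object, so the paired decoders above are
+ * symmetric: whichever runs first allocates the DMA-able buffer and records
+ * the length, and the other only cross-checks it for consistency.
+ */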
+
+static void
+rpc_bdev_nvme_send_cmd(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_nvme_send_cmd_ctx *ctx;
+ int ret, error_code;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+		SPDK_ERRLOG("Failed to allocate ctx\n");
+ error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR;
+ ret = -ENOMEM;
+ goto invalid;
+ }
+
+ if (spdk_json_decode_object(params, rpc_bdev_nvme_send_cmd_req_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_send_cmd_req_decoders),
+ &ctx->req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS;
+ ret = -EINVAL;
+ goto invalid;
+ }
+
+ ctx->nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->req.name);
+ if (ctx->nvme_bdev_ctrlr == NULL) {
+ SPDK_ERRLOG("Failed at device lookup\n");
+ error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS;
+ ret = -EINVAL;
+ goto invalid;
+ }
+
+ ctx->jsonrpc_request = request;
+
+ ret = rpc_bdev_nvme_send_cmd_exec(ctx);
+ if (ret < 0) {
+ SPDK_NOTICELOG("Failed at rpc_bdev_nvme_send_cmd_exec\n");
+ error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR;
+ goto invalid;
+ }
+
+ return;
+
+invalid:
+	spdk_jsonrpc_send_error_response(request, error_code, spdk_strerror(-ret));
+	/* ctx is NULL when the initial allocation failed */
+	if (ctx) {
+		free_rpc_bdev_nvme_send_cmd_ctx(ctx);
+	}
+	return;
+}
+SPDK_RPC_REGISTER("bdev_nvme_send_cmd", rpc_bdev_nvme_send_cmd, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_nvme_send_cmd, send_nvme_cmd)
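+/*
+ * Illustrative invocation via the standard rpc.py client (the option names
+ * here are assumptions and may differ between SPDK releases; the JSON
+ * parameter names are the ones in the decoders above):
+ *
+ *   scripts/rpc.py bdev_nvme_send_cmd -n Nvme0 -t admin -r c2h \
+ *       -c <urlsafe-base64 of a 64-byte struct spdk_nvme_cmd> --data-len 4096
+ *
+ * The reply carries the completion ("cpl") and, for c2h transfers, the
+ * returned data/metadata, all urlsafe-base64 encoded.
+ */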
diff --git a/src/spdk/module/bdev/nvme/vbdev_opal.c b/src/spdk/module/bdev/nvme/vbdev_opal.c
new file mode 100644
index 000000000..68281c92b
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/vbdev_opal.c
@@ -0,0 +1,630 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/opal.h"
+#include "spdk/bdev_module.h"
+#include "vbdev_opal.h"
+#include "spdk_internal/log.h"
+#include "spdk/string.h"
+
+/* OPAL locking range only supports operations on nsid=1 for now */
+#define NSID_SUPPORTED 1
+
+struct opal_vbdev {
+ char *name;
+ struct nvme_bdev_ctrlr *nvme_ctrlr;
+ struct spdk_opal_dev *opal_dev;
+ struct spdk_bdev_part *bdev_part;
+
+ uint8_t locking_range_id;
+ uint64_t range_start;
+ uint64_t range_length;
+ struct vbdev_opal_part_base *opal_base;
+
+ TAILQ_ENTRY(opal_vbdev) tailq;
+};
+
+static TAILQ_HEAD(, opal_vbdev) g_opal_vbdev =
+ TAILQ_HEAD_INITIALIZER(g_opal_vbdev);
+
+struct vbdev_opal_bdev_io {
+ struct spdk_io_channel *ch;
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+};
+
+struct vbdev_opal_channel {
+ struct spdk_bdev_part_channel part_ch;
+};
+
+struct vbdev_opal_part_base {
+ char *nvme_ctrlr_name;
+ struct spdk_bdev_part_base *part_base;
+ SPDK_BDEV_PART_TAILQ part_tailq;
+ TAILQ_ENTRY(vbdev_opal_part_base) tailq;
+};
+
+static TAILQ_HEAD(, vbdev_opal_part_base) g_opal_base = TAILQ_HEAD_INITIALIZER(g_opal_base);
+
+static void _vbdev_opal_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io);
+
+static void vbdev_opal_examine(struct spdk_bdev *bdev);
+
+static void
+vbdev_opal_delete(struct opal_vbdev *opal_bdev)
+{
+ TAILQ_REMOVE(&g_opal_vbdev, opal_bdev, tailq);
+ free(opal_bdev->name);
+ free(opal_bdev);
+}
+
+static void
+vbdev_opal_clear(void)
+{
+ struct opal_vbdev *opal_bdev, *tmp;
+
+ TAILQ_FOREACH_SAFE(opal_bdev, &g_opal_vbdev, tailq, tmp) {
+ vbdev_opal_delete(opal_bdev);
+ }
+}
+
+static int
+vbdev_opal_init(void)
+{
+ /* TODO */
+ return 0;
+}
+
+static void
+vbdev_opal_fini(void)
+{
+ vbdev_opal_clear();
+}
+
+static int
+vbdev_opal_get_ctx_size(void)
+{
+ return sizeof(struct vbdev_opal_bdev_io);
+}
+
+/* delete all the config of the same base bdev */
+static void
+vbdev_opal_delete_all_base_config(struct vbdev_opal_part_base *base)
+{
+ char *nvme_ctrlr_name = base->nvme_ctrlr_name;
+ struct opal_vbdev *bdev, *tmp_bdev;
+
+ TAILQ_FOREACH_SAFE(bdev, &g_opal_vbdev, tailq, tmp_bdev) {
+ if (!strcmp(nvme_ctrlr_name, bdev->nvme_ctrlr->name)) {
+ vbdev_opal_delete(bdev);
+ }
+ }
+}
+
+static int
+_vbdev_opal_destruct(void *ctx)
+{
+ struct spdk_bdev_part *part = ctx;
+
+ return spdk_bdev_part_free(part);
+}
+
+static void
+vbdev_opal_base_free(void *ctx)
+{
+ struct vbdev_opal_part_base *base = ctx;
+
+ TAILQ_REMOVE(&g_opal_base, base, tailq);
+
+ free(base->nvme_ctrlr_name);
+ free(base);
+}
+
+static void
+vbdev_opal_resubmit_io(void *arg)
+{
+ struct vbdev_opal_bdev_io *io_ctx = (struct vbdev_opal_bdev_io *)arg;
+
+ _vbdev_opal_submit_request(io_ctx->ch, io_ctx->bdev_io);
+}
+
+static void
+vbdev_opal_queue_io(struct vbdev_opal_bdev_io *io_ctx)
+{
+ struct vbdev_opal_channel *ch = spdk_io_channel_get_ctx(io_ctx->ch);
+ int rc;
+
+ io_ctx->bdev_io_wait.bdev = io_ctx->bdev_io->bdev;
+ io_ctx->bdev_io_wait.cb_fn = vbdev_opal_resubmit_io;
+ io_ctx->bdev_io_wait.cb_arg = io_ctx;
+
+ rc = spdk_bdev_queue_io_wait(io_ctx->bdev_io->bdev, ch->part_ch.base_ch, &io_ctx->bdev_io_wait);
+
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vbdev_opal_queue_io: %d\n", rc);
+ spdk_bdev_io_complete(io_ctx->bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+_vbdev_opal_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_opal_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct vbdev_opal_bdev_io *io_ctx = (struct vbdev_opal_bdev_io *)bdev_io->driver_ctx;
+ int rc;
+
+ rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_OPAL, "opal: no memory, queue io.\n");
+ io_ctx->ch = _ch;
+ io_ctx->bdev_io = bdev_io;
+ vbdev_opal_queue_io(io_ctx);
+ } else {
+ SPDK_ERRLOG("opal: error on io submission, rc=%d.\n", rc);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+static void
+vbdev_opal_io_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ _vbdev_opal_submit_request(ch, bdev_io);
+}
+
+static void
+vbdev_opal_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, vbdev_opal_io_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ default:
+ _vbdev_opal_submit_request(ch, bdev_io);
+ break;
+ }
+}
+
+struct spdk_opal_locking_range_info *
+vbdev_opal_get_info_from_bdev(const char *opal_bdev_name, const char *password)
+{
+ struct opal_vbdev *vbdev;
+ struct nvme_bdev_ctrlr *nvme_ctrlr;
+ int locking_range_id;
+ int rc;
+
+ TAILQ_FOREACH(vbdev, &g_opal_vbdev, tailq) {
+ if (strcmp(vbdev->name, opal_bdev_name) == 0) {
+ break;
+ }
+ }
+
+ if (vbdev == NULL) {
+ SPDK_ERRLOG("%s not found\n", opal_bdev_name);
+ return NULL;
+ }
+
+ nvme_ctrlr = vbdev->nvme_ctrlr;
+ if (nvme_ctrlr == NULL) {
+ SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", vbdev->name);
+ return NULL;
+ }
+
+ locking_range_id = vbdev->locking_range_id;
+ rc = spdk_opal_cmd_get_locking_range_info(nvme_ctrlr->opal_dev, password,
+ OPAL_ADMIN1, locking_range_id);
+ if (rc) {
+ SPDK_ERRLOG("Get locking range info error: %d\n", rc);
+ return NULL;
+ }
+
+ return spdk_opal_get_locking_range_info(nvme_ctrlr->opal_dev, locking_range_id);
+}
+
+static int
+vbdev_opal_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct spdk_bdev_part *part = ctx;
+ struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(part);
+ uint64_t offset = spdk_bdev_part_get_offset_blocks(part);
+
+ spdk_json_write_named_object_begin(w, "opal");
+
+ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
+ spdk_json_write_named_uint64(w, "offset_blocks", offset);
+
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static void
+vbdev_opal_base_bdev_hotremove_cb(void *_part_base)
+{
+ struct spdk_bdev_part_base *part_base = _part_base;
+ struct vbdev_opal_part_base *base = spdk_bdev_part_base_get_ctx(part_base);
+
+ spdk_bdev_part_base_hotremove(part_base, spdk_bdev_part_base_get_tailq(part_base));
+ vbdev_opal_delete_all_base_config(base);
+}
+
+static bool
+vbdev_opal_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct spdk_bdev_part *part = ctx;
+ struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(part);
+
+ return spdk_bdev_io_type_supported(base_bdev, io_type);
+}
+
+static struct spdk_bdev_fn_table opal_vbdev_fn_table = {
+ .destruct = _vbdev_opal_destruct,
+ .submit_request = vbdev_opal_submit_request,
+ .io_type_supported = vbdev_opal_io_type_supported,
+ .dump_info_json = vbdev_opal_dump_info_json,
+ .write_config_json = NULL,
+};
+
+static struct spdk_bdev_module opal_if = {
+ .name = "opal",
+ .module_init = vbdev_opal_init,
+ .module_fini = vbdev_opal_fini,
+ .get_ctx_size = vbdev_opal_get_ctx_size,
+ .examine_config = vbdev_opal_examine,
+ .config_json = NULL,
+};
+
+SPDK_BDEV_MODULE_REGISTER(opal, &opal_if)
+
+int
+vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id,
+ uint64_t range_start, uint64_t range_length, const char *password)
+{
+ int rc;
+ char *opal_vbdev_name;
+ char *base_bdev_name;
+ struct nvme_bdev_ctrlr *nvme_ctrlr;
+ struct opal_vbdev *opal_bdev;
+ struct vbdev_opal_part_base *opal_part_base = NULL;
+ struct spdk_bdev_part *part_bdev;
+ struct nvme_bdev *nvme_bdev;
+
+ if (nsid != NSID_SUPPORTED) {
+		SPDK_ERRLOG("nsid %u is not supported\n", nsid);
+ return -EINVAL;
+ }
+
+ nvme_ctrlr = nvme_bdev_ctrlr_get_by_name(nvme_ctrlr_name);
+ if (!nvme_ctrlr) {
+ SPDK_ERRLOG("get nvme ctrlr failed\n");
+ return -ENODEV;
+ }
+
+ if (!nvme_ctrlr->opal_dev) {
+ SPDK_ERRLOG("Opal not supported\n");
+ return -ENOTSUP;
+ }
+
+ opal_bdev = calloc(1, sizeof(struct opal_vbdev));
+ if (!opal_bdev) {
+ SPDK_ERRLOG("allocation for opal_bdev failed\n");
+ return -ENOMEM;
+ }
+
+ opal_bdev->locking_range_id = locking_range_id;
+ opal_bdev->range_start = range_start;
+ opal_bdev->range_length = range_length;
+
+ opal_bdev->nvme_ctrlr = nvme_ctrlr;
+ opal_bdev->opal_dev = nvme_ctrlr->opal_dev;
+
+ nvme_bdev = TAILQ_FIRST(&nvme_ctrlr->namespaces[nsid - 1]->bdevs);
+ assert(nvme_bdev != NULL);
+ base_bdev_name = nvme_bdev->disk.name;
+
+	/* traverse the base list to see if a part_base has already been created for this base bdev */
+ TAILQ_FOREACH(opal_part_base, &g_opal_base, tailq) {
+ if (!strcmp(spdk_bdev_part_base_get_bdev_name(opal_part_base->part_base), base_bdev_name)) {
+ break;
+ }
+ }
+
+	/* If there is no corresponding opal_part_base yet, create one. Each new
+	   part_base gets its own tailq that stores all of the parts built on it */
+ if (opal_part_base == NULL) {
+ opal_part_base = calloc(1, sizeof(*opal_part_base));
+ if (opal_part_base == NULL) {
+ SPDK_ERRLOG("Could not allocate opal_part_base\n");
+ free(opal_bdev);
+ return -ENOMEM;
+ }
+ TAILQ_INIT(&opal_part_base->part_tailq);
+
+ opal_part_base->part_base = spdk_bdev_part_base_construct(spdk_bdev_get_by_name(base_bdev_name),
+ vbdev_opal_base_bdev_hotremove_cb, &opal_if,
+ &opal_vbdev_fn_table, &opal_part_base->part_tailq, vbdev_opal_base_free,
+ opal_part_base, sizeof(struct vbdev_opal_channel), NULL, NULL);
+ if (opal_part_base->part_base == NULL) {
+ SPDK_ERRLOG("Could not allocate part_base\n");
+ free(opal_bdev);
+ free(opal_part_base);
+ return -ENOMEM;
+ }
+ opal_part_base->nvme_ctrlr_name = strdup(nvme_ctrlr_name);
+ if (opal_part_base->nvme_ctrlr_name == NULL) {
+ free(opal_bdev);
+ spdk_bdev_part_base_free(opal_part_base->part_base);
+ return -ENOMEM;
+ }
+
+ TAILQ_INSERT_TAIL(&g_opal_base, opal_part_base, tailq);
+ }
+ assert(opal_part_base != NULL);
+ opal_bdev->opal_base = opal_part_base;
+
+ part_bdev = calloc(1, sizeof(struct spdk_bdev_part));
+ if (!part_bdev) {
+ SPDK_ERRLOG("Could not allocate part_bdev\n");
+ free(opal_bdev);
+ return -ENOMEM;
+ }
+
+ TAILQ_INSERT_TAIL(&g_opal_vbdev, opal_bdev, tailq);
+ opal_vbdev_name = spdk_sprintf_alloc("%sr%" PRIu8, base_bdev_name,
+ opal_bdev->locking_range_id); /* e.g.: nvme0n1r1 */
+ if (opal_vbdev_name == NULL) {
+ SPDK_ERRLOG("Could not allocate opal_vbdev_name\n");
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ opal_bdev->name = opal_vbdev_name;
+ rc = spdk_opal_cmd_setup_locking_range(opal_bdev->opal_dev, OPAL_ADMIN1,
+ opal_bdev->locking_range_id, opal_bdev->range_start,
+ opal_bdev->range_length, password);
+ if (rc) {
+		SPDK_ERRLOG("Failed to set up locking range for %s\n", opal_vbdev_name);
+ goto err;
+ }
+
+ rc = spdk_bdev_part_construct(part_bdev, opal_bdev->opal_base->part_base, opal_vbdev_name,
+ opal_bdev->range_start, opal_bdev->range_length, "Opal locking range");
+ if (rc) {
+ SPDK_ERRLOG("Could not allocate bdev part\n");
+ goto err;
+ }
+
+ /* lock this bdev initially */
+ rc = spdk_opal_cmd_lock_unlock(opal_bdev->opal_dev, OPAL_ADMIN1, OPAL_RWLOCK, locking_range_id,
+ password);
+ if (rc) {
+		SPDK_ERRLOG("Failed to lock %s\n", opal_vbdev_name);
+ goto err;
+ }
+
+ opal_bdev->bdev_part = part_bdev;
+ return 0;
+
+err:
+ vbdev_opal_delete(opal_bdev);
+ free(part_bdev);
+ return rc;
+}
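+/*
+ * A minimal usage sketch (hypothetical names): setting up a 4096-block
+ * locking range starting at LBA 0 on controller "Nvme0" produces, assuming
+ * the namespace bdev is named "Nvme0n1", a vbdev named "Nvme0n1r1":
+ *
+ *   int rc = vbdev_opal_create("Nvme0", 1, 1, 0, 4096, "opal-password");
+ */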
+
+static void
+vbdev_opal_destruct_bdev(struct opal_vbdev *opal_bdev)
+{
+ struct spdk_bdev_part *part = opal_bdev->bdev_part;
+
+ assert(opal_bdev->opal_base != NULL);
+ assert(part != NULL);
+
+ if (opal_bdev->range_start == spdk_bdev_part_get_offset_blocks(part)) {
+ spdk_bdev_unregister(spdk_bdev_part_get_bdev(part), NULL, NULL);
+ }
+ vbdev_opal_delete(opal_bdev);
+}
+
+int
+vbdev_opal_destruct(const char *bdev_name, const char *password)
+{
+ struct nvme_bdev_ctrlr *nvme_ctrlr;
+ int locking_range_id;
+ int rc;
+ struct opal_vbdev *opal_bdev;
+
+ TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) {
+ if (strcmp(opal_bdev->name, bdev_name) == 0) {
+ break;
+ }
+ }
+
+ if (opal_bdev == NULL) {
+ SPDK_ERRLOG("%s not found\n", bdev_name);
+ rc = -ENODEV;
+ goto err;
+ }
+
+ locking_range_id = opal_bdev->locking_range_id;
+
+ nvme_ctrlr = opal_bdev->nvme_ctrlr;
+ if (nvme_ctrlr == NULL) {
+ SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", bdev_name);
+ return -ENODEV;
+ }
+
+ /* secure erase locking range */
+ rc = spdk_opal_cmd_secure_erase_locking_range(nvme_ctrlr->opal_dev, OPAL_ADMIN1, locking_range_id,
+ password);
+ if (rc) {
+ SPDK_ERRLOG("opal erase locking range failed\n");
+ goto err;
+ }
+
+ /* reset the locking range to 0 */
+ rc = spdk_opal_cmd_setup_locking_range(nvme_ctrlr->opal_dev, OPAL_ADMIN1, locking_range_id, 0,
+ 0, password);
+ if (rc) {
+ SPDK_ERRLOG("opal reset locking range failed\n");
+ goto err;
+ }
+
+ spdk_opal_free_locking_range_info(opal_bdev->opal_dev, locking_range_id);
+ vbdev_opal_destruct_bdev(opal_bdev);
+ return 0;
+
+err:
+ return rc;
+}
+
+static void
+vbdev_opal_examine(struct spdk_bdev *bdev)
+{
+ /* TODO */
+ spdk_bdev_module_examine_done(&opal_if);
+}
+
+int
+vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *password,
+ const char *lock_state)
+{
+ struct nvme_bdev_ctrlr *nvme_ctrlr;
+ int locking_range_id;
+ int rc;
+ enum spdk_opal_lock_state state_flag;
+ struct opal_vbdev *opal_bdev;
+
+ TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) {
+ if (strcmp(opal_bdev->name, bdev_name) == 0) {
+ break;
+ }
+ }
+
+ if (opal_bdev == NULL) {
+ SPDK_ERRLOG("%s not found\n", bdev_name);
+ return -ENODEV;
+ }
+
+ nvme_ctrlr = opal_bdev->nvme_ctrlr;
+ if (nvme_ctrlr == NULL) {
+ SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", opal_bdev->name);
+ return -ENODEV;
+ }
+
+ if (strcasecmp(lock_state, "READWRITE") == 0) {
+ state_flag = OPAL_READWRITE;
+ } else if (strcasecmp(lock_state, "READONLY") == 0) {
+ state_flag = OPAL_READONLY;
+ } else if (strcasecmp(lock_state, "RWLOCK") == 0) {
+ state_flag = OPAL_RWLOCK;
+ } else {
+ SPDK_ERRLOG("Invalid OPAL lock state input\n");
+ return -EINVAL;
+ }
+
+ locking_range_id = opal_bdev->locking_range_id;
+ rc = spdk_opal_cmd_lock_unlock(nvme_ctrlr->opal_dev, user_id, state_flag, locking_range_id,
+ password);
+ if (rc) {
+ SPDK_ERRLOG("%s lock/unlock failure: %d\n", bdev_name, rc);
+ }
+
+ return rc;
+}
+
+int
+vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password, uint16_t user_id,
+ const char *user_password)
+{
+ struct nvme_bdev_ctrlr *nvme_ctrlr;
+ int locking_range_id;
+ int rc;
+ struct opal_vbdev *opal_bdev;
+
+ TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) {
+ if (strcmp(opal_bdev->name, bdev_name) == 0) {
+ break;
+ }
+ }
+
+ if (opal_bdev == NULL) {
+ SPDK_ERRLOG("%s not found\n", bdev_name);
+ return -ENODEV;
+ }
+
+ nvme_ctrlr = opal_bdev->nvme_ctrlr;
+ if (nvme_ctrlr == NULL) {
+ SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", opal_bdev->name);
+ return -ENODEV;
+ }
+
+ rc = spdk_opal_cmd_enable_user(nvme_ctrlr->opal_dev, user_id, admin_password);
+ if (rc) {
+ SPDK_ERRLOG("%s enable user error: %d\n", bdev_name, rc);
+ return rc;
+ }
+
+ rc = spdk_opal_cmd_set_new_passwd(nvme_ctrlr->opal_dev, user_id, user_password, admin_password,
+ true);
+ if (rc) {
+ SPDK_ERRLOG("%s set user password error: %d\n", bdev_name, rc);
+ return rc;
+ }
+
+ locking_range_id = opal_bdev->locking_range_id;
+ rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, user_id, locking_range_id,
+ OPAL_READONLY, admin_password);
+ if (rc) {
+ SPDK_ERRLOG("%s add user READONLY priority error: %d\n", bdev_name, rc);
+ return rc;
+ }
+
+ rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, user_id, locking_range_id,
+ OPAL_READWRITE, admin_password);
+ if (rc) {
+ SPDK_ERRLOG("%s add user READWRITE priority error: %d\n", bdev_name, rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_opal", SPDK_LOG_VBDEV_OPAL)
diff --git a/src/spdk/module/bdev/nvme/vbdev_opal.h b/src/spdk/module/bdev/nvme/vbdev_opal.h
new file mode 100644
index 000000000..0b2fd731f
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/vbdev_opal.h
@@ -0,0 +1,54 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef SPDK_VBDEV_OPAL_H
+#define SPDK_VBDEV_OPAL_H
+
+#include "spdk/bdev_module.h"
+#include "bdev_nvme.h"
+#include "common.h"
+
+int vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id,
+ uint64_t range_start, uint64_t range_length, const char *password);
+
+struct spdk_opal_locking_range_info *vbdev_opal_get_info_from_bdev(const char *opal_bdev_name,
+ const char *password);
+
+int vbdev_opal_destruct(const char *bdev_name, const char *password);
+
+int vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password,
+ uint16_t user_id, const char *user_password);
+
+int vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *password,
+ const char *lock_state);
+
+#endif /* SPDK_VBDEV_OPAL_H */
diff --git a/src/spdk/module/bdev/nvme/vbdev_opal_rpc.c b/src/spdk/module/bdev/nvme/vbdev_opal_rpc.c
new file mode 100644
index 000000000..ee270ef35
--- /dev/null
+++ b/src/spdk/module/bdev/nvme/vbdev_opal_rpc.c
@@ -0,0 +1,453 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+#include "vbdev_opal.h"
+
+struct rpc_bdev_nvme_opal_init {
+ char *nvme_ctrlr_name;
+ char *password;
+};
+
+static void
+free_rpc_bdev_nvme_opal_init(struct rpc_bdev_nvme_opal_init *req)
+{
+ free(req->nvme_ctrlr_name);
+ free(req->password);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_opal_init_decoders[] = {
+ {"nvme_ctrlr_name", offsetof(struct rpc_bdev_nvme_opal_init, nvme_ctrlr_name), spdk_json_decode_string},
+ {"password", offsetof(struct rpc_bdev_nvme_opal_init, password), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_nvme_opal_init(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_nvme_opal_init req = {};
+ struct spdk_json_write_ctx *w;
+ struct nvme_bdev_ctrlr *nvme_ctrlr;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_nvme_opal_init_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_opal_init_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ /* check if opal supported */
+ nvme_ctrlr = nvme_bdev_ctrlr_get_by_name(req.nvme_ctrlr_name);
+ if (nvme_ctrlr == NULL || nvme_ctrlr->opal_dev == NULL) {
+		SPDK_ERRLOG("%s does not support OPAL\n", req.nvme_ctrlr_name);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ /* take ownership */
+ rc = spdk_opal_cmd_take_ownership(nvme_ctrlr->opal_dev, req.password);
+ if (rc) {
+ SPDK_ERRLOG("Take ownership failure: %d\n", rc);
+ switch (rc) {
+ case -EBUSY:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "SP Busy, try again later");
+ break;
+ case -EACCES:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "This drive is already enabled");
+ break;
+ default:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ }
+ goto out;
+ }
+
+ /* activate locking SP */
+ rc = spdk_opal_cmd_activate_locking_sp(nvme_ctrlr->opal_dev, req.password);
+ if (rc) {
+ SPDK_ERRLOG("Activate locking SP failure: %d\n", rc);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+out:
+ free_rpc_bdev_nvme_opal_init(&req);
+}
+SPDK_RPC_REGISTER("bdev_nvme_opal_init", rpc_bdev_nvme_opal_init, SPDK_RPC_RUNTIME)
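+/*
+ * Illustrative request (parameter names match the decoders above; values
+ * are examples only):
+ *
+ * {
+ *   "method": "bdev_nvme_opal_init",
+ *   "params": { "nvme_ctrlr_name": "Nvme0", "password": "opal-password" }
+ * }
+ */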
+
+struct rpc_bdev_nvme_opal_revert {
+ char *nvme_ctrlr_name;
+ char *password;
+};
+
+static void
+free_rpc_bdev_nvme_opal_revert(struct rpc_bdev_nvme_opal_revert *req)
+{
+ free(req->nvme_ctrlr_name);
+ free(req->password);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_nvme_opal_revert_decoders[] = {
+ {"nvme_ctrlr_name", offsetof(struct rpc_bdev_nvme_opal_revert, nvme_ctrlr_name), spdk_json_decode_string},
+ {"password", offsetof(struct rpc_bdev_nvme_opal_revert, password), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_nvme_opal_revert(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_nvme_opal_revert req = {};
+ struct spdk_json_write_ctx *w;
+ struct nvme_bdev_ctrlr *nvme_ctrlr;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_nvme_opal_revert_decoders,
+ SPDK_COUNTOF(rpc_bdev_nvme_opal_revert_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ /* check if opal supported */
+ nvme_ctrlr = nvme_bdev_ctrlr_get_by_name(req.nvme_ctrlr_name);
+ if (nvme_ctrlr == NULL || nvme_ctrlr->opal_dev == NULL) {
+		SPDK_ERRLOG("%s does not support OPAL\n", req.nvme_ctrlr_name);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ /* TODO: delete all opal vbdev before revert TPer */
+
+ rc = spdk_opal_cmd_revert_tper(nvme_ctrlr->opal_dev, req.password);
+ if (rc) {
+ SPDK_ERRLOG("Revert TPer failure: %d\n", rc);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+out:
+ free_rpc_bdev_nvme_opal_revert(&req);
+}
+SPDK_RPC_REGISTER("bdev_nvme_opal_revert", rpc_bdev_nvme_opal_revert, SPDK_RPC_RUNTIME)
+
+struct rpc_bdev_opal_create {
+ char *nvme_ctrlr_name;
+ uint32_t nsid;
+ uint16_t locking_range_id;
+ uint64_t range_start;
+ uint64_t range_length;
+ char *password;
+};
+
+static void
+free_rpc_bdev_opal_create(struct rpc_bdev_opal_create *req)
+{
+ free(req->nvme_ctrlr_name);
+ free(req->password);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_opal_create_decoders[] = {
+ {"nvme_ctrlr_name", offsetof(struct rpc_bdev_opal_create, nvme_ctrlr_name), spdk_json_decode_string},
+ {"nsid", offsetof(struct rpc_bdev_opal_create, nsid), spdk_json_decode_uint32},
+ {"locking_range_id", offsetof(struct rpc_bdev_opal_create, locking_range_id), spdk_json_decode_uint16},
+ {"range_start", offsetof(struct rpc_bdev_opal_create, range_start), spdk_json_decode_uint64},
+ {"range_length", offsetof(struct rpc_bdev_opal_create, range_length), spdk_json_decode_uint64},
+ {"password", offsetof(struct rpc_bdev_opal_create, password), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_opal_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_opal_create req = {};
+ struct spdk_json_write_ctx *w;
+ char *opal_bdev_name;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_opal_create_decoders,
+ SPDK_COUNTOF(rpc_bdev_opal_create_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ rc = vbdev_opal_create(req.nvme_ctrlr_name, req.nsid, req.locking_range_id, req.range_start,
+ req.range_length, req.password);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Failed to create opal vbdev from '%s': %s",
+ req.nvme_ctrlr_name, spdk_strerror(-rc));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ opal_bdev_name = spdk_sprintf_alloc("%sn%dr%d", req.nvme_ctrlr_name, req.nsid,
+ req.locking_range_id);
+ spdk_json_write_string(w, opal_bdev_name);
+ spdk_jsonrpc_end_result(request, w);
+ free(opal_bdev_name);
+
+out:
+ free_rpc_bdev_opal_create(&req);
+}
+SPDK_RPC_REGISTER("bdev_opal_create", rpc_bdev_opal_create, SPDK_RPC_RUNTIME)
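+/*
+ * Illustrative request (parameter names match the decoders above; values
+ * are examples only). On success the result is the new vbdev name, e.g.
+ * "Nvme0n1r1":
+ *
+ * {
+ *   "method": "bdev_opal_create",
+ *   "params": {
+ *     "nvme_ctrlr_name": "Nvme0",
+ *     "nsid": 1,
+ *     "locking_range_id": 1,
+ *     "range_start": 0,
+ *     "range_length": 4096,
+ *     "password": "opal-password"
+ *   }
+ * }
+ */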
+
+struct rpc_bdev_opal_get_info {
+ char *bdev_name;
+ char *password;
+};
+
+static void
+free_rpc_bdev_opal_get_info(struct rpc_bdev_opal_get_info *req)
+{
+ free(req->bdev_name);
+ free(req->password);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_opal_get_info_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_bdev_opal_get_info, bdev_name), spdk_json_decode_string},
+ {"password", offsetof(struct rpc_bdev_opal_get_info, password), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_opal_get_info(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_opal_get_info req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_opal_locking_range_info *info;
+
+ if (spdk_json_decode_object(params, rpc_bdev_opal_get_info_decoders,
+ SPDK_COUNTOF(rpc_bdev_opal_get_info_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ info = vbdev_opal_get_info_from_bdev(req.bdev_name, req.password);
+ if (info == NULL) {
+ SPDK_ERRLOG("Get opal info failure\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "name", req.bdev_name);
+ spdk_json_write_named_uint64(w, "range_start", info->range_start);
+ spdk_json_write_named_uint64(w, "range_length", info->range_length);
+ spdk_json_write_named_bool(w, "read_lock_enabled", info->read_lock_enabled);
+ spdk_json_write_named_bool(w, "write_lock_enabled", info->write_lock_enabled);
+ spdk_json_write_named_bool(w, "read_locked", info->read_locked);
+ spdk_json_write_named_bool(w, "write_locked", info->write_locked);
+
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+
+out:
+ free_rpc_bdev_opal_get_info(&req);
+}
+SPDK_RPC_REGISTER("bdev_opal_get_info", rpc_bdev_opal_get_info, SPDK_RPC_RUNTIME)
+
+struct rpc_bdev_opal_delete {
+ char *bdev_name;
+ char *password;
+};
+
+static void
+free_rpc_bdev_opal_delete(struct rpc_bdev_opal_delete *req)
+{
+ free(req->bdev_name);
+ free(req->password);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_opal_delete_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_bdev_opal_delete, bdev_name), spdk_json_decode_string},
+ {"password", offsetof(struct rpc_bdev_opal_delete, password), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_opal_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_opal_delete req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_opal_delete_decoders,
+ SPDK_COUNTOF(rpc_bdev_opal_delete_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ rc = vbdev_opal_destruct(req.bdev_name, req.password);
+ if (rc < 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+out:
+ free_rpc_bdev_opal_delete(&req);
+}
+SPDK_RPC_REGISTER("bdev_opal_delete", rpc_bdev_opal_delete, SPDK_RPC_RUNTIME)
+
+struct rpc_bdev_opal_set_lock_state {
+ char *bdev_name;
+ uint16_t user_id;
+ char *password;
+ char *lock_state;
+};
+
+static void
+free_rpc_bdev_opal_set_lock_state(struct rpc_bdev_opal_set_lock_state *req)
+{
+ free(req->bdev_name);
+ free(req->password);
+ free(req->lock_state);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_opal_set_lock_state_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_bdev_opal_set_lock_state, bdev_name), spdk_json_decode_string},
+ {"user_id", offsetof(struct rpc_bdev_opal_set_lock_state, user_id), spdk_json_decode_uint16},
+ {"password", offsetof(struct rpc_bdev_opal_set_lock_state, password), spdk_json_decode_string},
+ {"lock_state", offsetof(struct rpc_bdev_opal_set_lock_state, lock_state), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_opal_set_lock_state(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_opal_set_lock_state req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_opal_set_lock_state_decoders,
+ SPDK_COUNTOF(rpc_bdev_opal_set_lock_state_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ rc = vbdev_opal_set_lock_state(req.bdev_name, req.user_id, req.password, req.lock_state);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+out:
+ free_rpc_bdev_opal_set_lock_state(&req);
+}
+SPDK_RPC_REGISTER("bdev_opal_set_lock_state", rpc_bdev_opal_set_lock_state, SPDK_RPC_RUNTIME)
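+/* "lock_state" accepts "READWRITE", "READONLY" or "RWLOCK"
+ * (case-insensitive), as parsed by vbdev_opal_set_lock_state(). */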
+
+struct rpc_bdev_opal_new_user {
+ char *bdev_name;
+ char *admin_password;
+ uint16_t user_id;
+ char *user_password;
+};
+
+static void
+free_rpc_bdev_opal_new_user(struct rpc_bdev_opal_new_user *req)
+{
+ free(req->bdev_name);
+ free(req->admin_password);
+ free(req->user_password);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_opal_new_user_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_bdev_opal_new_user, bdev_name), spdk_json_decode_string},
+ {"admin_password", offsetof(struct rpc_bdev_opal_new_user, admin_password), spdk_json_decode_string},
+ {"user_id", offsetof(struct rpc_bdev_opal_new_user, user_id), spdk_json_decode_uint16},
+ {"user_password", offsetof(struct rpc_bdev_opal_new_user, user_password), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_opal_new_user(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_opal_new_user req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_opal_new_user_decoders,
+ SPDK_COUNTOF(rpc_bdev_opal_new_user_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ rc = vbdev_opal_enable_new_user(req.bdev_name, req.admin_password, req.user_id,
+ req.user_password);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+out:
+ free_rpc_bdev_opal_new_user(&req);
+}
+SPDK_RPC_REGISTER("bdev_opal_new_user", rpc_bdev_opal_new_user, SPDK_RPC_RUNTIME)
diff --git a/src/spdk/module/bdev/ocf/Makefile b/src/spdk/module/bdev/ocf/Makefile
new file mode 100644
index 000000000..b931de106
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/Makefile
@@ -0,0 +1,52 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += $(ENV_CFLAGS) -I$(SPDK_ROOT_DIR)/lib/env_ocf -I$(SPDK_ROOT_DIR)/lib/env_ocf/include
+C_SRCS = $(shell ls *.c)
+
+LIBNAME := bdev_ocf
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
+
+OCF_ENV := $(call spdk_lib_list_to_static_libs,ocfenv)
+
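+# The extra prerequisite below ensures the OCF environment adaptation layer
+# (lib/env_ocf) is built before this library.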
+$(LIB) : $(OCF_ENV)
diff --git a/src/spdk/module/bdev/ocf/ctx.c b/src/spdk/module/bdev/ocf/ctx.c
new file mode 100644
index 000000000..5bf4c8fee
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/ctx.c
@@ -0,0 +1,565 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <ocf/ocf.h>
+#include <execinfo.h>
+
+#include "spdk/env.h"
+#include "spdk_internal/log.h"
+
+#include "ctx.h"
+#include "ocf_env.h"
+#include "data.h"
+
+ocf_ctx_t vbdev_ocf_ctx;
+
+static ctx_data_t *
+vbdev_ocf_ctx_data_alloc(uint32_t pages)
+{
+ struct bdev_ocf_data *data;
+ void *buf;
+ uint32_t sz;
+
+	data = vbdev_ocf_data_alloc(1);
+	if (data == NULL) {
+		return NULL;
+	}
+
+	sz = pages * PAGE_SIZE;
+	buf = spdk_malloc(sz, PAGE_SIZE, NULL,
+			  SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+	if (buf == NULL) {
+		/* avoid leaking the iov container allocated above */
+		vbdev_ocf_data_free(data);
+		return NULL;
+	}
+
+ vbdev_ocf_iovs_add(data, buf, sz);
+
+ data->size = sz;
+
+ return data;
+}
+
+static void
+vbdev_ocf_ctx_data_free(ctx_data_t *ctx_data)
+{
+ struct bdev_ocf_data *data = ctx_data;
+ int i;
+
+ if (!data) {
+ return;
+ }
+
+ for (i = 0; i < data->iovcnt; i++) {
+ spdk_free(data->iovs[i].iov_base);
+ }
+
+ vbdev_ocf_data_free(data);
+}
+
+static int
+vbdev_ocf_ctx_data_mlock(ctx_data_t *ctx_data)
+{
+ /* TODO [mlock]: add mlock option */
+ return 0;
+}
+
+static void
+vbdev_ocf_ctx_data_munlock(ctx_data_t *ctx_data)
+{
+ /* TODO [mlock]: add mlock option */
+}
+
+static size_t
+iovec_flatten(struct iovec *iov, size_t iovcnt, void *buf, size_t size, size_t offset)
+{
+ size_t i, len, done = 0;
+
+ for (i = 0; i < iovcnt; i++) {
+ if (offset >= iov[i].iov_len) {
+ offset -= iov[i].iov_len;
+ continue;
+ }
+
+ if (iov[i].iov_base == NULL) {
+ continue;
+ }
+
+ if (done >= size) {
+ break;
+ }
+
+ len = MIN(size - done, iov[i].iov_len - offset);
+ memcpy(buf, iov[i].iov_base + offset, len);
+ buf += len;
+ done += len;
+ offset = 0;
+ }
+
+ return done;
+}
+
+static uint32_t
+vbdev_ocf_ctx_data_rd(void *dst, ctx_data_t *src, uint32_t size)
+{
+ struct bdev_ocf_data *s = src;
+ uint32_t size_local;
+
+ size_local = iovec_flatten(s->iovs, s->iovcnt, dst, size, s->seek);
+ s->seek += size_local;
+
+ return size_local;
+}
+
+static size_t
+buf_to_iovec(const void *buf, size_t size, struct iovec *iov, size_t iovcnt, size_t offset)
+{
+ size_t i, len, done = 0;
+
+ for (i = 0; i < iovcnt; i++) {
+ if (offset >= iov[i].iov_len) {
+ offset -= iov[i].iov_len;
+ continue;
+ }
+
+ if (iov[i].iov_base == NULL) {
+ continue;
+ }
+
+ if (done >= size) {
+ break;
+ }
+
+ len = MIN(size - done, iov[i].iov_len - offset);
+ memcpy(iov[i].iov_base + offset, buf, len);
+ buf += len;
+ done += len;
+ offset = 0;
+ }
+
+ return done;
+}
+
+static uint32_t
+vbdev_ocf_ctx_data_wr(ctx_data_t *dst, const void *src, uint32_t size)
+{
+ struct bdev_ocf_data *d = dst;
+ uint32_t size_local;
+
+ size_local = buf_to_iovec(src, size, d->iovs, d->iovcnt, d->seek);
+ d->seek += size_local;
+
+ return size_local;
+}
+
+static size_t
+iovset(struct iovec *iov, size_t iovcnt, int byte, size_t size, size_t offset)
+{
+ size_t i, len, done = 0;
+
+ for (i = 0; i < iovcnt; i++) {
+ if (offset >= iov[i].iov_len) {
+ offset -= iov[i].iov_len;
+ continue;
+ }
+
+ if (iov[i].iov_base == NULL) {
+ continue;
+ }
+
+ if (done >= size) {
+ break;
+ }
+
+ len = MIN(size - done, iov[i].iov_len - offset);
+ memset(iov[i].iov_base + offset, byte, len);
+ done += len;
+ offset = 0;
+ }
+
+ return done;
+}
+
+static uint32_t
+vbdev_ocf_ctx_data_zero(ctx_data_t *dst, uint32_t size)
+{
+ struct bdev_ocf_data *d = dst;
+ uint32_t size_local;
+
+ size_local = iovset(d->iovs, d->iovcnt, 0, size, d->seek);
+ d->seek += size_local;
+
+ return size_local;
+}
+
+static uint32_t
+vbdev_ocf_ctx_data_seek(ctx_data_t *dst, ctx_data_seek_t seek, uint32_t offset)
+{
+ struct bdev_ocf_data *d = dst;
+ uint32_t off = 0;
+
+ switch (seek) {
+ case ctx_data_seek_begin:
+ off = MIN(offset, d->size);
+ d->seek = off;
+ break;
+ case ctx_data_seek_current:
+ off = MIN(offset, d->size - d->seek);
+ d->seek += off;
+ break;
+ }
+
+ return off;
+}
+
+static uint64_t
+vbdev_ocf_ctx_data_cpy(ctx_data_t *dst, ctx_data_t *src, uint64_t to,
+ uint64_t from, uint64_t bytes)
+{
+ struct bdev_ocf_data *s = src;
+ struct bdev_ocf_data *d = dst;
+ uint32_t it_iov = 0;
+ uint32_t it_off = 0;
+ uint32_t n, sz;
+
+ bytes = MIN(bytes, s->size - from);
+ bytes = MIN(bytes, d->size - to);
+ sz = bytes;
+
+ while (from || bytes) {
+ if (s->iovs[it_iov].iov_len == it_off) {
+ it_iov++;
+ it_off = 0;
+ continue;
+ }
+
+ if (from) {
+ n = MIN(from, s->iovs[it_iov].iov_len);
+ from -= n;
+ } else {
+ n = MIN(bytes, s->iovs[it_iov].iov_len);
+ buf_to_iovec(s->iovs[it_iov].iov_base + it_off, n, d->iovs, d->iovcnt, to);
+ bytes -= n;
+ to += n;
+ }
+
+ it_off += n;
+ }
+
+ return sz;
+}
+
+static void
+vbdev_ocf_ctx_data_secure_erase(ctx_data_t *ctx_data)
+{
+ struct bdev_ocf_data *data = ctx_data;
+ struct iovec *iovs = data->iovs;
+ int i;
+
+ for (i = 0; i < data->iovcnt; i++) {
+ if (env_memset(iovs[i].iov_base, iovs[i].iov_len, 0)) {
+ assert(false);
+ }
+ }
+}
+
+int vbdev_ocf_queue_create(ocf_cache_t cache, ocf_queue_t *queue, const struct ocf_queue_ops *ops)
+{
+ int rc;
+ struct vbdev_ocf_cache_ctx *ctx = ocf_cache_get_priv(cache);
+
+ pthread_mutex_lock(&ctx->lock);
+ rc = ocf_queue_create(cache, queue, ops);
+ pthread_mutex_unlock(&ctx->lock);
+ return rc;
+}
+
+void vbdev_ocf_queue_put(ocf_queue_t queue)
+{
+ ocf_cache_t cache = ocf_queue_get_cache(queue);
+ struct vbdev_ocf_cache_ctx *ctx = ocf_cache_get_priv(cache);
+
+ pthread_mutex_lock(&ctx->lock);
+ ocf_queue_put(queue);
+ pthread_mutex_unlock(&ctx->lock);
+}
+
+void vbdev_ocf_cache_ctx_put(struct vbdev_ocf_cache_ctx *ctx)
+{
+ if (env_atomic_dec_return(&ctx->refcnt) == 0) {
+ pthread_mutex_destroy(&ctx->lock);
+ free(ctx);
+ }
+}
+
+void vbdev_ocf_cache_ctx_get(struct vbdev_ocf_cache_ctx *ctx)
+{
+ env_atomic_inc(&ctx->refcnt);
+}
+
+struct cleaner_priv {
+ struct spdk_poller *poller;
+ ocf_queue_t queue;
+ uint64_t next_run;
+};
+
+static int
+cleaner_poll(void *arg)
+{
+ ocf_cleaner_t cleaner = arg;
+ struct cleaner_priv *priv = ocf_cleaner_get_priv(cleaner);
+ uint32_t iono = ocf_queue_pending_io(priv->queue);
+ int i, max = spdk_min(32, iono);
+
+ for (i = 0; i < max; i++) {
+ ocf_queue_run_single(priv->queue);
+ }
+
+ if (spdk_get_ticks() >= priv->next_run) {
+ ocf_cleaner_run(cleaner, priv->queue);
+ return SPDK_POLLER_BUSY;
+ }
+
+ if (iono > 0) {
+ return SPDK_POLLER_BUSY;
+ } else {
+ return SPDK_POLLER_IDLE;
+ }
+}
+
+static void
+cleaner_cmpl(ocf_cleaner_t c, uint32_t interval)
+{
+ struct cleaner_priv *priv = ocf_cleaner_get_priv(c);
+
+ priv->next_run = spdk_get_ticks() + ((interval * spdk_get_ticks_hz()) / 1000);
+}
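+
+/* interval is reported by OCF in milliseconds; e.g. with a 2 GHz tick rate
+ * and interval == 1000, next_run lands 2,000,000,000 ticks (one second)
+ * ahead, at which point cleaner_poll() calls ocf_cleaner_run() again. */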
+
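+/* The cleaner queue needs no kick - pending requests are picked up by
+ * cleaner_poll() */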
+static void
+cleaner_queue_kick(ocf_queue_t q)
+{
+}
+
+static void
+cleaner_queue_stop(ocf_queue_t q)
+{
+ struct cleaner_priv *cpriv = ocf_queue_get_priv(q);
+
+ if (cpriv) {
+ spdk_poller_unregister(&cpriv->poller);
+ free(cpriv);
+ }
+}
+
+const struct ocf_queue_ops cleaner_queue_ops = {
+ .kick_sync = cleaner_queue_kick,
+ .kick = cleaner_queue_kick,
+ .stop = cleaner_queue_stop,
+};
+
+static int
+vbdev_ocf_ctx_cleaner_init(ocf_cleaner_t c)
+{
+ int rc;
+ struct cleaner_priv *priv = calloc(1, sizeof(*priv));
+ ocf_cache_t cache = ocf_cleaner_get_cache(c);
+ struct vbdev_ocf_cache_ctx *cctx = ocf_cache_get_priv(cache);
+
+ if (priv == NULL) {
+ return -ENOMEM;
+ }
+
+ rc = vbdev_ocf_queue_create(cache, &priv->queue, &cleaner_queue_ops);
+ if (rc) {
+ free(priv);
+ return rc;
+ }
+
+ ocf_queue_set_priv(priv->queue, priv);
+
+ cctx->cleaner_queue = priv->queue;
+
+ ocf_cleaner_set_cmpl(c, cleaner_cmpl);
+ ocf_cleaner_set_priv(c, priv);
+
+ return 0;
+}
+
+static void
+vbdev_ocf_ctx_cleaner_stop(ocf_cleaner_t c)
+{
+ struct cleaner_priv *priv = ocf_cleaner_get_priv(c);
+
+ vbdev_ocf_queue_put(priv->queue);
+}
+
+static void
+vbdev_ocf_ctx_cleaner_kick(ocf_cleaner_t cleaner)
+{
+ struct cleaner_priv *priv = ocf_cleaner_get_priv(cleaner);
+
+ if (priv->poller) {
+ return;
+ }
+
+ /* Start the cleaner poller on the same thread where the cache was created.
+ * TODO: allow the user to specify the core on which the cleaner should run */
+ priv->poller = SPDK_POLLER_REGISTER(cleaner_poll, cleaner, 0);
+}
+
+static void
+vbdev_ocf_md_kick(void *ctx)
+{
+ ocf_metadata_updater_t mu = ctx;
+ ocf_cache_t cache = ocf_metadata_updater_get_cache(mu);
+
+ if (ocf_cache_is_running(cache)) {
+ ocf_metadata_updater_run(mu);
+ }
+}
+
+static int
+vbdev_ocf_volume_updater_init(ocf_metadata_updater_t mu)
+{
+ struct spdk_thread *md_thread = spdk_get_thread();
+
+ ocf_metadata_updater_set_priv(mu, md_thread);
+
+ return 0;
+}
+
+static void
+vbdev_ocf_volume_updater_stop(ocf_metadata_updater_t mu)
+{
+
+}
+
+static void
+vbdev_ocf_volume_updater_kick(ocf_metadata_updater_t mu)
+{
+ struct spdk_thread *md_thread = ocf_metadata_updater_get_priv(mu);
+
+ /* Send a message to the updater thread, because
+ * a kick can arrive from any thread */
+ spdk_thread_send_msg(md_thread, vbdev_ocf_md_kick, mu);
+}
+
+/* This function is the main way OCF communicates with the user.
+ * We don't use SPDK_LOG here, because the debug information attached to
+ * every log message would point at this callback, while the real source
+ * of the message is somewhere in the OCF code. */
+static int
+vbdev_ocf_ctx_log_printf(ocf_logger_t logger, ocf_logger_lvl_t lvl,
+ const char *fmt, va_list args)
+{
+ int spdk_lvl;
+
+ switch (lvl) {
+ case log_emerg:
+ case log_alert:
+ case log_crit:
+ case log_err:
+ spdk_lvl = SPDK_LOG_ERROR;
+ break;
+
+ case log_warn:
+ spdk_lvl = SPDK_LOG_WARN;
+ break;
+
+ case log_notice:
+ spdk_lvl = SPDK_LOG_NOTICE;
+ break;
+
+ case log_info:
+ case log_debug:
+ default:
+ spdk_lvl = SPDK_LOG_INFO;
+ }
+
+ spdk_vlog(spdk_lvl, NULL, -1, NULL, fmt, args);
+ return 0;
+}
+
+static const struct ocf_ctx_config vbdev_ocf_ctx_cfg = {
+ .name = "OCF SPDK",
+
+ .ops = {
+ .data = {
+ .alloc = vbdev_ocf_ctx_data_alloc,
+ .free = vbdev_ocf_ctx_data_free,
+ .mlock = vbdev_ocf_ctx_data_mlock,
+ .munlock = vbdev_ocf_ctx_data_munlock,
+ .read = vbdev_ocf_ctx_data_rd,
+ .write = vbdev_ocf_ctx_data_wr,
+ .zero = vbdev_ocf_ctx_data_zero,
+ .seek = vbdev_ocf_ctx_data_seek,
+ .copy = vbdev_ocf_ctx_data_cpy,
+ .secure_erase = vbdev_ocf_ctx_data_secure_erase,
+ },
+
+ .metadata_updater = {
+ .init = vbdev_ocf_volume_updater_init,
+ .stop = vbdev_ocf_volume_updater_stop,
+ .kick = vbdev_ocf_volume_updater_kick,
+ },
+
+ .cleaner = {
+ .init = vbdev_ocf_ctx_cleaner_init,
+ .stop = vbdev_ocf_ctx_cleaner_stop,
+ .kick = vbdev_ocf_ctx_cleaner_kick,
+ },
+
+ .logger = {
+ .print = vbdev_ocf_ctx_log_printf,
+ .dump_stack = NULL,
+ },
+
+ },
+};
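+
+/* The ops above bind OCF's portable abstractions to SPDK primitives:
+ * ctx_data_t is backed by iovec-based bdev_ocf_data, queues are serviced
+ * by SPDK pollers, metadata-updater kicks are forwarded to the md thread
+ * via spdk_thread_send_msg(), and log output goes through spdk_vlog(). */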
+
+int
+vbdev_ocf_ctx_init(void)
+{
+ int ret;
+
+ ret = ocf_ctx_create(&vbdev_ocf_ctx, &vbdev_ocf_ctx_cfg);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+void
+vbdev_ocf_ctx_cleanup(void)
+{
+ ocf_ctx_put(vbdev_ocf_ctx);
+ vbdev_ocf_ctx = NULL;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("ocf_ocfctx", SPDK_LOG_OCFCTX)
diff --git a/src/spdk/module/bdev/ocf/ctx.h b/src/spdk/module/bdev/ocf/ctx.h
new file mode 100644
index 000000000..446ac8d8f
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/ctx.h
@@ -0,0 +1,65 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VBDEV_OCF_CTX_H
+#define VBDEV_OCF_CTX_H
+
+#include <ocf/ocf.h>
+#include "spdk/thread.h"
+
+extern ocf_ctx_t vbdev_ocf_ctx;
+
+#define OCF_WRITE_FLUSH 11
+
+#define SPDK_OBJECT 1
+
+/* Context of cache instance */
+struct vbdev_ocf_cache_ctx {
+ ocf_queue_t mngt_queue;
+ ocf_queue_t cleaner_queue;
+ pthread_mutex_t lock;
+ env_atomic refcnt;
+};
+
+void vbdev_ocf_cache_ctx_put(struct vbdev_ocf_cache_ctx *ctx);
+void vbdev_ocf_cache_ctx_get(struct vbdev_ocf_cache_ctx *ctx);
+
+int vbdev_ocf_ctx_init(void);
+void vbdev_ocf_ctx_cleanup(void);
+
+/* Thread-safe queue creation and deletion
+ * These are wrappers for the original ocf_queue_create() and ocf_queue_put() */
+int vbdev_ocf_queue_create(ocf_cache_t cache, ocf_queue_t *queue, const struct ocf_queue_ops *ops);
+void vbdev_ocf_queue_put(ocf_queue_t queue);
+
+#endif
diff --git a/src/spdk/module/bdev/ocf/data.c b/src/spdk/module/bdev/ocf/data.c
new file mode 100644
index 000000000..981c793f5
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/data.c
@@ -0,0 +1,122 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <ocf/ocf.h>
+#include "spdk/bdev.h"
+#include "data.h"
+
+struct bdev_ocf_data *
+vbdev_ocf_data_alloc(uint32_t iovcnt)
+{
+ struct bdev_ocf_data *data;
+
+ data = env_malloc(sizeof(*data), ENV_MEM_NOIO);
+ if (!data) {
+ return NULL;
+ }
+
+ data->seek = 0;
+
+ if (iovcnt) {
+ data->iovs = env_malloc(sizeof(*data->iovs) * iovcnt, ENV_MEM_NOIO);
+ if (!data->iovs) {
+ env_free(data);
+ return NULL;
+ }
+ }
+
+ data->iovcnt = 0;
+ data->iovalloc = iovcnt;
+
+ return data;
+}
+
+void
+vbdev_ocf_data_free(struct bdev_ocf_data *data)
+{
+ if (!data) {
+ return;
+ }
+
+ if (data->iovalloc != 0) {
+ env_free(data->iovs);
+ }
+
+ env_free(data);
+}
+
+void
+vbdev_ocf_iovs_add(struct bdev_ocf_data *data, void *base, size_t len)
+{
+ assert(NULL != data);
+ assert(data->iovalloc != -1);
+
+ if (data->iovcnt == data->iovalloc) {
+ /* TODO: Realloc iovs */
+ SPDK_ERRLOG("IOV error: no space left for another iovec\n");
+ assert(false);
+ return;
+ }
+
+ data->iovs[data->iovcnt].iov_base = base;
+ data->iovs[data->iovcnt].iov_len = len;
+ data->iovcnt++;
+}
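+
+/* Illustrative usage (buf1, buf2 and their lengths are hypothetical):
+ *
+ *   struct bdev_ocf_data *data = vbdev_ocf_data_alloc(2);
+ *   vbdev_ocf_iovs_add(data, buf1, len1);
+ *   vbdev_ocf_iovs_add(data, buf2, len2);
+ *
+ * describes two scattered buffers as a single logical data stream. */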
+
+struct bdev_ocf_data *
+vbdev_ocf_data_from_spdk_io(struct spdk_bdev_io *bdev_io)
+{
+ struct bdev_ocf_data *data;
+
+ if (bdev_io == NULL) {
+ return NULL;
+ }
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_READ:
+ assert(bdev_io->u.bdev.iovs);
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ break;
+ default:
+ SPDK_ERRLOG("Unsupported IO type %d\n", bdev_io->type);
+ return NULL;
+ }
+
+ data = (struct bdev_ocf_data *)bdev_io->driver_ctx;
+ data->iovs = bdev_io->u.bdev.iovs;
+ data->iovcnt = bdev_io->u.bdev.iovcnt;
+ data->size = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+
+ return data;
+}
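+
+/* Note that nothing is allocated here: the bdev_ocf_data lives inside the
+ * spdk_bdev_io's driver_ctx and borrows the iovecs owned by the bdev layer
+ * for the lifetime of the request. */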
diff --git a/src/spdk/module/bdev/ocf/data.h b/src/spdk/module/bdev/ocf/data.h
new file mode 100644
index 000000000..7ed5adcef
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/data.h
@@ -0,0 +1,57 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VBDEV_OCF_DATA_H
+#define VBDEV_OCF_DATA_H
+
+#include "spdk/bdev_module.h"
+
+struct bdev_ocf_data {
+ struct iovec *iovs;
+ int iovcnt;
+ int iovalloc;
+ uint32_t size;
+ uint32_t seek;
+};
+
+struct bdev_ocf_data *vbdev_ocf_data_from_spdk_io(struct spdk_bdev_io *bdev_io);
+
+struct bdev_ocf_data *vbdev_ocf_data_alloc(uint32_t nvecs);
+
+void vbdev_ocf_data_free(struct bdev_ocf_data *data);
+
+struct bdev_ocf_data *vbdev_ocf_data_from_iov(struct iovec *iovs);
+
+void vbdev_ocf_iovs_add(struct bdev_ocf_data *data, void *base, size_t len);
+
+#endif
diff --git a/src/spdk/module/bdev/ocf/stats.c b/src/spdk/module/bdev/ocf/stats.c
new file mode 100644
index 000000000..164da7d2e
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/stats.c
@@ -0,0 +1,109 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ctx.h"
+#include "stats.h"
+
+int
+vbdev_ocf_stats_get(ocf_cache_t cache, char *core_name, struct vbdev_ocf_stats *stats)
+{
+ int status;
+ ocf_core_t core;
+
+ status = ocf_core_get_by_name(cache, core_name, strlen(core_name), &core);
+ if (status) {
+ return status;
+ }
+
+ return ocf_stats_collect_core(core, &stats->usage, &stats->reqs, &stats->blocks, &stats->errors);
+}
+
+#define WJSON_STAT(w, stats, group, field, units) \
+ spdk_json_write_named_object_begin(w, #field); \
+ spdk_json_write_named_uint64(w, "count", stats->group.field.value); \
+ spdk_json_write_named_string_fmt(w, "percentage", "%lu.%lu", \
+ stats->group.field.fraction / 100, stats->group.field.fraction % 100); \
+ spdk_json_write_named_string(w, "units", units); \
+ spdk_json_write_object_end(w);
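+
+/* Illustrative expansion: WJSON_STAT(w, stats, usage, occupancy, "4KiB blocks")
+ * emits JSON of the form
+ *   "occupancy": { "count": 42, "percentage": "1.05", "units": "4KiB blocks" }
+ * with the fraction field holding hundredths of a percent. */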
+
+void
+vbdev_ocf_stats_write_json(struct spdk_json_write_ctx *w, struct vbdev_ocf_stats *stats)
+{
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_object_begin(w, "usage");
+ WJSON_STAT(w, stats, usage, occupancy, "4KiB blocks");
+ WJSON_STAT(w, stats, usage, free, "4KiB blocks");
+ WJSON_STAT(w, stats, usage, clean, "4KiB blocks");
+ WJSON_STAT(w, stats, usage, dirty, "4KiB blocks");
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_object_begin(w, "requests");
+ WJSON_STAT(w, stats, reqs, rd_hits, "Requests");
+ WJSON_STAT(w, stats, reqs, rd_partial_misses, "Requests");
+ WJSON_STAT(w, stats, reqs, rd_full_misses, "Requests");
+ WJSON_STAT(w, stats, reqs, rd_total, "Requests");
+ WJSON_STAT(w, stats, reqs, wr_hits, "Requests");
+ WJSON_STAT(w, stats, reqs, wr_partial_misses, "Requests");
+ WJSON_STAT(w, stats, reqs, wr_full_misses, "Requests");
+ WJSON_STAT(w, stats, reqs, wr_total, "Requests");
+ WJSON_STAT(w, stats, reqs, rd_pt, "Requests");
+ WJSON_STAT(w, stats, reqs, wr_pt, "Requests");
+ WJSON_STAT(w, stats, reqs, serviced, "Requests");
+ WJSON_STAT(w, stats, reqs, total, "Requests");
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_object_begin(w, "blocks");
+ WJSON_STAT(w, stats, blocks, core_volume_rd, "4KiB blocks");
+ WJSON_STAT(w, stats, blocks, core_volume_wr, "4KiB blocks");
+ WJSON_STAT(w, stats, blocks, core_volume_total, "4KiB blocks");
+ WJSON_STAT(w, stats, blocks, cache_volume_rd, "4KiB blocks");
+ WJSON_STAT(w, stats, blocks, cache_volume_wr, "4KiB blocks");
+ WJSON_STAT(w, stats, blocks, cache_volume_total, "4KiB blocks");
+ WJSON_STAT(w, stats, blocks, volume_rd, "4KiB blocks");
+ WJSON_STAT(w, stats, blocks, volume_wr, "4KiB blocks");
+ WJSON_STAT(w, stats, blocks, volume_total, "4KiB blocks");
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_object_begin(w, "errors");
+ WJSON_STAT(w, stats, errors, core_volume_rd, "Requests");
+ WJSON_STAT(w, stats, errors, core_volume_wr, "Requests");
+ WJSON_STAT(w, stats, errors, core_volume_total, "Requests");
+ WJSON_STAT(w, stats, errors, cache_volume_rd, "Requests");
+ WJSON_STAT(w, stats, errors, cache_volume_wr, "Requests");
+ WJSON_STAT(w, stats, errors, cache_volume_total, "Requests");
+ WJSON_STAT(w, stats, errors, total, "Requests");
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
diff --git a/src/spdk/module/bdev/ocf/stats.h b/src/spdk/module/bdev/ocf/stats.h
new file mode 100644
index 000000000..b377c67f5
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/stats.h
@@ -0,0 +1,51 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VBDEV_OCF_STATS_H
+#define VBDEV_OCF_STATS_H
+
+#include "spdk/json.h"
+#include <ocf/ocf.h>
+
+struct vbdev_ocf_stats {
+ struct ocf_stats_usage usage;
+ struct ocf_stats_requests reqs;
+ struct ocf_stats_blocks blocks;
+ struct ocf_stats_errors errors;
+};
+
+int vbdev_ocf_stats_get(ocf_cache_t cache, char *core_name, struct vbdev_ocf_stats *stats);
+
+void vbdev_ocf_stats_write_json(struct spdk_json_write_ctx *w, struct vbdev_ocf_stats *stats);
+
+#endif
diff --git a/src/spdk/module/bdev/ocf/utils.c b/src/spdk/module/bdev/ocf/utils.c
new file mode 100644
index 000000000..3a1df3c9e
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/utils.c
@@ -0,0 +1,136 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "utils.h"
+#include "vbdev_ocf.h"
+
+static char *cache_modes[ocf_cache_mode_max] = {
+ [ocf_cache_mode_wt] = "wt",
+ [ocf_cache_mode_wb] = "wb",
+ [ocf_cache_mode_wa] = "wa",
+ [ocf_cache_mode_pt] = "pt",
+ [ocf_cache_mode_wi] = "wi",
+ [ocf_cache_mode_wo] = "wo",
+};
+
+ocf_cache_mode_t
+ocf_get_cache_mode(const char *cache_mode)
+{
+ int i;
+
+ for (i = 0; i < ocf_cache_mode_max; i++) {
+ if (strcmp(cache_mode, cache_modes[i]) == 0) {
+ return i;
+ }
+ }
+
+ return ocf_cache_mode_none;
+}
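+
+/* Illustrative usage: ocf_get_cache_mode("wb") returns ocf_cache_mode_wb,
+ * while an unrecognized string yields ocf_cache_mode_none. */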
+
+const char *
+ocf_get_cache_modename(ocf_cache_mode_t mode)
+{
+ if (mode > ocf_cache_mode_none && mode < ocf_cache_mode_max) {
+ return cache_modes[mode];
+ } else {
+ return NULL;
+ }
+}
+
+int
+vbdev_ocf_mngt_start(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *path,
+ vbdev_ocf_mngt_callback cb, void *cb_arg)
+{
+ if (vbdev->mngt_ctx.current_step) {
+ return -EBUSY;
+ }
+
+ memset(&vbdev->mngt_ctx, 0, sizeof(vbdev->mngt_ctx));
+
+ vbdev->mngt_ctx.current_step = path;
+ vbdev->mngt_ctx.cb = cb;
+ vbdev->mngt_ctx.cb_arg = cb_arg;
+
+ (*vbdev->mngt_ctx.current_step)(vbdev);
+
+ return 0;
+}
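+
+/* Illustrative usage (step_one, step_two and the callback are hypothetical):
+ *
+ *   static vbdev_ocf_mngt_fn my_path[] = { step_one, step_two, NULL };
+ *   vbdev_ocf_mngt_start(vbdev, my_path, my_cb, my_cb_arg);
+ *
+ * Each step calls vbdev_ocf_mngt_continue() when it finishes, which
+ * advances current_step until the NULL terminator is reached. */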
+
+void
+vbdev_ocf_mngt_stop(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int status)
+{
+ if (status) {
+ vbdev->mngt_ctx.status = status;
+ }
+
+ if (vbdev->mngt_ctx.status && rollback_path) {
+ vbdev->mngt_ctx.poller_fn = NULL;
+ vbdev->mngt_ctx.current_step = rollback_path;
+ (*vbdev->mngt_ctx.current_step)(vbdev);
+ return;
+ }
+
+ if (vbdev->mngt_ctx.cb) {
+ vbdev->mngt_ctx.cb(vbdev->mngt_ctx.status, vbdev, vbdev->mngt_ctx.cb_arg);
+ }
+
+ memset(&vbdev->mngt_ctx, 0, sizeof(vbdev->mngt_ctx));
+}
+
+void
+vbdev_ocf_mngt_continue(struct vbdev_ocf *vbdev, int status)
+{
+ if (vbdev->mngt_ctx.current_step == NULL) {
+ return;
+ }
+
+ assert((*vbdev->mngt_ctx.current_step) != NULL);
+
+ vbdev->mngt_ctx.status = status;
+
+ vbdev->mngt_ctx.current_step++;
+ if (*vbdev->mngt_ctx.current_step) {
+ (*vbdev->mngt_ctx.current_step)(vbdev);
+ return;
+ }
+
+ vbdev_ocf_mngt_stop(vbdev, NULL, 0);
+}
+
+int
+vbdev_ocf_mngt_get_status(struct vbdev_ocf *vbdev)
+{
+ return vbdev->mngt_ctx.status;
+}
diff --git a/src/spdk/module/bdev/ocf/utils.h b/src/spdk/module/bdev/ocf/utils.h
new file mode 100644
index 000000000..73bf6c93a
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/utils.h
@@ -0,0 +1,67 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VBDEV_OCF_UTILS_H
+#define VBDEV_OCF_UTILS_H
+
+#include <ocf/ocf.h>
+#include "vbdev_ocf.h"
+
+ocf_cache_mode_t ocf_get_cache_mode(const char *cache_mode);
+const char *ocf_get_cache_modename(ocf_cache_mode_t mode);
+
+/* Initiate a management operation.
+ * Receives a NULL-terminated array of functions (path), a completion
+ * callback (cb) and a callback argument (cb_arg).
+ * This function may fail with -ENOMEM or -EBUSY */
+int vbdev_ocf_mngt_start(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *path,
+ vbdev_ocf_mngt_callback cb, void *cb_arg);
+
+/* Continue execution with a polling operation (fn);
+ * fn must invoke vbdev_ocf_mngt_continue() to stop polling.
+ * The poller has a default timeout of 5 seconds */
+void vbdev_ocf_mngt_poll(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn fn);
+
+/* Continue execution with next function that is on path
+ * If next function is NULL, finish management operation and invoke callback */
+void vbdev_ocf_mngt_continue(struct vbdev_ocf *vbdev, int status);
+
+/* Stop the execution; if status is non-zero, record it.
+ * If a status is set and a rollback path was given, invoke the rollback;
+ * otherwise invoke the callback with the last status returned */
+void vbdev_ocf_mngt_stop(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int status);
+
+/* Get status of the management operation in progress */
+int vbdev_ocf_mngt_get_status(struct vbdev_ocf *vbdev);
+#endif
diff --git a/src/spdk/module/bdev/ocf/vbdev_ocf.c b/src/spdk/module/bdev/ocf/vbdev_ocf.c
new file mode 100644
index 000000000..4997772cd
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/vbdev_ocf.c
@@ -0,0 +1,1775 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <ocf/ocf.h>
+#include <ocf/ocf_types.h>
+#include <ocf/ocf_mngt.h>
+
+#include "ctx.h"
+#include "data.h"
+#include "volume.h"
+#include "utils.h"
+#include "vbdev_ocf.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk/conf.h"
+#include "spdk/thread.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+#include "spdk/cpuset.h"
+
+static struct spdk_bdev_module ocf_if;
+
+static TAILQ_HEAD(, vbdev_ocf) g_ocf_vbdev_head
+ = TAILQ_HEAD_INITIALIZER(g_ocf_vbdev_head);
+
+static TAILQ_HEAD(, examining_bdev) g_ocf_examining_bdevs_head
+ = TAILQ_HEAD_INITIALIZER(g_ocf_examining_bdevs_head);
+
+bool g_fini_started = false;
+
+/* Structure for keeping list of bdevs that are claimed but not used yet */
+struct examining_bdev {
+ struct spdk_bdev *bdev;
+ TAILQ_ENTRY(examining_bdev) tailq;
+};
+
+/* Add bdev to the list of claimed bdevs */
+static void
+examine_start(struct spdk_bdev *bdev)
+{
+ struct examining_bdev *entry = malloc(sizeof(*entry));
+
+ assert(entry);
+ entry->bdev = bdev;
+ TAILQ_INSERT_TAIL(&g_ocf_examining_bdevs_head, entry, tailq);
+}
+
+/* Find the bdev on the list of claimed bdevs and remove it;
+ * if it was the only entry left for this bdev, report examine done */
+static void
+examine_done(int status, struct vbdev_ocf *vbdev, void *cb_arg)
+{
+ struct spdk_bdev *bdev = cb_arg;
+ struct examining_bdev *entry, *safe, *found = NULL;
+
+ TAILQ_FOREACH_SAFE(entry, &g_ocf_examining_bdevs_head, tailq, safe) {
+ if (entry->bdev == bdev) {
+ if (found) {
+ goto remove;
+ } else {
+ found = entry;
+ }
+ }
+ }
+
+ assert(found);
+ spdk_bdev_module_examine_done(&ocf_if);
+
+remove:
+ TAILQ_REMOVE(&g_ocf_examining_bdevs_head, found, tailq);
+ free(found);
+}
+
+/* Free allocated strings and structure itself
+ * Used at shutdown only */
+static void
+free_vbdev(struct vbdev_ocf *vbdev)
+{
+ if (!vbdev) {
+ return;
+ }
+
+ free(vbdev->name);
+ free(vbdev->cache.name);
+ free(vbdev->core.name);
+ free(vbdev);
+}
+
+/* Get existing cache base
+ * that is attached to other vbdev */
+static struct vbdev_ocf_base *
+get_other_cache_base(struct vbdev_ocf_base *base)
+{
+ struct vbdev_ocf *vbdev;
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (&vbdev->cache == base || !vbdev->cache.attached) {
+ continue;
+ }
+ if (!strcmp(vbdev->cache.name, base->name)) {
+ return &vbdev->cache;
+ }
+ }
+
+ return NULL;
+}
+
+static bool
+is_ocf_cache_running(struct vbdev_ocf *vbdev)
+{
+ if (vbdev->cache.attached && vbdev->ocf_cache) {
+ return ocf_cache_is_running(vbdev->ocf_cache);
+ }
+ return false;
+}
+
+/* Get existing OCF cache instance
+ * that is started by other vbdev */
+static ocf_cache_t
+get_other_cache_instance(struct vbdev_ocf *vbdev)
+{
+ struct vbdev_ocf *cmp;
+
+ TAILQ_FOREACH(cmp, &g_ocf_vbdev_head, tailq) {
+ if (cmp->state.doing_finish || cmp == vbdev) {
+ continue;
+ }
+ if (strcmp(cmp->cache.name, vbdev->cache.name)) {
+ continue;
+ }
+ if (is_ocf_cache_running(cmp)) {
+ return cmp->ocf_cache;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+_remove_base_bdev(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+/* Close and unclaim base bdev */
+static void
+remove_base_bdev(struct vbdev_ocf_base *base)
+{
+ if (base->attached) {
+ if (base->management_channel) {
+ spdk_put_io_channel(base->management_channel);
+ }
+
+ spdk_bdev_module_release_bdev(base->bdev);
+ /* Close the underlying bdev on the thread it was opened on. */
+ if (base->thread && base->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(base->thread, _remove_base_bdev, base->desc);
+ } else {
+ spdk_bdev_close(base->desc);
+ }
+ base->attached = false;
+ }
+}
+
+/* Finish unregister operation */
+static void
+unregister_finish(struct vbdev_ocf *vbdev)
+{
+ spdk_bdev_destruct_done(&vbdev->exp_bdev, vbdev->state.stop_status);
+ ocf_mngt_cache_put(vbdev->ocf_cache);
+ vbdev_ocf_cache_ctx_put(vbdev->cache_ctx);
+ vbdev_ocf_mngt_continue(vbdev, 0);
+}
+
+static void
+close_core_bdev(struct vbdev_ocf *vbdev)
+{
+ remove_base_bdev(&vbdev->core);
+ vbdev_ocf_mngt_continue(vbdev, 0);
+}
+
+static void
+remove_core_cmpl(void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = priv;
+
+ ocf_mngt_cache_unlock(vbdev->ocf_cache);
+ vbdev_ocf_mngt_continue(vbdev, error);
+}
+
+/* Try to lock cache, then remove core */
+static void
+remove_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv;
+
+ if (error) {
+ SPDK_ERRLOG("Error %d, can not lock cache instance %s\n",
+ error, vbdev->name);
+ vbdev_ocf_mngt_continue(vbdev, error);
+ return;
+ }
+
+ ocf_mngt_cache_remove_core(vbdev->ocf_core, remove_core_cmpl, vbdev);
+}
+
+/* Detach core base */
+static void
+detach_core(struct vbdev_ocf *vbdev)
+{
+ if (is_ocf_cache_running(vbdev)) {
+ ocf_mngt_cache_lock(vbdev->ocf_cache, remove_core_cache_lock_cmpl, vbdev);
+ } else {
+ vbdev_ocf_mngt_continue(vbdev, 0);
+ }
+}
+
+static void
+close_cache_bdev(struct vbdev_ocf *vbdev)
+{
+ remove_base_bdev(&vbdev->cache);
+ vbdev_ocf_mngt_continue(vbdev, 0);
+}
+
+/* Detach cache base */
+static void
+detach_cache(struct vbdev_ocf *vbdev)
+{
+ vbdev->state.stop_status = vbdev->mngt_ctx.status;
+
+ /* If some other vbdev references this cache bdev,
+ * detach it only by clearing the flag, without an actual close */
+ if (get_other_cache_base(&vbdev->cache)) {
+ vbdev->cache.attached = false;
+ }
+
+ vbdev_ocf_mngt_continue(vbdev, 0);
+}
+
+static void
+stop_vbdev_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = priv;
+
+ vbdev_ocf_queue_put(vbdev->cache_ctx->mngt_queue);
+ ocf_mngt_cache_unlock(cache);
+
+ vbdev_ocf_mngt_continue(vbdev, error);
+}
+
+/* Try to lock cache, then stop it */
+static void
+stop_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv;
+
+ if (error) {
+ SPDK_ERRLOG("Error %d, can not lock cache instance %s\n",
+ error, vbdev->name);
+ vbdev_ocf_mngt_continue(vbdev, error);
+ return;
+ }
+
+ ocf_mngt_cache_stop(vbdev->ocf_cache, stop_vbdev_cmpl, vbdev);
+}
+
+/* Stop OCF cache object
+ * vbdev_ocf is not operational after this */
+static void
+stop_vbdev(struct vbdev_ocf *vbdev)
+{
+ if (!is_ocf_cache_running(vbdev)) {
+ vbdev_ocf_mngt_continue(vbdev, 0);
+ return;
+ }
+
+ if (!g_fini_started && get_other_cache_instance(vbdev)) {
+ SPDK_NOTICELOG("Not stopping cache instance '%s'"
+ " because it is referenced by other OCF bdev\n",
+ vbdev->cache.name);
+ vbdev_ocf_mngt_continue(vbdev, 0);
+ return;
+ }
+
+ ocf_mngt_cache_lock(vbdev->ocf_cache, stop_vbdev_cache_lock_cmpl, vbdev);
+}
+
+static void
+flush_vbdev_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = priv;
+
+ ocf_mngt_cache_unlock(cache);
+ vbdev_ocf_mngt_continue(vbdev, error);
+}
+
+static void
+flush_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv;
+
+ if (error) {
+ SPDK_ERRLOG("Error %d, can not lock cache instance %s\n",
+ error, vbdev->name);
+ vbdev_ocf_mngt_continue(vbdev, error);
+ return;
+ }
+
+ ocf_mngt_cache_flush(vbdev->ocf_cache, flush_vbdev_cmpl, vbdev);
+}
+
+static void
+flush_vbdev(struct vbdev_ocf *vbdev)
+{
+ if (!is_ocf_cache_running(vbdev)) {
+ vbdev_ocf_mngt_continue(vbdev, -EINVAL);
+ return;
+ }
+
+ ocf_mngt_cache_lock(vbdev->ocf_cache, flush_vbdev_cache_lock_cmpl, vbdev);
+}
+
+/* Procedures called during dirty unregister */
+vbdev_ocf_mngt_fn unregister_path_dirty[] = {
+ flush_vbdev,
+ stop_vbdev,
+ detach_cache,
+ close_cache_bdev,
+ detach_core,
+ close_core_bdev,
+ unregister_finish,
+ NULL
+};
+
+/* Procedures called during clean unregister */
+vbdev_ocf_mngt_fn unregister_path_clean[] = {
+ flush_vbdev,
+ detach_core,
+ close_core_bdev,
+ stop_vbdev,
+ detach_cache,
+ close_cache_bdev,
+ unregister_finish,
+ NULL
+};
+
+/* Start asynchronous management operation using unregister_path */
+static void
+unregister_cb(void *opaque)
+{
+ struct vbdev_ocf *vbdev = opaque;
+ vbdev_ocf_mngt_fn *unregister_path;
+ int rc;
+
+ unregister_path = vbdev->state.doing_clean_delete ?
+ unregister_path_clean : unregister_path_dirty;
+
+ rc = vbdev_ocf_mngt_start(vbdev, unregister_path, NULL, NULL);
+ if (rc) {
+ SPDK_ERRLOG("Unable to unregister OCF bdev: %d\n", rc);
+ spdk_bdev_destruct_done(&vbdev->exp_bdev, rc);
+ }
+}
+
+/* Clean remove case - remove the core and then the cache; this order
+ * removes the instance permanently */
+static void
+_vbdev_ocf_destruct_clean(struct vbdev_ocf *vbdev)
+{
+ if (vbdev->core.attached) {
+ detach_core(vbdev);
+ close_core_bdev(vbdev);
+ }
+
+ if (vbdev->cache.attached) {
+ detach_cache(vbdev);
+ close_cache_bdev(vbdev);
+ }
+}
+
+/* Dirty shutdown/hot remove case - remove the cache and then the core;
+ * this order allows us to recover the instance in the future */
+static void
+_vbdev_ocf_destruct_dirty(struct vbdev_ocf *vbdev)
+{
+ if (vbdev->cache.attached) {
+ detach_cache(vbdev);
+ close_cache_bdev(vbdev);
+ }
+
+ if (vbdev->core.attached) {
+ detach_core(vbdev);
+ close_core_bdev(vbdev);
+ }
+}
+
+/* Unregister io device with callback to unregister_cb
+ * This function is called during spdk_bdev_unregister */
+static int
+vbdev_ocf_destruct(void *opaque)
+{
+ struct vbdev_ocf *vbdev = opaque;
+
+ if (vbdev->state.doing_finish) {
+ return -EALREADY;
+ }
+
+ if (vbdev->state.starting && !vbdev->state.started) {
+ /* Prevent detaching the cache/core while the register path of
+ this bdev is still in progress */
+ return -EBUSY;
+ }
+
+ vbdev->state.doing_finish = true;
+
+ if (vbdev->state.started) {
+ spdk_io_device_unregister(vbdev, unregister_cb);
+ /* Return 1 because unregister is delayed */
+ return 1;
+ }
+
+ if (vbdev->state.doing_clean_delete) {
+ _vbdev_ocf_destruct_clean(vbdev);
+ } else {
+ _vbdev_ocf_destruct_dirty(vbdev);
+ }
+
+ return 0;
+}
+
+/* Stop OCF cache and unregister SPDK bdev */
+int
+vbdev_ocf_delete(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg)
+{
+ int rc = 0;
+
+ if (vbdev->state.started) {
+ spdk_bdev_unregister(&vbdev->exp_bdev, cb, cb_arg);
+ } else {
+ rc = vbdev_ocf_destruct(vbdev);
+ if (rc == 0 && cb) {
+ cb(cb_arg, 0);
+ }
+ }
+
+ return rc;
+}
+
+/* Remove cores permanently and then stop OCF cache and unregister SPDK bdev */
+int
+vbdev_ocf_delete_clean(struct vbdev_ocf *vbdev, void (*cb)(void *, int),
+ void *cb_arg)
+{
+ vbdev->state.doing_clean_delete = true;
+
+ return vbdev_ocf_delete(vbdev, cb, cb_arg);
+}
+
+
+/* If vbdev is online, return its object */
+struct vbdev_ocf *
+vbdev_ocf_get_by_name(const char *name)
+{
+ struct vbdev_ocf *vbdev;
+
+ if (name == NULL) {
+ assert(false);
+ return NULL;
+ }
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (vbdev->name == NULL || vbdev->state.doing_finish) {
+ continue;
+ }
+ if (strcmp(vbdev->name, name) == 0) {
+ return vbdev;
+ }
+ }
+ return NULL;
+}
+
+/* Return matching base if parent vbdev is online */
+struct vbdev_ocf_base *
+vbdev_ocf_get_base_by_name(const char *name)
+{
+ struct vbdev_ocf *vbdev;
+
+ if (name == NULL) {
+ assert(false);
+ return NULL;
+ }
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (vbdev->state.doing_finish) {
+ continue;
+ }
+
+ if (vbdev->cache.name && strcmp(vbdev->cache.name, name) == 0) {
+ return &vbdev->cache;
+ }
+ if (vbdev->core.name && strcmp(vbdev->core.name, name) == 0) {
+ return &vbdev->core;
+ }
+ }
+ return NULL;
+}
+
+/* Execute fn for each OCF device that is online or waiting for its base devices */
+void
+vbdev_ocf_foreach(vbdev_ocf_foreach_fn fn, void *ctx)
+{
+ struct vbdev_ocf *vbdev;
+
+ assert(fn != NULL);
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (!vbdev->state.doing_finish) {
+ fn(vbdev, ctx);
+ }
+ }
+}
+
+/* Called from OCF when an SPDK IO is completed */
+static void
+vbdev_ocf_io_submit_cb(struct ocf_io *io, int error)
+{
+ struct spdk_bdev_io *bdev_io = io->priv1;
+
+ if (error == 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else if (error == -ENOMEM) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+
+ ocf_io_put(io);
+}
+
+/* Configure io parameters and send it to OCF */
+static int
+io_submit_to_ocf(struct spdk_bdev_io *bdev_io, struct ocf_io *io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_READ:
+ ocf_core_submit_io(io);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ ocf_core_submit_flush(io);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ ocf_core_submit_discard(io);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ default:
+ SPDK_ERRLOG("Unsupported IO type: %d\n", bdev_io->type);
+ return -EINVAL;
+ }
+}
+
+/* Submit SPDK-IO to OCF */
+static void
+io_handle(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_ocf *vbdev = bdev_io->bdev->ctxt;
+ struct ocf_io *io = NULL;
+ struct bdev_ocf_data *data = NULL;
+ struct vbdev_ocf_qctx *qctx = spdk_io_channel_get_ctx(ch);
+ uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+ uint64_t offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen;
+ int dir, flags = 0;
+ int err;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ dir = OCF_READ;
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ dir = OCF_WRITE;
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ dir = OCF_WRITE;
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ dir = OCF_WRITE;
+ break;
+ default:
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) {
+ flags = OCF_WRITE_FLUSH;
+ }
+
+ io = ocf_core_new_io(vbdev->ocf_core, qctx->queue, offset, len, dir, 0, flags);
+ if (!io) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ data = vbdev_ocf_data_from_spdk_io(bdev_io);
+ if (!data) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ err = ocf_io_set_data(io, data, 0);
+ if (err) {
+ goto fail;
+ }
+
+ ocf_io_set_cmpl(io, bdev_io, NULL, vbdev_ocf_io_submit_cb);
+
+ err = io_submit_to_ocf(bdev_io, io);
+ if (err) {
+ goto fail;
+ }
+
+ return;
+
+fail:
+ if (io) {
+ ocf_io_put(io);
+ }
+
+ if (err == -ENOMEM) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+vbdev_ocf_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ io_handle(ch, bdev_io);
+}
+
+/* Called from bdev layer when an io to Cache vbdev is submitted */
+static void
+vbdev_ocf_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ /* The user does not have to allocate io vectors for the request;
+ * if they are not allocated, we allocate them here */
+ spdk_bdev_io_get_buf(bdev_io, vbdev_ocf_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ io_handle(ch, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ default:
+ SPDK_ERRLOG("Unknown I/O type %d\n", bdev_io->type);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ break;
+ }
+}
+
+/* Called from bdev layer */
+static bool
+vbdev_ocf_io_type_supported(void *opaque, enum spdk_bdev_io_type io_type)
+{
+ struct vbdev_ocf *vbdev = opaque;
+
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ return spdk_bdev_io_type_supported(vbdev->core.bdev, io_type);
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ default:
+ return false;
+ }
+}
+
+/* Called from bdev layer */
+static struct spdk_io_channel *
+vbdev_ocf_get_io_channel(void *opaque)
+{
+ struct vbdev_ocf *bdev = opaque;
+
+ return spdk_get_io_channel(bdev);
+}
+
+static int
+vbdev_ocf_dump_info_json(void *opaque, struct spdk_json_write_ctx *w)
+{
+ struct vbdev_ocf *vbdev = opaque;
+
+ spdk_json_write_named_string(w, "cache_device", vbdev->cache.name);
+ spdk_json_write_named_string(w, "core_device", vbdev->core.name);
+
+ spdk_json_write_named_string(w, "mode",
+ ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache)));
+ spdk_json_write_named_uint32(w, "cache_line_size",
+ ocf_cache_get_line_size(vbdev->ocf_cache));
+ spdk_json_write_named_bool(w, "metadata_volatile",
+ vbdev->cfg.cache.metadata_volatile);
+
+ return 0;
+}
+
+static void
+vbdev_ocf_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct vbdev_ocf *vbdev = bdev->ctxt;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_ocf_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", vbdev->name);
+ spdk_json_write_named_string(w, "mode",
+ ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache)));
+ spdk_json_write_named_string(w, "cache_bdev_name", vbdev->cache.name);
+ spdk_json_write_named_string(w, "core_bdev_name", vbdev->core.name);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+/* Cache vbdev function table
+ * Used by bdev layer */
+static struct spdk_bdev_fn_table cache_dev_fn_table = {
+ .destruct = vbdev_ocf_destruct,
+ .io_type_supported = vbdev_ocf_io_type_supported,
+ .submit_request = vbdev_ocf_submit_request,
+ .get_io_channel = vbdev_ocf_get_io_channel,
+ .write_config_json = vbdev_ocf_write_json_config,
+ .dump_info_json = vbdev_ocf_dump_info_json,
+};
+
+/* Poller function for the OCF queue
+ * We execute OCF requests here synchronously */
+static int
+queue_poll(void *opaque)
+{
+ struct vbdev_ocf_qctx *qctx = opaque;
+ uint32_t iono = ocf_queue_pending_io(qctx->queue);
+ int i, max = spdk_min(32, iono);
+
+ for (i = 0; i < max; i++) {
+ ocf_queue_run_single(qctx->queue);
+ }
+
+ if (iono > 0) {
+ return SPDK_POLLER_BUSY;
+ } else {
+ return SPDK_POLLER_IDLE;
+ }
+}
+
+/* Called during ocf_submit_io, ocf_purge* and any other request that needs
+ * to submit io. Kick is a no-op here, because each queue is drained
+ * continuously by its queue_poll() poller */
+static void
+vbdev_ocf_ctx_queue_kick(ocf_queue_t q)
+{
+}
+
+/* OCF queue deinitialization
+ * Called at ocf_cache_stop */
+static void
+vbdev_ocf_ctx_queue_stop(ocf_queue_t q)
+{
+ struct vbdev_ocf_qctx *qctx = ocf_queue_get_priv(q);
+
+ if (qctx) {
+ spdk_put_io_channel(qctx->cache_ch);
+ spdk_put_io_channel(qctx->core_ch);
+ spdk_poller_unregister(&qctx->poller);
+ if (qctx->allocated) {
+ free(qctx);
+ }
+ }
+}
+
+/* Queue ops is an interface for running the queue thread;
+ * the stop() operation is called just before the queue gets destroyed */
+const struct ocf_queue_ops queue_ops = {
+ .kick_sync = vbdev_ocf_ctx_queue_kick,
+ .kick = vbdev_ocf_ctx_queue_kick,
+ .stop = vbdev_ocf_ctx_queue_stop,
+};
+
+/* Called at cache vbdev creation on every thread;
+ * here we allocate an OCF queue and an SPDK poller for it */
+static int
+io_device_create_cb(void *io_device, void *ctx_buf)
+{
+ struct vbdev_ocf *vbdev = io_device;
+ struct vbdev_ocf_qctx *qctx = ctx_buf;
+ int rc;
+
+ rc = vbdev_ocf_queue_create(vbdev->ocf_cache, &qctx->queue, &queue_ops);
+ if (rc) {
+ return rc;
+ }
+
+ ocf_queue_set_priv(qctx->queue, qctx);
+
+ qctx->vbdev = vbdev;
+ qctx->cache_ch = spdk_bdev_get_io_channel(vbdev->cache.desc);
+ qctx->core_ch = spdk_bdev_get_io_channel(vbdev->core.desc);
+ qctx->poller = SPDK_POLLER_REGISTER(queue_poll, qctx, 0);
+
+ return rc;
+}
+
+/* Called per thread
+ * Puts the OCF queue and relaunches the poller with a copied context,
+ * so that pending requests can still be completed */
+static void
+io_device_destroy_cb(void *io_device, void *ctx_buf)
+{
+ /* Make a copy of the context to use after the io channel is destroyed */
+ struct vbdev_ocf_qctx *copy = malloc(sizeof(*copy));
+ struct vbdev_ocf_qctx *qctx = ctx_buf;
+
+ if (copy) {
+ ocf_queue_set_priv(qctx->queue, copy);
+ memcpy(copy, qctx, sizeof(*copy));
+ spdk_poller_unregister(&qctx->poller);
+ copy->poller = SPDK_POLLER_REGISTER(queue_poll, copy, 0);
+ copy->allocated = true;
+ } else {
+ SPDK_ERRLOG("Unable to stop OCF queue properly: %s\n",
+ spdk_strerror(ENOMEM));
+ }
+
+ vbdev_ocf_queue_put(qctx->queue);
+}
+
+/* OCF management queue deinitialization */
+static void
+vbdev_ocf_ctx_mngt_queue_stop(ocf_queue_t q)
+{
+ struct spdk_poller *poller = ocf_queue_get_priv(q);
+
+ if (poller) {
+ spdk_poller_unregister(&poller);
+ }
+}
+
+static int
+mngt_queue_poll(void *opaque)
+{
+ ocf_queue_t q = opaque;
+ uint32_t iono = ocf_queue_pending_io(q);
+ int i, max = spdk_min(32, iono);
+
+ for (i = 0; i < max; i++) {
+ ocf_queue_run_single(q);
+ }
+
+ if (iono > 0) {
+ return SPDK_POLLER_BUSY;
+ } else {
+ return SPDK_POLLER_IDLE;
+ }
+}
+
+static void
+vbdev_ocf_ctx_mngt_queue_kick(ocf_queue_t q)
+{
+}
+
+/* Queue ops is an interface for running the queue thread;
+ * the stop() operation is called just before the queue gets destroyed */
+const struct ocf_queue_ops mngt_queue_ops = {
+ .kick_sync = NULL,
+ .kick = vbdev_ocf_ctx_mngt_queue_kick,
+ .stop = vbdev_ocf_ctx_mngt_queue_stop,
+};
+
+static void
+vbdev_ocf_mngt_exit(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int rc)
+{
+ vbdev->state.starting = false;
+ vbdev_ocf_mngt_stop(vbdev, rollback_path, rc);
+}
+
+/* Create exported spdk object */
+static void
+finish_register(struct vbdev_ocf *vbdev)
+{
+ int result;
+
+ /* Copy properties of the base bdev */
+ vbdev->exp_bdev.blocklen = vbdev->core.bdev->blocklen;
+ vbdev->exp_bdev.write_cache = vbdev->core.bdev->write_cache;
+ vbdev->exp_bdev.required_alignment = vbdev->core.bdev->required_alignment;
+
+ vbdev->exp_bdev.name = vbdev->name;
+ vbdev->exp_bdev.product_name = "SPDK OCF";
+
+ vbdev->exp_bdev.blockcnt = vbdev->core.bdev->blockcnt;
+ vbdev->exp_bdev.ctxt = vbdev;
+ vbdev->exp_bdev.fn_table = &cache_dev_fn_table;
+ vbdev->exp_bdev.module = &ocf_if;
+
+ /* Finally register vbdev in SPDK */
+ spdk_io_device_register(vbdev, io_device_create_cb, io_device_destroy_cb,
+ sizeof(struct vbdev_ocf_qctx), vbdev->name);
+ result = spdk_bdev_register(&vbdev->exp_bdev);
+ if (result) {
+ SPDK_ERRLOG("Could not register exposed bdev %s\n",
+ vbdev->name);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, result);
+ return;
+ } else {
+ vbdev->state.started = true;
+ }
+
+ vbdev_ocf_mngt_continue(vbdev, result);
+}
+
+static void
+add_core_cmpl(ocf_cache_t cache, ocf_core_t core, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = priv;
+
+ ocf_mngt_cache_unlock(cache);
+
+ if (error) {
+ SPDK_ERRLOG("Error %d, failed to add core device to cache instance %s,"
+ "starting rollback\n", error, vbdev->name);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error);
+ return;
+ } else {
+ vbdev->ocf_core = core;
+ }
+
+ vbdev_ocf_mngt_continue(vbdev, error);
+}
+
+/* Try to lock cache, then add core */
+static void
+add_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv;
+
+ if (error) {
+ SPDK_ERRLOG("Error %d, can not lock cache instance %s,"
+ "starting rollback\n", error, vbdev->name);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error);
+ }
+ ocf_mngt_cache_add_core(vbdev->ocf_cache, &vbdev->cfg.core, add_core_cmpl, vbdev);
+}
+
+/* Add core for existing OCF cache instance */
+static void
+add_core(struct vbdev_ocf *vbdev)
+{
+ ocf_mngt_cache_lock(vbdev->ocf_cache, add_core_cache_lock_cmpl, vbdev);
+}
+
+static void
+start_cache_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct vbdev_ocf *vbdev = priv;
+
+ ocf_mngt_cache_unlock(cache);
+
+ if (error) {
+ SPDK_ERRLOG("Error %d during start cache %s, starting rollback\n",
+ error, vbdev->name);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error);
+ return;
+ }
+
+ vbdev_ocf_mngt_continue(vbdev, error);
+}
+
+static int
+create_management_queue(struct vbdev_ocf *vbdev)
+{
+ struct spdk_poller *mngt_poller;
+ int rc;
+
+ rc = vbdev_ocf_queue_create(vbdev->ocf_cache, &vbdev->cache_ctx->mngt_queue, &mngt_queue_ops);
+ if (rc) {
+ SPDK_ERRLOG("Unable to create mngt_queue: %d\n", rc);
+ return rc;
+ }
+
+ mngt_poller = SPDK_POLLER_REGISTER(mngt_queue_poll, vbdev->cache_ctx->mngt_queue, 100);
+ if (mngt_poller == NULL) {
+ SPDK_ERRLOG("Unable to initiate mngt request: %s", spdk_strerror(ENOMEM));
+ return -ENOMEM;
+ }
+
+ ocf_queue_set_priv(vbdev->cache_ctx->mngt_queue, mngt_poller);
+ ocf_mngt_cache_set_mngt_queue(vbdev->ocf_cache, vbdev->cache_ctx->mngt_queue);
+
+ return 0;
+}
+
+/* Start OCF cache, attach caching device */
+static void
+start_cache(struct vbdev_ocf *vbdev)
+{
+ ocf_cache_t existing;
+ int rc;
+
+ if (is_ocf_cache_running(vbdev)) {
+ vbdev_ocf_mngt_stop(vbdev, NULL, -EALREADY);
+ return;
+ }
+
+ existing = get_other_cache_instance(vbdev);
+ if (existing) {
+ SPDK_NOTICELOG("OCF bdev %s connects to existing cache device %s\n",
+ vbdev->name, vbdev->cache.name);
+ vbdev->ocf_cache = existing;
+ ocf_mngt_cache_get(vbdev->ocf_cache);
+ vbdev->cache_ctx = ocf_cache_get_priv(existing);
+ vbdev_ocf_cache_ctx_get(vbdev->cache_ctx);
+ vbdev_ocf_mngt_continue(vbdev, 0);
+ return;
+ }
+
+ vbdev->cache_ctx = calloc(1, sizeof(struct vbdev_ocf_cache_ctx));
+ if (vbdev->cache_ctx == NULL) {
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, -ENOMEM);
+ return;
+ }
+
+ vbdev_ocf_cache_ctx_get(vbdev->cache_ctx);
+ pthread_mutex_init(&vbdev->cache_ctx->lock, NULL);
+
+ rc = ocf_mngt_cache_start(vbdev_ocf_ctx, &vbdev->ocf_cache, &vbdev->cfg.cache);
+ if (rc) {
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc);
+ return;
+ }
+ ocf_mngt_cache_get(vbdev->ocf_cache);
+
+ ocf_cache_set_priv(vbdev->ocf_cache, vbdev->cache_ctx);
+
+ rc = create_management_queue(vbdev);
+ if (rc) {
+ SPDK_ERRLOG("Unable to create mngt_queue: %d\n", rc);
+ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc);
+ return;
+ }
+
+ if (vbdev->cfg.loadq) {
+ ocf_mngt_cache_load(vbdev->ocf_cache, &vbdev->cfg.device, start_cache_cmpl, vbdev);
+ } else {
+ ocf_mngt_cache_attach(vbdev->ocf_cache, &vbdev->cfg.device, start_cache_cmpl, vbdev);
+ }
+}
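+
+/* ocf_mngt_cache_load() restores the cache from metadata already present on
+ * the cache device (the loadq path, e.g. after a dirty shutdown), while
+ * ocf_mngt_cache_attach() initializes the device as a brand new cache. */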
+
+/* Procedures called during register operation */
+vbdev_ocf_mngt_fn register_path[] = {
+ start_cache,
+ add_core,
+ finish_register,
+ NULL
+};
+
+/* Start cache instance and register OCF bdev */
+static void
+register_vbdev(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_callback cb, void *cb_arg)
+{
+ int rc;
+
+ if (!(vbdev->core.attached && vbdev->cache.attached) || vbdev->state.started) {
+ cb(-EPERM, vbdev, cb_arg);
+ return;
+ }
+
+ vbdev->state.starting = true;
+ rc = vbdev_ocf_mngt_start(vbdev, register_path, cb, cb_arg);
+ if (rc) {
+ cb(rc, vbdev, cb_arg);
+ }
+}
+
+/* Init OCF configuration options
+ * for core and cache devices */
+static void
+init_vbdev_config(struct vbdev_ocf *vbdev)
+{
+ struct vbdev_ocf_config *cfg = &vbdev->cfg;
+
+ snprintf(cfg->cache.name, sizeof(cfg->cache.name), "%s", vbdev->name);
+ snprintf(cfg->core.name, sizeof(cfg->core.name), "%s", vbdev->core.name);
+
+ /* TODO [metadata]: make configurable with persistent
+ * metadata support */
+ cfg->cache.metadata_volatile = false;
+
+ /* TODO [cache line size]: make cache line size configurable
+ * Using standard 4KiB for now */
+ cfg->cache.cache_line_size = ocf_cache_line_size_4;
+
+ /* These are suggested values that
+ * should be sufficient for most use cases */
+ cfg->cache.backfill.max_queue_size = 65536;
+ cfg->cache.backfill.queue_unblock_size = 60000;
+
+ /* TODO [cache line size] */
+ cfg->device.cache_line_size = ocf_cache_line_size_4;
+ cfg->device.force = true;
+ cfg->device.perform_test = false;
+ cfg->device.discard_on_start = false;
+
+ vbdev->cfg.cache.locked = true;
+
+ cfg->core.volume_type = SPDK_OBJECT;
+ cfg->device.volume_type = SPDK_OBJECT;
+
+ if (vbdev->cfg.loadq) {
+		/* When doing cache_load(), we need to set try_add to true,
+		 * otherwise OCF will interpret this core as a new one
+		 * instead of an inactive one */
+ vbdev->cfg.core.try_add = true;
+ }
+
+	/* Serialize bdev names into OCF UUIDs so they can be interpreted on future loads
+	 * The core UUID is a triple of (core name, vbdev name, cache name)
+	 * The cache UUID is the cache bdev name */
+ cfg->device.uuid.size = strlen(vbdev->cache.name) + 1;
+ cfg->device.uuid.data = vbdev->cache.name;
+
+ snprintf(vbdev->uuid, VBDEV_OCF_MD_MAX_LEN, "%s %s %s",
+ vbdev->core.name, vbdev->name, vbdev->cache.name);
+ cfg->core.uuid.size = strlen(vbdev->uuid) + 1;
+ cfg->core.uuid.data = vbdev->uuid;
+ vbdev->uuid[strlen(vbdev->core.name)] = 0;
+ vbdev->uuid[strlen(vbdev->core.name) + 1 + strlen(vbdev->name)] = 0;
+}
+
+/* Allocate vbdev structure object and add it to the global list */
+static int
+init_vbdev(const char *vbdev_name,
+ const char *cache_mode_name,
+ const char *cache_name,
+ const char *core_name,
+ bool loadq)
+{
+ struct vbdev_ocf *vbdev;
+ int rc = 0;
+
+ if (spdk_bdev_get_by_name(vbdev_name) || vbdev_ocf_get_by_name(vbdev_name)) {
+ SPDK_ERRLOG("Device with name '%s' already exists\n", vbdev_name);
+ return -EPERM;
+ }
+
+ vbdev = calloc(1, sizeof(*vbdev));
+ if (!vbdev) {
+ goto error_mem;
+ }
+
+ vbdev->cache.parent = vbdev;
+ vbdev->core.parent = vbdev;
+ vbdev->cache.is_cache = true;
+ vbdev->core.is_cache = false;
+
+ if (cache_mode_name) {
+ vbdev->cfg.cache.cache_mode
+ = ocf_get_cache_mode(cache_mode_name);
+ } else if (!loadq) { /* In load path it is OK to pass NULL as cache mode */
+ SPDK_ERRLOG("No cache mode specified\n");
+ rc = -EINVAL;
+ goto error_free;
+ }
+ if (vbdev->cfg.cache.cache_mode < 0) {
+ SPDK_ERRLOG("Incorrect cache mode '%s'\n", cache_mode_name);
+ rc = -EINVAL;
+ goto error_free;
+ }
+
+ vbdev->name = strdup(vbdev_name);
+ if (!vbdev->name) {
+ goto error_mem;
+ }
+
+ vbdev->cache.name = strdup(cache_name);
+ if (!vbdev->cache.name) {
+ goto error_mem;
+ }
+
+ vbdev->core.name = strdup(core_name);
+ if (!vbdev->core.name) {
+ goto error_mem;
+ }
+
+ vbdev->cfg.loadq = loadq;
+ init_vbdev_config(vbdev);
+ TAILQ_INSERT_TAIL(&g_ocf_vbdev_head, vbdev, tailq);
+ return rc;
+
+error_mem:
+ rc = -ENOMEM;
+error_free:
+ free_vbdev(vbdev);
+ return rc;
+}
+
+/* Read the configuration file at the start of the SPDK application
+ * This adds vbdevs to the global list if any are mentioned in the config */
+static int
+vbdev_ocf_init(void)
+{
+ const char *vbdev_name, *modename, *cache_name, *core_name;
+ struct spdk_conf_section *sp;
+ int status;
+
+ status = vbdev_ocf_ctx_init();
+ if (status) {
+ SPDK_ERRLOG("OCF ctx initialization failed with=%d\n", status);
+ return status;
+ }
+
+ status = vbdev_ocf_volume_init();
+ if (status) {
+ vbdev_ocf_ctx_cleanup();
+ SPDK_ERRLOG("OCF volume initialization failed with=%d\n", status);
+ return status;
+ }
+
+ sp = spdk_conf_find_section(NULL, "OCF");
+ if (sp == NULL) {
+ return 0;
+ }
+
+ for (int i = 0; ; i++) {
+ if (!spdk_conf_section_get_nval(sp, "OCF", i)) {
+ break;
+ }
+
+ vbdev_name = spdk_conf_section_get_nmval(sp, "OCF", i, 0);
+ if (!vbdev_name) {
+ SPDK_ERRLOG("No vbdev name specified\n");
+ continue;
+ }
+
+ modename = spdk_conf_section_get_nmval(sp, "OCF", i, 1);
+ if (!modename) {
+ SPDK_ERRLOG("No modename specified for OCF vbdev '%s'\n", vbdev_name);
+ continue;
+ }
+
+ cache_name = spdk_conf_section_get_nmval(sp, "OCF", i, 2);
+ if (!cache_name) {
+ SPDK_ERRLOG("No cache device specified for OCF vbdev '%s'\n", vbdev_name);
+ continue;
+ }
+
+ core_name = spdk_conf_section_get_nmval(sp, "OCF", i, 3);
+ if (!core_name) {
+ SPDK_ERRLOG("No core devices specified for OCF vbdev '%s'\n", vbdev_name);
+ continue;
+ }
+
+ status = init_vbdev(vbdev_name, modename, cache_name, core_name, false);
+ if (status) {
+ SPDK_ERRLOG("Config initialization failed with code: %d\n", status);
+ }
+ }
+
+ return status;
+}
+
+/* Called after application shutdown has started
+ * Release memory of allocated structures here */
+static void
+vbdev_ocf_module_fini(void)
+{
+ struct vbdev_ocf *vbdev;
+
+ while ((vbdev = TAILQ_FIRST(&g_ocf_vbdev_head))) {
+ TAILQ_REMOVE(&g_ocf_vbdev_head, vbdev, tailq);
+ free_vbdev(vbdev);
+ }
+
+ vbdev_ocf_volume_cleanup();
+ vbdev_ocf_ctx_cleanup();
+}
+
+/* Called when a base device gets unplugged
+ * When a core device is removed, we delete its parent OCF bdev
+ * When a cache device is removed, we delete every OCF bdev that used it */
+static void
+hotremove_cb(void *ctx)
+{
+ struct vbdev_ocf_base *base = ctx;
+ struct vbdev_ocf *vbdev;
+
+ if (!base->is_cache) {
+ if (base->parent->state.doing_finish) {
+ return;
+ }
+
+ SPDK_NOTICELOG("Deinitializing '%s' because its core device '%s' was removed\n",
+ base->parent->name, base->name);
+ vbdev_ocf_delete(base->parent, NULL, NULL);
+ return;
+ }
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (vbdev->state.doing_finish) {
+ continue;
+ }
+ if (strcmp(base->name, vbdev->cache.name) == 0) {
+ SPDK_NOTICELOG("Deinitializing '%s' because"
+ " its cache device '%s' was removed\n",
+ vbdev->name, base->name);
+ vbdev_ocf_delete(vbdev, NULL, NULL);
+ }
+ }
+}
+
+/* Open base SPDK bdev and claim it */
+static int
+attach_base(struct vbdev_ocf_base *base)
+{
+ int status;
+
+ if (base->attached) {
+ return -EALREADY;
+ }
+
+	/* If the base cache bdev was already opened by another vbdev,
+	 * we just copy its descriptor here */
+ if (base->is_cache) {
+ struct vbdev_ocf_base *existing = get_other_cache_base(base);
+ if (existing) {
+ base->desc = existing->desc;
+ base->management_channel = existing->management_channel;
+ base->attached = true;
+ return 0;
+ }
+ }
+
+ status = spdk_bdev_open(base->bdev, true, hotremove_cb, base, &base->desc);
+ if (status) {
+ SPDK_ERRLOG("Unable to open device '%s' for writing\n", base->name);
+ return status;
+ }
+
+ status = spdk_bdev_module_claim_bdev(base->bdev, base->desc,
+ &ocf_if);
+ if (status) {
+ SPDK_ERRLOG("Unable to claim device '%s'\n", base->name);
+ spdk_bdev_close(base->desc);
+ return status;
+ }
+
+ base->management_channel = spdk_bdev_get_io_channel(base->desc);
+ if (!base->management_channel) {
+ SPDK_ERRLOG("Unable to get io channel '%s'\n", base->name);
+ spdk_bdev_module_release_bdev(base->bdev);
+ spdk_bdev_close(base->desc);
+ return -ENOMEM;
+ }
+
+ /* Save the thread where the base device is opened */
+ base->thread = spdk_get_thread();
+
+ base->attached = true;
+ return status;
+}
+
+/* Attach base bdevs */
+static int
+attach_base_bdevs(struct vbdev_ocf *vbdev,
+ struct spdk_bdev *cache_bdev,
+ struct spdk_bdev *core_bdev)
+{
+ int rc = 0;
+
+ if (cache_bdev) {
+ vbdev->cache.bdev = cache_bdev;
+ rc |= attach_base(&vbdev->cache);
+ }
+
+ if (core_bdev) {
+ vbdev->core.bdev = core_bdev;
+ rc |= attach_base(&vbdev->core);
+ }
+
+ return rc;
+}
+
+/* Init and then start vbdev if all base devices are present */
+void
+vbdev_ocf_construct(const char *vbdev_name,
+ const char *cache_mode_name,
+ const char *cache_name,
+ const char *core_name,
+ bool loadq,
+ void (*cb)(int, struct vbdev_ocf *, void *),
+ void *cb_arg)
+{
+ int rc;
+ struct spdk_bdev *cache_bdev = spdk_bdev_get_by_name(cache_name);
+ struct spdk_bdev *core_bdev = spdk_bdev_get_by_name(core_name);
+ struct vbdev_ocf *vbdev;
+
+ rc = init_vbdev(vbdev_name, cache_mode_name, cache_name, core_name, loadq);
+ if (rc) {
+ cb(rc, NULL, cb_arg);
+ return;
+ }
+
+ vbdev = vbdev_ocf_get_by_name(vbdev_name);
+ if (vbdev == NULL) {
+ cb(-ENODEV, NULL, cb_arg);
+ return;
+ }
+
+ if (cache_bdev == NULL) {
+ SPDK_NOTICELOG("OCF bdev '%s' is waiting for cache device '%s' to connect\n",
+ vbdev->name, cache_name);
+ }
+ if (core_bdev == NULL) {
+ SPDK_NOTICELOG("OCF bdev '%s' is waiting for core device '%s' to connect\n",
+ vbdev->name, core_name);
+ }
+
+ rc = attach_base_bdevs(vbdev, cache_bdev, core_bdev);
+ if (rc) {
+ cb(rc, vbdev, cb_arg);
+ return;
+ }
+
+ if (core_bdev && cache_bdev) {
+ register_vbdev(vbdev, cb, cb_arg);
+ } else {
+ cb(0, vbdev, cb_arg);
+ }
+}
+
+/* Called when a new device is created in the SPDK application
+ * If that device is named as one of the base bdevs of an OCF vbdev,
+ * claim and open it */
+static void
+vbdev_ocf_examine(struct spdk_bdev *bdev)
+{
+ const char *bdev_name = spdk_bdev_get_name(bdev);
+ struct vbdev_ocf *vbdev;
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (vbdev->state.doing_finish) {
+ continue;
+ }
+
+ if (!strcmp(bdev_name, vbdev->cache.name)) {
+ attach_base_bdevs(vbdev, bdev, NULL);
+ continue;
+ }
+ if (!strcmp(bdev_name, vbdev->core.name)) {
+ attach_base_bdevs(vbdev, NULL, bdev);
+ break;
+ }
+ }
+ spdk_bdev_module_examine_done(&ocf_if);
+}
+
+struct metadata_probe_ctx {
+ struct vbdev_ocf_base base;
+ ocf_volume_t volume;
+
+ struct ocf_volume_uuid *core_uuids;
+ unsigned int uuid_count;
+
+ int result;
+ int refcnt;
+};
+
+static void
+_examine_ctx_put(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+static void
+examine_ctx_put(struct metadata_probe_ctx *ctx)
+{
+ unsigned int i;
+
+ ctx->refcnt--;
+ if (ctx->refcnt > 0) {
+ return;
+ }
+
+ if (ctx->result) {
+ SPDK_ERRLOG("OCF metadata probe for bdev '%s' failed with %d\n",
+ spdk_bdev_get_name(ctx->base.bdev), ctx->result);
+ }
+
+ if (ctx->base.desc) {
+ /* Close the underlying bdev on its same opened thread. */
+ if (ctx->base.thread && ctx->base.thread != spdk_get_thread()) {
+ spdk_thread_send_msg(ctx->base.thread, _examine_ctx_put, ctx->base.desc);
+ } else {
+ spdk_bdev_close(ctx->base.desc);
+ }
+ }
+
+ if (ctx->volume) {
+ ocf_volume_destroy(ctx->volume);
+ }
+
+ if (ctx->core_uuids) {
+ for (i = 0; i < ctx->uuid_count; i++) {
+ free(ctx->core_uuids[i].data);
+ }
+ }
+ free(ctx->core_uuids);
+
+ examine_done(ctx->result, NULL, ctx->base.bdev);
+ free(ctx);
+}
+
+static void
+metadata_probe_construct_cb(int rc, struct vbdev_ocf *vbdev, void *vctx)
+{
+ struct metadata_probe_ctx *ctx = vctx;
+
+ examine_ctx_put(ctx);
+}
+
+/* This is the second callback for ocf_metadata_probe_cores()
+ * Here we create vbdev configurations based on the UUIDs */
+static void
+metadata_probe_cores_construct(void *priv, int error, unsigned int num_cores)
+{
+ struct metadata_probe_ctx *ctx = priv;
+ const char *vbdev_name;
+ const char *core_name;
+ const char *cache_name;
+ unsigned int i;
+
+ if (error) {
+ ctx->result = error;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ for (i = 0; i < num_cores; i++) {
+ core_name = ocf_uuid_to_str(&ctx->core_uuids[i]);
+ vbdev_name = core_name + strlen(core_name) + 1;
+ cache_name = vbdev_name + strlen(vbdev_name) + 1;
+
+ if (strcmp(ctx->base.bdev->name, cache_name)) {
+ SPDK_NOTICELOG("OCF metadata found on %s belongs to bdev named '%s'\n",
+ ctx->base.bdev->name, cache_name);
+ }
+
+ ctx->refcnt++;
+ vbdev_ocf_construct(vbdev_name, NULL, cache_name, core_name, true,
+ metadata_probe_construct_cb, ctx);
+ }
+
+ examine_ctx_put(ctx);
+}
+
+/* This callback is called after OCF reads the core UUIDs from cache metadata
+ * Here we allocate memory for those UUIDs and call ocf_metadata_probe_cores() again */
+static void
+metadata_probe_cores_get_num(void *priv, int error, unsigned int num_cores)
+{
+ struct metadata_probe_ctx *ctx = priv;
+ unsigned int i;
+
+ if (error) {
+ ctx->result = error;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ ctx->uuid_count = num_cores;
+ ctx->core_uuids = calloc(num_cores, sizeof(struct ocf_volume_uuid));
+ if (!ctx->core_uuids) {
+ ctx->result = -ENOMEM;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ for (i = 0; i < ctx->uuid_count; i++) {
+ ctx->core_uuids[i].size = OCF_VOLUME_UUID_MAX_SIZE;
+ ctx->core_uuids[i].data = malloc(OCF_VOLUME_UUID_MAX_SIZE);
+ if (!ctx->core_uuids[i].data) {
+ ctx->result = -ENOMEM;
+ examine_ctx_put(ctx);
+ return;
+ }
+ }
+
+ ocf_metadata_probe_cores(vbdev_ocf_ctx, ctx->volume, ctx->core_uuids, ctx->uuid_count,
+ metadata_probe_cores_construct, ctx);
+}
+
+static void
+metadata_probe_cb(void *priv, int rc,
+ struct ocf_metadata_probe_status *status)
+{
+ struct metadata_probe_ctx *ctx = priv;
+
+ if (rc) {
+ /* -ENODATA means device does not have cache metadata on it */
+ if (rc != -OCF_ERR_NO_METADATA) {
+ ctx->result = rc;
+ }
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ ocf_metadata_probe_cores(vbdev_ocf_ctx, ctx->volume, NULL, 0,
+ metadata_probe_cores_get_num, ctx);
+}
+
+/* This is called after vbdev_ocf_examine
+ * It allows us to delay application initialization
+ * until all OCF bdevs get registered
+ * If a vbdev has all of its base devices, it starts asynchronously here
+ * We first check whether the bdev appears in the configuration;
+ * if not, we do metadata_probe() to create its configuration from the bdev metadata */
+static void
+vbdev_ocf_examine_disk(struct spdk_bdev *bdev)
+{
+ const char *bdev_name = spdk_bdev_get_name(bdev);
+ struct vbdev_ocf *vbdev;
+ struct metadata_probe_ctx *ctx;
+ bool created_from_config = false;
+ int rc;
+
+ examine_start(bdev);
+
+ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) {
+ if (vbdev->state.doing_finish || vbdev->state.started) {
+ continue;
+ }
+
+ if (!strcmp(bdev_name, vbdev->cache.name)) {
+ examine_start(bdev);
+ register_vbdev(vbdev, examine_done, bdev);
+ created_from_config = true;
+ continue;
+ }
+ if (!strcmp(bdev_name, vbdev->core.name)) {
+ examine_start(bdev);
+ register_vbdev(vbdev, examine_done, bdev);
+ examine_done(0, NULL, bdev);
+ return;
+ }
+ }
+
+	/* If the device was discovered via the config, we do not check for metadata */
+ if (created_from_config) {
+ examine_done(0, NULL, bdev);
+ return;
+ }
+
+	/* Metadata probe path
+	 * We create a temporary OCF volume and a temporary base structure
+	 * to use them for ocf_metadata_probe() and for bottom adapter IOs
+	 * Then we get the UUIDs of the core devices and create configurations based on them */
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ examine_done(-ENOMEM, NULL, bdev);
+ return;
+ }
+
+ ctx->base.bdev = bdev;
+ ctx->refcnt = 1;
+
+ rc = spdk_bdev_open(ctx->base.bdev, true, NULL, NULL, &ctx->base.desc);
+ if (rc) {
+ ctx->result = rc;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ rc = ocf_ctx_volume_create(vbdev_ocf_ctx, &ctx->volume, NULL, SPDK_OBJECT);
+ if (rc) {
+ ctx->result = rc;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ rc = ocf_volume_open(ctx->volume, &ctx->base);
+ if (rc) {
+ ctx->result = rc;
+ examine_ctx_put(ctx);
+ return;
+ }
+
+ /* Save the thread where the base device is opened */
+ ctx->base.thread = spdk_get_thread();
+
+ ocf_metadata_probe(vbdev_ocf_ctx, ctx->volume, metadata_probe_cb, ctx);
+}
+
+static int
+vbdev_ocf_get_ctx_size(void)
+{
+ return sizeof(struct bdev_ocf_data);
+}
+
+static void
+fini_start(void)
+{
+ g_fini_started = true;
+}
+
+/* Module-global function table
+ * Does not relate to vbdev instances */
+static struct spdk_bdev_module ocf_if = {
+ .name = "ocf",
+ .module_init = vbdev_ocf_init,
+ .fini_start = fini_start,
+ .module_fini = vbdev_ocf_module_fini,
+ .config_text = NULL,
+ .get_ctx_size = vbdev_ocf_get_ctx_size,
+ .examine_config = vbdev_ocf_examine,
+ .examine_disk = vbdev_ocf_examine_disk,
+};
+SPDK_BDEV_MODULE_REGISTER(ocf, &ocf_if);
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_ocf", SPDK_TRACE_VBDEV_OCF)
diff --git a/src/spdk/module/bdev/ocf/vbdev_ocf.h b/src/spdk/module/bdev/ocf/vbdev_ocf.h
new file mode 100644
index 000000000..d0fd0b183
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/vbdev_ocf.h
@@ -0,0 +1,210 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_OCF_H
+#define SPDK_VBDEV_OCF_H
+
+#include <ocf/ocf.h>
+
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+
+#define VBDEV_OCF_MD_MAX_LEN 4096
+
+struct vbdev_ocf;
+
+/* Context for OCF queue poller
+ * Used for mapping SPDK threads to OCF queues */
+struct vbdev_ocf_qctx {
+ /* OCF queue. Contains OCF requests */
+ struct ocf_queue *queue;
+ /* Poller for OCF queue. Runs OCF requests */
+ struct spdk_poller *poller;
+ /* Reference to parent vbdev */
+ struct vbdev_ocf *vbdev;
+ /* Base devices channels */
+ struct spdk_io_channel *cache_ch;
+ struct spdk_io_channel *core_ch;
+ /* If true, we have to free this context on queue stop */
+ bool allocated;
+ /* Link to per-bdev list of queue contexts */
+ TAILQ_ENTRY(vbdev_ocf_qctx) tailq;
+};
+
+/* Important states */
+struct vbdev_ocf_state {
+ /* From the moment when clean delete started */
+ bool doing_clean_delete;
+ /* From the moment when finish started */
+ bool doing_finish;
+	/* From the moment when a reset IO is received, until it is completed */
+ bool doing_reset;
+ /* From the moment when exp_bdev is registered */
+ bool started;
+ /* From the moment when register path started */
+ bool starting;
+ /* Status of last attempt for stopping this device */
+ int stop_status;
+};
+
+/*
+ * OCF cache configuration options
+ */
+struct vbdev_ocf_config {
+ /* Initial cache configuration */
+ struct ocf_mngt_cache_config cache;
+
+ /* Cache device config */
+ struct ocf_mngt_cache_device_config device;
+
+ /* Core initial config */
+ struct ocf_mngt_core_config core;
+
+	/* Load flag; if true, we try to load the cache instance from disk,
+	 * otherwise we create a new cache on that disk */
+ bool loadq;
+};
+
+/* Types for management operations */
+typedef void (*vbdev_ocf_mngt_fn)(struct vbdev_ocf *);
+typedef void (*vbdev_ocf_mngt_callback)(int, struct vbdev_ocf *, void *);
+
+/* Context for asynchronous management operations
+ * Single management operation usually contains a list of sub procedures,
+ * this structure handles sharing between those sub procedures */
+struct vbdev_ocf_mngt_ctx {
+	/* Pointer to the step function that is currently being executed
+	 * It is advanced after each step until it dereferences to NULL */
+ vbdev_ocf_mngt_fn *current_step;
+
+ /* Function that gets invoked by poller on each iteration */
+ vbdev_ocf_mngt_fn poller_fn;
+ /* Poller timeout time stamp - when the poller should stop with error */
+ uint64_t timeout_ts;
+
+ /* Status of management operation */
+ int status;
+
+ /* External callback and its argument */
+ vbdev_ocf_mngt_callback cb;
+ void *cb_arg;
+};
+
+/* Base device info */
+struct vbdev_ocf_base {
+ /* OCF internal name */
+ char *name;
+
+ /* True if this is a caching device */
+ bool is_cache;
+
+ /* Connected SPDK block device */
+ struct spdk_bdev *bdev;
+
+ /* SPDK device io handle */
+ struct spdk_bdev_desc *desc;
+
+ /* True if SPDK bdev has been claimed and opened for writing */
+ bool attached;
+
+ /* Channel for cleaner operations */
+ struct spdk_io_channel *management_channel;
+
+ /* Reference to main vbdev */
+ struct vbdev_ocf *parent;
+
+ /* thread where base device is opened */
+ struct spdk_thread *thread;
+};
+
+/*
+ * The main information provider
+ * It's also registered as io_device
+ */
+struct vbdev_ocf {
+ /* Exposed unique name */
+ char *name;
+
+ /* Base bdevs */
+ struct vbdev_ocf_base cache;
+ struct vbdev_ocf_base core;
+
+ /* Base bdevs OCF objects */
+ ocf_cache_t ocf_cache;
+ ocf_core_t ocf_core;
+
+ /* Parameters */
+ struct vbdev_ocf_config cfg;
+ struct vbdev_ocf_state state;
+
+ /* Management context */
+ struct vbdev_ocf_mngt_ctx mngt_ctx;
+	/* Cache context */
+ struct vbdev_ocf_cache_ctx *cache_ctx;
+
+ /* Exposed SPDK bdev. Registered in bdev layer */
+ struct spdk_bdev exp_bdev;
+
+ /* OCF uuid for core device of this vbdev */
+ char uuid[VBDEV_OCF_MD_MAX_LEN];
+
+ /* Link to global list of this type structures */
+ TAILQ_ENTRY(vbdev_ocf) tailq;
+};
+
+void vbdev_ocf_construct(
+ const char *vbdev_name,
+ const char *cache_mode_name,
+ const char *cache_name,
+ const char *core_name,
+ bool loadq,
+ void (*cb)(int, struct vbdev_ocf *, void *),
+ void *cb_arg);
+
+/* If vbdev is online, return its object */
+struct vbdev_ocf *vbdev_ocf_get_by_name(const char *name);
+
+/* Return matching base if parent vbdev is online */
+struct vbdev_ocf_base *vbdev_ocf_get_base_by_name(const char *name);
+
+/* Stop OCF cache and unregister SPDK bdev */
+int vbdev_ocf_delete(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg);
+
+int vbdev_ocf_delete_clean(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg);
+
+typedef void (*vbdev_ocf_foreach_fn)(struct vbdev_ocf *, void *);
+
+/* Execute fn for each OCF device that is online or waits for base devices */
+void vbdev_ocf_foreach(vbdev_ocf_foreach_fn fn, void *ctx);
+
+#endif
diff --git a/src/spdk/module/bdev/ocf/vbdev_ocf_rpc.c b/src/spdk/module/bdev/ocf/vbdev_ocf_rpc.c
new file mode 100644
index 000000000..89286fe23
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/vbdev_ocf_rpc.c
@@ -0,0 +1,362 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vbdev_ocf.h"
+#include "stats.h"
+#include "spdk/log.h"
+#include "spdk/rpc.h"
+#include "spdk/string.h"
+
+/* Structure to hold the parameters for this RPC method. */
+struct rpc_bdev_ocf_create {
+ char *name; /* master vbdev */
+ char *mode; /* OCF mode (choose one) */
+ char *cache_bdev_name; /* sub bdev */
+ char *core_bdev_name; /* sub bdev */
+};
+
+static void
+free_rpc_bdev_ocf_create(struct rpc_bdev_ocf_create *r)
+{
+ free(r->name);
+ free(r->core_bdev_name);
+ free(r->cache_bdev_name);
+ free(r->mode);
+}
+
+/* Structure to decode the input parameters for this RPC method. */
+static const struct spdk_json_object_decoder rpc_bdev_ocf_create_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_ocf_create, name), spdk_json_decode_string},
+ {"mode", offsetof(struct rpc_bdev_ocf_create, mode), spdk_json_decode_string},
+ {"cache_bdev_name", offsetof(struct rpc_bdev_ocf_create, cache_bdev_name), spdk_json_decode_string},
+ {"core_bdev_name", offsetof(struct rpc_bdev_ocf_create, core_bdev_name), spdk_json_decode_string},
+};
+
+static void
+construct_cb(int status, struct vbdev_ocf *vbdev, void *cb_arg)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ if (status) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Could not create OCF vbdev: %d",
+ status);
+ } else {
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, vbdev->name);
+ spdk_jsonrpc_end_result(request, w);
+ }
+}
+
+static void
+rpc_bdev_ocf_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_ocf_create req = {NULL};
+ int ret;
+
+ ret = spdk_json_decode_object(params, rpc_bdev_ocf_create_decoders,
+ SPDK_COUNTOF(rpc_bdev_ocf_create_decoders),
+ &req);
+ if (ret) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ free_rpc_bdev_ocf_create(&req);
+ return;
+ }
+
+ vbdev_ocf_construct(req.name, req.mode, req.cache_bdev_name, req.core_bdev_name, false,
+ construct_cb, request);
+ free_rpc_bdev_ocf_create(&req);
+}
+SPDK_RPC_REGISTER("bdev_ocf_create", rpc_bdev_ocf_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ocf_create, construct_ocf_bdev)
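+
+/* Example invocation (illustrative; the bdev names and mode are placeholders):
+ *   scripts/rpc.py bdev_ocf_create Cache1 wt Malloc0 Nvme0n1
+ * which decodes to {"name": "Cache1", "mode": "wt",
+ * "cache_bdev_name": "Malloc0", "core_bdev_name": "Nvme0n1"} */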
+
+/* Structure to hold the parameters for this RPC method. */
+struct rpc_bdev_ocf_delete {
+ char *name; /* master vbdev name */
+};
+
+static void
+free_rpc_bdev_ocf_delete(struct rpc_bdev_ocf_delete *r)
+{
+ free(r->name);
+}
+
+/* Structure to decode the input parameters for this RPC method. */
+static const struct spdk_json_object_decoder rpc_bdev_ocf_delete_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_ocf_delete, name), spdk_json_decode_string},
+};
+
+static void
+delete_cb(void *cb_arg, int status)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ if (status) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Could not delete OCF vbdev: %d",
+ status);
+ } else {
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ }
+}
+
+static void
+rpc_bdev_ocf_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_ocf_delete req = {NULL};
+ struct vbdev_ocf *vbdev;
+ int status;
+
+ status = spdk_json_decode_object(params, rpc_bdev_ocf_delete_decoders,
+ SPDK_COUNTOF(rpc_bdev_ocf_delete_decoders),
+ &req);
+ if (status) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto end;
+ }
+
+ vbdev = vbdev_ocf_get_by_name(req.name);
+ if (vbdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(ENODEV));
+ goto end;
+ }
+
+ status = vbdev_ocf_delete_clean(vbdev, delete_cb, request);
+ if (status) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Could not delete OCF vbdev: %s",
+ spdk_strerror(-status));
+ goto end;
+ }
+
+end:
+ free_rpc_bdev_ocf_delete(&req);
+}
+SPDK_RPC_REGISTER("bdev_ocf_delete", rpc_bdev_ocf_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ocf_delete, delete_ocf_bdev)
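+
+/* Example invocation (illustrative name): scripts/rpc.py bdev_ocf_delete Cache1 */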
+
+/* Structure to hold the parameters for this RPC method. */
+struct rpc_bdev_ocf_get_stats {
+ char *name; /* master vbdev name */
+};
+
+static void
+free_rpc_bdev_ocf_get_stats(struct rpc_bdev_ocf_get_stats *r)
+{
+ free(r->name);
+}
+
+/* Structure to decode the input parameters for this RPC method. */
+static const struct spdk_json_object_decoder rpc_bdev_ocf_get_stats_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_ocf_get_stats, name), spdk_json_decode_string},
+};
+
+struct get_ocf_stats_ctx {
+ struct spdk_jsonrpc_request *request;
+ char *core_name;
+};
+
+static void
+rpc_bdev_ocf_get_stats_cmpl(ocf_cache_t cache, void *priv, int error)
+{
+ struct get_ocf_stats_ctx *ctx = (struct get_ocf_stats_ctx *) priv;
+ struct spdk_json_write_ctx *w;
+ struct vbdev_ocf_stats stats;
+
+ if (error) {
+ goto end;
+ }
+
+ error = vbdev_ocf_stats_get(cache, ctx->core_name, &stats);
+
+ ocf_mngt_cache_read_unlock(cache);
+
+ if (error) {
+ goto end;
+ }
+
+ w = spdk_jsonrpc_begin_result(ctx->request);
+ vbdev_ocf_stats_write_json(w, &stats);
+ spdk_jsonrpc_end_result(ctx->request, w);
+
+end:
+ if (error) {
+ spdk_jsonrpc_send_error_response_fmt(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Could not get stats: %s",
+ spdk_strerror(-error));
+ }
+ free(ctx);
+}
+
+static void
+rpc_bdev_ocf_get_stats(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_ocf_get_stats req = {NULL};
+ struct vbdev_ocf *vbdev;
+ struct get_ocf_stats_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Not enough memory to process request");
+ goto end;
+ }
+
+ if (spdk_json_decode_object(params, rpc_bdev_ocf_get_stats_decoders,
+ SPDK_COUNTOF(rpc_bdev_ocf_get_stats_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ free(ctx);
+ goto end;
+ }
+
+ vbdev = vbdev_ocf_get_by_name(req.name);
+ if (vbdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(ENODEV));
+ free(ctx);
+ goto end;
+ }
+
+ ctx->core_name = vbdev->core.name;
+ ctx->request = request;
+ ocf_mngt_cache_read_lock(vbdev->ocf_cache, rpc_bdev_ocf_get_stats_cmpl, ctx);
+
+end:
+ free_rpc_bdev_ocf_get_stats(&req);
+}
+SPDK_RPC_REGISTER("bdev_ocf_get_stats", rpc_bdev_ocf_get_stats, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ocf_get_stats, get_ocf_stats)
+
+/* Structure to hold the parameters for this RPC method. */
+struct rpc_bdev_ocf_get_bdevs {
+ char *name;
+};
+
+static void
+free_rpc_bdev_ocf_get_bdevs(struct rpc_bdev_ocf_get_bdevs *r)
+{
+ free(r->name);
+}
+
+/* Structure to decode the input parameters for this RPC method. */
+static const struct spdk_json_object_decoder rpc_bdev_ocf_get_bdevs_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_ocf_get_bdevs, name), spdk_json_decode_string, true},
+};
+
+struct bdev_get_bdevs_ctx {
+ char *name;
+ struct spdk_json_write_ctx *w;
+};
+
+static void
+bdev_get_bdevs_fn(struct vbdev_ocf *vbdev, void *ctx)
+{
+ struct bdev_get_bdevs_ctx *cctx = ctx;
+ struct spdk_json_write_ctx *w = cctx->w;
+
+ if (cctx->name != NULL &&
+ strcmp(vbdev->name, cctx->name) &&
+ strcmp(vbdev->cache.name, cctx->name) &&
+ strcmp(vbdev->core.name, cctx->name)) {
+ return;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", vbdev->name);
+ spdk_json_write_named_bool(w, "started", vbdev->state.started);
+
+ spdk_json_write_named_object_begin(w, "cache");
+ spdk_json_write_named_string(w, "name", vbdev->cache.name);
+ spdk_json_write_named_bool(w, "attached", vbdev->cache.attached);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_object_begin(w, "core");
+ spdk_json_write_named_string(w, "name", vbdev->core.name);
+ spdk_json_write_named_bool(w, "attached", vbdev->core.attached);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+rpc_bdev_ocf_get_bdevs(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ struct rpc_bdev_ocf_get_bdevs req = {NULL};
+ struct bdev_get_bdevs_ctx cctx;
+
+ if (params && spdk_json_decode_object(params, rpc_bdev_ocf_get_bdevs_decoders,
+ SPDK_COUNTOF(rpc_bdev_ocf_get_bdevs_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto end;
+ }
+
+ if (req.name) {
+ if (!(vbdev_ocf_get_by_name(req.name) || vbdev_ocf_get_base_by_name(req.name))) {
+ spdk_jsonrpc_send_error_response(request,
+ SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(ENODEV));
+ goto end;
+ }
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+
+ cctx.name = req.name;
+ cctx.w = w;
+
+ spdk_json_write_array_begin(w);
+ vbdev_ocf_foreach(bdev_get_bdevs_fn, &cctx);
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+
+end:
+ free_rpc_bdev_ocf_get_bdevs(&req);
+}
+SPDK_RPC_REGISTER("bdev_ocf_get_bdevs", rpc_bdev_ocf_get_bdevs, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_ocf_get_bdevs, get_ocf_bdevs)
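+
+/* Example output element (illustrative names), as written by bdev_get_bdevs_fn():
+ *   {"name": "Cache1", "started": true,
+ *    "cache": {"name": "Malloc0", "attached": true},
+ *    "core": {"name": "Nvme0n1", "attached": true}} */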
diff --git a/src/spdk/module/bdev/ocf/volume.c b/src/spdk/module/bdev/ocf/volume.c
new file mode 100644
index 000000000..de683b852
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/volume.c
@@ -0,0 +1,441 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <ocf/ocf.h>
+
+#include "spdk/bdev_module.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk_internal/log.h"
+
+#include "data.h"
+#include "volume.h"
+#include "ctx.h"
+#include "vbdev_ocf.h"
+
+static int
+vbdev_ocf_volume_open(ocf_volume_t volume, void *opts)
+{
+ struct vbdev_ocf_base **priv = ocf_volume_get_priv(volume);
+ struct vbdev_ocf_base *base;
+
+ if (opts) {
+ base = opts;
+ } else {
+ base = vbdev_ocf_get_base_by_name(ocf_volume_get_uuid(volume)->data);
+ if (base == NULL) {
+ return -ENODEV;
+ }
+ }
+
+ *priv = base;
+
+ return 0;
+}
+
+static void
+vbdev_ocf_volume_close(ocf_volume_t volume)
+{
+}
+
+static uint64_t
+vbdev_ocf_volume_get_length(ocf_volume_t volume)
+{
+ struct vbdev_ocf_base *base = *((struct vbdev_ocf_base **)ocf_volume_get_priv(volume));
+ uint64_t len;
+
+ len = base->bdev->blocklen * base->bdev->blockcnt;
+
+ return len;
+}
+
+static int
+vbdev_ocf_volume_io_set_data(struct ocf_io *io, ctx_data_t *data,
+ uint32_t offset)
+{
+ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io);
+
+ io_ctx->offset = offset;
+ io_ctx->data = data;
+
+ if (io_ctx->data && offset >= io_ctx->data->size) {
+ return -ENOBUFS;
+ }
+
+ return 0;
+}
+
+static ctx_data_t *
+vbdev_ocf_volume_io_get_data(struct ocf_io *io)
+{
+ return ocf_get_io_ctx(io)->data;
+}
+
+static void
+vbdev_ocf_volume_io_get(struct ocf_io *io)
+{
+ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io);
+
+ io_ctx->ref++;
+}
+
+static void
+vbdev_ocf_volume_io_put(struct ocf_io *io)
+{
+ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io);
+
+ if (--io_ctx->ref) {
+ return;
+ }
+}
+
+static int
+get_starting_vec(struct iovec *iovs, int iovcnt, int *offset)
+{
+ int i;
+ size_t off;
+
+ off = *offset;
+
+ for (i = 0; i < iovcnt; i++) {
+ if (off < iovs[i].iov_len) {
+ *offset = off;
+ return i;
+ }
+ off -= iovs[i].iov_len;
+ }
+
+ return -1;
+}
+
+static void
+initialize_cpy_vector(struct iovec *cpy_vec, int cpy_vec_len, struct iovec *orig_vec,
+ int orig_vec_len,
+ size_t offset, size_t bytes)
+{
+ void *curr_base;
+ int len, i;
+
+ i = 0;
+
+ while (bytes > 0) {
+ curr_base = orig_vec[i].iov_base + offset;
+ len = MIN(bytes, orig_vec[i].iov_len - offset);
+
+ cpy_vec[i].iov_base = curr_base;
+ cpy_vec[i].iov_len = len;
+
+ bytes -= len;
+ offset = 0;
+ i++;
+ }
+}
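+
+/* Worked example (illustrative): with orig_vec lengths {512, 1024, 2048},
+ * offset 700 and bytes 900, get_starting_vec() returns index 1 and adjusts
+ * the offset to 188; initialize_cpy_vector() then builds
+ * cpy_vec = {{iov1_base + 188, 836}, {iov2_base, 64}} */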
+
+static void
+vbdev_ocf_volume_submit_io_cb(struct spdk_bdev_io *bdev_io, bool success, void *opaque)
+{
+ struct ocf_io *io;
+ struct ocf_io_ctx *io_ctx;
+
+ assert(opaque);
+
+ io = opaque;
+ io_ctx = ocf_get_io_ctx(io);
+ assert(io_ctx != NULL);
+
+ if (!success) {
+ io_ctx->error |= 1;
+ }
+
+ if (io_ctx->iovs_allocated && bdev_io != NULL) {
+ env_free(bdev_io->u.bdev.iovs);
+ }
+
+ if (io_ctx->error) {
+ SPDK_DEBUGLOG(SPDK_TRACE_VBDEV_OCF_VOLUME,
+ "base returned error on io submission: %d\n", io_ctx->error);
+ }
+
+ if (io->io_queue == NULL && io_ctx->ch != NULL) {
+ spdk_put_io_channel(io_ctx->ch);
+ }
+
+ vbdev_ocf_volume_io_put(io);
+ if (bdev_io) {
+ spdk_bdev_free_io(bdev_io);
+ }
+
+ if (--io_ctx->rq_cnt == 0) {
+ io->end(io, io_ctx->error);
+ }
+}
+
+static int
+prepare_submit(struct ocf_io *io)
+{
+ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io);
+ struct vbdev_ocf_qctx *qctx;
+ struct vbdev_ocf_base *base;
+ ocf_queue_t q = io->io_queue;
+ ocf_cache_t cache;
+ struct vbdev_ocf_cache_ctx *cctx;
+ int rc = 0;
+
+ io_ctx->rq_cnt++;
+ if (io_ctx->rq_cnt != 1) {
+ return 0;
+ }
+
+ vbdev_ocf_volume_io_get(io);
+ base = *((struct vbdev_ocf_base **)ocf_volume_get_priv(ocf_io_get_volume(io)));
+
+ if (io->io_queue == NULL) {
+		/* If the IO was initiated internally by OCF, the queue is unknown,
+		 * so we have to get the io channel ourselves */
+ io_ctx->ch = spdk_bdev_get_io_channel(base->desc);
+ if (io_ctx->ch == NULL) {
+ return -EPERM;
+ }
+ return 0;
+ }
+
+ cache = ocf_queue_get_cache(q);
+ cctx = ocf_cache_get_priv(cache);
+
+ if (q == cctx->cleaner_queue || q == cctx->mngt_queue) {
+ io_ctx->ch = base->management_channel;
+ return 0;
+ }
+
+ qctx = ocf_queue_get_priv(q);
+ if (qctx == NULL) {
+ return -EFAULT;
+ }
+
+ if (base->is_cache) {
+ io_ctx->ch = qctx->cache_ch;
+ } else {
+ io_ctx->ch = qctx->core_ch;
+ }
+
+ return rc;
+}
+
+static void
+vbdev_ocf_volume_submit_flush(struct ocf_io *io)
+{
+ struct vbdev_ocf_base *base =
+ *((struct vbdev_ocf_base **)
+ ocf_volume_get_priv(ocf_io_get_volume(io)));
+ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io);
+ int status;
+
+ status = prepare_submit(io);
+ if (status) {
+ SPDK_ERRLOG("Preparing io failed with status=%d\n", status);
+ vbdev_ocf_volume_submit_io_cb(NULL, false, io);
+ return;
+ }
+
+ status = spdk_bdev_flush(
+ base->desc, io_ctx->ch,
+ io->addr, io->bytes,
+ vbdev_ocf_volume_submit_io_cb, io);
+ if (status) {
+ /* Since callback is not called, we need to do it manually to free io structures */
+ SPDK_ERRLOG("Submission failed with status=%d\n", status);
+ vbdev_ocf_volume_submit_io_cb(NULL, false, io);
+ }
+}
+
+static void
+vbdev_ocf_volume_submit_io(struct ocf_io *io)
+{
+ struct vbdev_ocf_base *base =
+ *((struct vbdev_ocf_base **)
+ ocf_volume_get_priv(ocf_io_get_volume(io)));
+ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io);
+ struct iovec *iovs;
+ int iovcnt, status = 0, i, offset;
+ uint64_t addr, len;
+
+ if (io->flags == OCF_WRITE_FLUSH) {
+ vbdev_ocf_volume_submit_flush(io);
+ return;
+ }
+
+ status = prepare_submit(io);
+ if (status) {
+ SPDK_ERRLOG("Preparing io failed with status=%d\n", status);
+ vbdev_ocf_volume_submit_io_cb(NULL, false, io);
+ return;
+ }
+
+ /* IO fields */
+ addr = io->addr;
+ len = io->bytes;
+ offset = io_ctx->offset;
+
+ if (len < io_ctx->data->size) {
+ if (io_ctx->data->iovcnt == 1) {
+ if (io->dir == OCF_READ) {
+ status = spdk_bdev_read(base->desc, io_ctx->ch,
+ io_ctx->data->iovs[0].iov_base + offset, addr, len,
+ vbdev_ocf_volume_submit_io_cb, io);
+ } else if (io->dir == OCF_WRITE) {
+ status = spdk_bdev_write(base->desc, io_ctx->ch,
+ io_ctx->data->iovs[0].iov_base + offset, addr, len,
+ vbdev_ocf_volume_submit_io_cb, io);
+ }
+ goto end;
+ } else {
+ i = get_starting_vec(io_ctx->data->iovs, io_ctx->data->iovcnt, &offset);
+
+ if (i < 0) {
+ SPDK_ERRLOG("offset bigger than data size\n");
+ vbdev_ocf_volume_submit_io_cb(NULL, false, io);
+ return;
+ }
+
+ iovcnt = io_ctx->data->iovcnt - i;
+
+ io_ctx->iovs_allocated = true;
+ iovs = env_malloc(sizeof(*iovs) * iovcnt, ENV_MEM_NOIO);
+
+ if (!iovs) {
+ SPDK_ERRLOG("allocation failed\n");
+ vbdev_ocf_volume_submit_io_cb(NULL, false, io);
+ return;
+ }
+
+ initialize_cpy_vector(iovs, io_ctx->data->iovcnt, &io_ctx->data->iovs[i],
+ iovcnt, offset, len);
+ }
+ } else {
+ iovs = io_ctx->data->iovs;
+ iovcnt = io_ctx->data->iovcnt;
+ }
+
+ if (io->dir == OCF_READ) {
+ status = spdk_bdev_readv(base->desc, io_ctx->ch,
+ iovs, iovcnt, addr, len, vbdev_ocf_volume_submit_io_cb, io);
+ } else if (io->dir == OCF_WRITE) {
+ status = spdk_bdev_writev(base->desc, io_ctx->ch,
+ iovs, iovcnt, addr, len, vbdev_ocf_volume_submit_io_cb, io);
+ }
+
+end:
+ if (status) {
+ /* TODO [ENOMEM]: implement ENOMEM handling when submitting IO to base device */
+
+ /* Since callback is not called, we need to do it manually to free io structures */
+ SPDK_ERRLOG("submission failed with status=%d\n", status);
+ vbdev_ocf_volume_submit_io_cb(NULL, false, io);
+ }
+}
+
+static void
+vbdev_ocf_volume_submit_discard(struct ocf_io *io)
+{
+ struct vbdev_ocf_base *base =
+ *((struct vbdev_ocf_base **)
+ ocf_volume_get_priv(ocf_io_get_volume(io)));
+ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io);
+ int status = 0;
+
+ status = prepare_submit(io);
+ if (status) {
+ SPDK_ERRLOG("Preparing io failed with status=%d\n", status);
+ vbdev_ocf_volume_submit_io_cb(NULL, false, io);
+ return;
+ }
+
+ status = spdk_bdev_unmap(
+ base->desc, io_ctx->ch,
+ io->addr, io->bytes,
+ vbdev_ocf_volume_submit_io_cb, io);
+ if (status) {
+ /* Since callback is not called, we need to do it manually to free io structures */
+ SPDK_ERRLOG("Submission failed with status=%d\n", status);
+ vbdev_ocf_volume_submit_io_cb(NULL, false, io);
+ }
+}
+
+static void
+vbdev_ocf_volume_submit_metadata(struct ocf_io *io)
+{
+ /* Implement with persistent metadata support */
+}
+
+static unsigned int
+vbdev_ocf_volume_get_max_io_size(ocf_volume_t volume)
+{
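+	/* 131072 bytes = 128 KiB */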
+ return 131072;
+}
+
+static struct ocf_volume_properties vbdev_volume_props = {
+ .name = "SPDK block device",
+ .io_priv_size = sizeof(struct ocf_io_ctx),
+ .volume_priv_size = sizeof(struct vbdev_ocf_base *),
+ .caps = {
+ .atomic_writes = 0 /* to enable need to have ops->submit_metadata */
+ },
+ .ops = {
+ .open = vbdev_ocf_volume_open,
+ .close = vbdev_ocf_volume_close,
+ .get_length = vbdev_ocf_volume_get_length,
+ .submit_io = vbdev_ocf_volume_submit_io,
+ .submit_discard = vbdev_ocf_volume_submit_discard,
+ .submit_flush = vbdev_ocf_volume_submit_flush,
+ .get_max_io_size = vbdev_ocf_volume_get_max_io_size,
+ .submit_metadata = vbdev_ocf_volume_submit_metadata,
+ },
+ .io_ops = {
+ .set_data = vbdev_ocf_volume_io_set_data,
+ .get_data = vbdev_ocf_volume_io_get_data,
+ },
+};
+
+int
+vbdev_ocf_volume_init(void)
+{
+ return ocf_ctx_register_volume_type(vbdev_ocf_ctx, SPDK_OBJECT, &vbdev_volume_props);
+}
+
+void
+vbdev_ocf_volume_cleanup(void)
+{
+ ocf_ctx_unregister_volume_type(vbdev_ocf_ctx, SPDK_OBJECT);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_ocf_volume", SPDK_TRACE_VBDEV_OCF_VOLUME)
diff --git a/src/spdk/module/bdev/ocf/volume.h b/src/spdk/module/bdev/ocf/volume.h
new file mode 100644
index 000000000..6ae7488b5
--- /dev/null
+++ b/src/spdk/module/bdev/ocf/volume.h
@@ -0,0 +1,63 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VBDEV_OCF_DOBJ_H
+#define VBDEV_OCF_DOBJ_H
+
+#include <ocf/ocf.h>
+
+#include "ocf_env.h"
+#include "ctx.h"
+#include "data.h"
+
+/* ocf_io context
+ * It is initialized from the io size and offset */
+struct ocf_io_ctx {
+ struct bdev_ocf_data *data;
+ struct spdk_io_channel *ch;
+ uint32_t offset;
+ int ref;
+ int rq_cnt;
+ int error;
+ bool iovs_allocated;
+};
+
+int vbdev_ocf_volume_init(void);
+void vbdev_ocf_volume_cleanup(void);
+
+static inline struct ocf_io_ctx *ocf_get_io_ctx(struct ocf_io *io)
+{
+ return ocf_io_get_priv(io);
+}
+
+#endif
diff --git a/src/spdk/module/bdev/passthru/Makefile b/src/spdk/module/bdev/passthru/Makefile
new file mode 100644
index 000000000..c12b97691
--- /dev/null
+++ b/src/spdk/module/bdev/passthru/Makefile
@@ -0,0 +1,47 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/
+
+C_SRCS = vbdev_passthru.c vbdev_passthru_rpc.c
+LIBNAME = bdev_passthru
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/passthru/vbdev_passthru.c b/src/spdk/module/bdev/passthru/vbdev_passthru.c
new file mode 100644
index 000000000..f166f3e34
--- /dev/null
+++ b/src/spdk/module/bdev/passthru/vbdev_passthru.c
@@ -0,0 +1,809 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is a simple example of a virtual block device module that passes IO
+ * down to a bdev (or bdevs) that it's configured to attach to.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "vbdev_passthru.h"
+#include "spdk/rpc.h"
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/util.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+
+static int vbdev_passthru_init(void);
+static void vbdev_passthru_get_spdk_running_config(FILE *fp);
+static int vbdev_passthru_get_ctx_size(void);
+static void vbdev_passthru_examine(struct spdk_bdev *bdev);
+static void vbdev_passthru_finish(void);
+static int vbdev_passthru_config_json(struct spdk_json_write_ctx *w);
+
+static struct spdk_bdev_module passthru_if = {
+ .name = "passthru",
+ .module_init = vbdev_passthru_init,
+ .config_text = vbdev_passthru_get_spdk_running_config,
+ .get_ctx_size = vbdev_passthru_get_ctx_size,
+ .examine_config = vbdev_passthru_examine,
+ .module_fini = vbdev_passthru_finish,
+ .config_json = vbdev_passthru_config_json
+};
+
+SPDK_BDEV_MODULE_REGISTER(passthru, &passthru_if)
+
+/* List of pt_bdev names and their base bdevs via configuration file.
+ * Used so we can parse the conf once at init and use this list in examine().
+ */
+struct bdev_names {
+ char *vbdev_name;
+ char *bdev_name;
+ TAILQ_ENTRY(bdev_names) link;
+};
+static TAILQ_HEAD(, bdev_names) g_bdev_names = TAILQ_HEAD_INITIALIZER(g_bdev_names);
+
+/* List of virtual bdevs and associated info for each. */
+struct vbdev_passthru {
+ struct spdk_bdev *base_bdev; /* the thing we're attaching to */
+ struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
+ struct spdk_bdev pt_bdev; /* the PT virtual bdev */
+ TAILQ_ENTRY(vbdev_passthru) link;
+ struct spdk_thread *thread; /* thread where base device is opened */
+};
+static TAILQ_HEAD(, vbdev_passthru) g_pt_nodes = TAILQ_HEAD_INITIALIZER(g_pt_nodes);
+
+/* The pt vbdev channel struct. It is allocated and freed on our behalf by the io channel code.
+ * If this vbdev needed to implement a poller or a queue for IO, this is where those things
+ * would be defined. This passthru bdev doesn't actually need to allocate a channel; it could
+ * simply pass back the channel of the bdev underneath it, but for example purposes we
+ * present our own to the upper layers.
+ */
+struct pt_io_channel {
+ struct spdk_io_channel *base_ch; /* IO channel of base device */
+};
+
+/* Just for fun, this pt_bdev module doesn't need it but this is essentially a per IO
+ * context that we get handed by the bdev layer.
+ */
+struct passthru_bdev_io {
+ uint8_t test;
+
+ /* bdev related */
+ struct spdk_io_channel *ch;
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+};
+
+static void
+vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
+
+
+/* Callback for unregistering the IO device. */
+static void
+_device_unregister_cb(void *io_device)
+{
+ struct vbdev_passthru *pt_node = io_device;
+
+ /* Done with this pt_node. */
+ free(pt_node->pt_bdev.name);
+ free(pt_node);
+}
+
+/* Wrapper for the bdev close operation. */
+static void
+_vbdev_passthru_destruct(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+/* Called after we've unregistered following a hot remove callback.
+ * Our finish entry point will be called next.
+ */
+static int
+vbdev_passthru_destruct(void *ctx)
+{
+ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx;
+
+ /* It is important to follow this exact sequence of steps for destroying
+ * a vbdev...
+ */
+
+ TAILQ_REMOVE(&g_pt_nodes, pt_node, link);
+
+ /* Unclaim the underlying bdev. */
+ spdk_bdev_module_release_bdev(pt_node->base_bdev);
+
+ /* Close the underlying bdev on its same opened thread. */
+ if (pt_node->thread && pt_node->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(pt_node->thread, _vbdev_passthru_destruct, pt_node->base_desc);
+ } else {
+ spdk_bdev_close(pt_node->base_desc);
+ }
+
+ /* Unregister the io_device. */
+ spdk_io_device_unregister(pt_node, _device_unregister_cb);
+
+ return 0;
+}
+
+/* Completion callback for IO that were issued from this bdev. The original bdev_io
+ * is passed in as an arg so we'll complete that one with the appropriate status
+ * and then free the one that this module issued.
+ */
+static void
+_pt_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx;
+
+	/* We set up this value in the submission routine, just showing here that it is
+ * passed back to us.
+ */
+ if (io_ctx->test != 0x5a) {
+ SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n",
+ io_ctx->test);
+ }
+
+ /* Complete the original IO and then free the one that we created here
+ * as a result of issuing an IO via submit_request.
+ */
+ spdk_bdev_io_complete(orig_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+_pt_complete_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx;
+
+	/* We set up this value in the submission routine, just showing here that it is
+ * passed back to us.
+ */
+ if (io_ctx->test != 0x5a) {
+ SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n",
+ io_ctx->test);
+ }
+
+ /* Complete the original IO and then free the one that we created here
+ * as a result of issuing an IO via submit_request.
+ */
+ spdk_bdev_io_set_buf(orig_io, bdev_io->u.bdev.iovs[0].iov_base, bdev_io->u.bdev.iovs[0].iov_len);
+ spdk_bdev_io_complete(orig_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+vbdev_passthru_resubmit_io(void *arg)
+{
+ struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
+ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx;
+
+ vbdev_passthru_submit_request(io_ctx->ch, bdev_io);
+}
+
+static void
+vbdev_passthru_queue_io(struct spdk_bdev_io *bdev_io)
+{
+ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx;
+ struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(io_ctx->ch);
+ int rc;
+
+ io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
+ io_ctx->bdev_io_wait.cb_fn = vbdev_passthru_resubmit_io;
+ io_ctx->bdev_io_wait.cb_arg = bdev_io;
+
+ /* Queue the IO using the channel of the base device. */
+ rc = spdk_bdev_queue_io_wait(bdev_io->bdev, pt_ch->base_ch, &io_ctx->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vbdev_passthru_queue_io, rc=%d.\n", rc);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
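+
+/* For orientation (annotation, not in the upstream file): the -ENOMEM retry
+ * path wired up above runs as
+ *
+ *   submit -> -ENOMEM -> vbdev_passthru_queue_io() -> spdk_bdev_queue_io_wait()
+ *     -> base bdev completes an IO -> vbdev_passthru_resubmit_io() -> submit again
+ */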
+
+/* Callback for getting a buf from the bdev pool in the event that the caller passed
+ * in NULL. We need to own the buffer so it doesn't get freed by another vbdev module
+ * beneath us before we're done with it. That won't happen in this example but it could
+ * if this example were used as a template for something more complex.
+ */
+static void
+pt_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru,
+ pt_bdev);
+ struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch);
+ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx;
+ int rc;
+
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ if (bdev_io->u.bdev.md_buf == NULL) {
+ rc = spdk_bdev_readv_blocks(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks, _pt_complete_io,
+ bdev_io);
+ } else {
+ rc = spdk_bdev_readv_blocks_with_md(pt_node->base_desc, pt_ch->base_ch,
+ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _pt_complete_io, bdev_io);
+ }
+
+ if (rc != 0) {
+ if (rc == -ENOMEM) {
+			SPDK_ERRLOG("No memory; queueing IO for passthru.\n");
+ io_ctx->ch = ch;
+ vbdev_passthru_queue_io(bdev_io);
+ } else {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+/* Called when someone above submits IO to this pt vbdev. We simply pass it on
+ * here via SPDK IO calls, which in turn allocate another bdev IO and invoke our
+ * completion callback (provided below) along with the original bdev_io, so that
+ * we can complete it once this IO completes.
+ */
+static void
+vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru, pt_bdev);
+ struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch);
+ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx;
+ int rc = 0;
+
+	/* Set up a per-IO context value; we don't do anything with it in the vbdev other
+ * than confirm we get the same thing back in the completion callback just to
+ * demonstrate.
+ */
+ io_ctx->test = 0x5a;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, pt_read_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ if (bdev_io->u.bdev.md_buf == NULL) {
+ rc = spdk_bdev_writev_blocks(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks, _pt_complete_io,
+ bdev_io);
+ } else {
+ rc = spdk_bdev_writev_blocks_with_md(pt_node->base_desc, pt_ch->base_ch,
+ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _pt_complete_io, bdev_io);
+ }
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ rc = spdk_bdev_write_zeroes_blocks(pt_node->base_desc, pt_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _pt_complete_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ rc = spdk_bdev_unmap_blocks(pt_node->base_desc, pt_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _pt_complete_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ rc = spdk_bdev_flush_blocks(pt_node->base_desc, pt_ch->base_ch,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ _pt_complete_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ rc = spdk_bdev_reset(pt_node->base_desc, pt_ch->base_ch,
+ _pt_complete_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ rc = spdk_bdev_zcopy_start(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.zcopy.populate,
+ _pt_complete_zcopy_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_ABORT:
+ rc = spdk_bdev_abort(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.abort.bio_to_abort,
+ _pt_complete_io, bdev_io);
+ break;
+ default:
+ SPDK_ERRLOG("passthru: unknown I/O type %d\n", bdev_io->type);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+ if (rc != 0) {
+ if (rc == -ENOMEM) {
+			SPDK_ERRLOG("No memory; queueing IO for passthru.\n");
+ io_ctx->ch = ch;
+ vbdev_passthru_queue_io(bdev_io);
+ } else {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+/* We'll just call the base bdev and let it answer. However, if we wanted to be
+ * more (or less) restrictive for some reason, we could take the response and
+ * modify it for our purposes.
+ */
+static bool
+vbdev_passthru_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx;
+
+ return spdk_bdev_io_type_supported(pt_node->base_bdev, io_type);
+}
+
+/* We supplied this as an entry point for upper layers that want to communicate with this
+ * bdev. This is how they get a channel. We are passed the same context we provided when
+ * we created our PT vbdev in examine() which, for this bdev, is the address of one of
+ * our context nodes. From here we'll ask the SPDK channel code to fill out our channel
+ * struct and we'll keep it in our PT node.
+ */
+static struct spdk_io_channel *
+vbdev_passthru_get_io_channel(void *ctx)
+{
+ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx;
+ struct spdk_io_channel *pt_ch = NULL;
+
+ /* The IO channel code will allocate a channel for us which consists of
+ * the SPDK channel structure plus the size of our pt_io_channel struct
+ * that we passed in when we registered our IO device. It will then call
+ * our channel create callback to populate any elements that we need to
+ * update.
+ */
+ pt_ch = spdk_get_io_channel(pt_node);
+
+ return pt_ch;
+}
+
+/* This is the output for bdev_get_bdevs() for this vbdev */
+static int
+vbdev_passthru_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx;
+
+ spdk_json_write_name(w, "passthru");
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&pt_node->pt_bdev));
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev));
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+/* This is used to generate JSON that can configure this module to its current state. */
+static int
+vbdev_passthru_config_json(struct spdk_json_write_ctx *w)
+{
+ struct vbdev_passthru *pt_node;
+
+ TAILQ_FOREACH(pt_node, &g_pt_nodes, link) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_passthru_create");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev));
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&pt_node->pt_bdev));
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ }
+ return 0;
+}
+
+/* We provide this callback for the SPDK channel code to create a channel using
+ * the channel struct we provided in our module get_io_channel() entry point. Here
+ * we get and save off an underlying base channel of the device below us so that
+ * we can communicate with the base bdev on a per channel basis. If we needed
+ * our own poller for this vbdev, we'd register it here.
+ */
+static int
+pt_bdev_ch_create_cb(void *io_device, void *ctx_buf)
+{
+ struct pt_io_channel *pt_ch = ctx_buf;
+ struct vbdev_passthru *pt_node = io_device;
+
+ pt_ch->base_ch = spdk_bdev_get_io_channel(pt_node->base_desc);
+
+ return 0;
+}
+
+/* We provide this callback for the SPDK channel code to destroy a channel
+ * created with our create callback. We just need to undo anything we did
+ * when we created. If this bdev used its own poller, we'd unregister it here.
+ */
+static void
+pt_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct pt_io_channel *pt_ch = ctx_buf;
+
+ spdk_put_io_channel(pt_ch->base_ch);
+}
+
+/* Create the passthru association from the bdev and vbdev name and insert
+ * it into the global list. */
+static int
+vbdev_passthru_insert_name(const char *bdev_name, const char *vbdev_name)
+{
+ struct bdev_names *name;
+
+ TAILQ_FOREACH(name, &g_bdev_names, link) {
+ if (strcmp(vbdev_name, name->vbdev_name) == 0) {
+ SPDK_ERRLOG("passthru bdev %s already exists\n", vbdev_name);
+ return -EEXIST;
+ }
+ }
+
+ name = calloc(1, sizeof(struct bdev_names));
+ if (!name) {
+ SPDK_ERRLOG("could not allocate bdev_names\n");
+ return -ENOMEM;
+ }
+
+ name->bdev_name = strdup(bdev_name);
+ if (!name->bdev_name) {
+ SPDK_ERRLOG("could not allocate name->bdev_name\n");
+ free(name);
+ return -ENOMEM;
+ }
+
+ name->vbdev_name = strdup(vbdev_name);
+ if (!name->vbdev_name) {
+ SPDK_ERRLOG("could not allocate name->vbdev_name\n");
+ free(name->bdev_name);
+ free(name);
+ return -ENOMEM;
+ }
+
+ TAILQ_INSERT_TAIL(&g_bdev_names, name, link);
+
+ return 0;
+}
+
+/* On init, just parse config file and build list of pt vbdevs and bdev name pairs. */
+static int
+vbdev_passthru_init(void)
+{
+ struct spdk_conf_section *sp = NULL;
+ const char *conf_bdev_name = NULL;
+ const char *conf_vbdev_name = NULL;
+ struct bdev_names *name;
+ int i, rc;
+
+ sp = spdk_conf_find_section(NULL, "Passthru");
+ if (sp == NULL) {
+ return 0;
+ }
+
+ for (i = 0; ; i++) {
+ if (!spdk_conf_section_get_nval(sp, "PT", i)) {
+ break;
+ }
+
+ conf_bdev_name = spdk_conf_section_get_nmval(sp, "PT", i, 0);
+ if (!conf_bdev_name) {
+ SPDK_ERRLOG("Passthru configuration missing bdev name\n");
+ break;
+ }
+
+ conf_vbdev_name = spdk_conf_section_get_nmval(sp, "PT", i, 1);
+ if (!conf_vbdev_name) {
+ SPDK_ERRLOG("Passthru configuration missing pt_bdev name\n");
+ break;
+ }
+
+ rc = vbdev_passthru_insert_name(conf_bdev_name, conf_vbdev_name);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+ TAILQ_FOREACH(name, &g_bdev_names, link) {
+ SPDK_NOTICELOG("conf parse matched: %s\n", name->bdev_name);
+ }
+ return 0;
+}
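+
+/* Illustrative only (annotation, not in the upstream file): a legacy INI-style
+ * section that the parsing loop above would match. The bdev names are
+ * hypothetical; each "PT" line pairs a base bdev with the passthru vbdev name,
+ * mirroring what vbdev_passthru_get_spdk_running_config() writes back out:
+ *
+ *   [Passthru]
+ *     PT Malloc0 PT0
+ */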
+
+/* Called when the entire module is being torn down. */
+static void
+vbdev_passthru_finish(void)
+{
+ struct bdev_names *name;
+
+ while ((name = TAILQ_FIRST(&g_bdev_names))) {
+ TAILQ_REMOVE(&g_bdev_names, name, link);
+ free(name->bdev_name);
+ free(name->vbdev_name);
+ free(name);
+ }
+}
+
+/* During init we'll be asked how much memory we'd like passed to us
+ * in bdev_io structures as context. Here's where we specify how
+ * much context we want per IO.
+ */
+static int
+vbdev_passthru_get_ctx_size(void)
+{
+ return sizeof(struct passthru_bdev_io);
+}
+
+/* Called when SPDK wants to save the current config of this vbdev module to
+ * a file.
+ */
+static void
+vbdev_passthru_get_spdk_running_config(FILE *fp)
+{
+ struct bdev_names *names = NULL;
+
+ fprintf(fp, "\n[Passthru]\n");
+ TAILQ_FOREACH(names, &g_bdev_names, link) {
+ fprintf(fp, " PT %s %s\n", names->bdev_name, names->vbdev_name);
+ }
+ fprintf(fp, "\n");
+}
+
+/* Whereas vbdev_passthru_config_json() generates per-module JSON config data, this
+ * function is called to output any per-bdev-specific methods. For the PT module, there are
+ * none.
+ */
+static void
+vbdev_passthru_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ /* No config per bdev needed */
+}
+
+/* When we register our bdev this is how we specify our entry points. */
+static const struct spdk_bdev_fn_table vbdev_passthru_fn_table = {
+ .destruct = vbdev_passthru_destruct,
+ .submit_request = vbdev_passthru_submit_request,
+ .io_type_supported = vbdev_passthru_io_type_supported,
+ .get_io_channel = vbdev_passthru_get_io_channel,
+ .dump_info_json = vbdev_passthru_dump_info_json,
+ .write_config_json = vbdev_passthru_write_config_json,
+};
+
+/* Called when the underlying base bdev goes away. */
+static void
+vbdev_passthru_base_bdev_hotremove_cb(void *ctx)
+{
+ struct vbdev_passthru *pt_node, *tmp;
+ struct spdk_bdev *bdev_find = ctx;
+
+ TAILQ_FOREACH_SAFE(pt_node, &g_pt_nodes, link, tmp) {
+ if (bdev_find == pt_node->base_bdev) {
+ spdk_bdev_unregister(&pt_node->pt_bdev, NULL, NULL);
+ }
+ }
+}
+
+/* Create and register the passthru vbdev if we find it in our list of bdev names.
+ * This can be called either from the examine path or from the RPC method.
+ */
+static int
+vbdev_passthru_register(struct spdk_bdev *bdev)
+{
+ struct bdev_names *name;
+ struct vbdev_passthru *pt_node;
+ int rc = 0;
+
+ /* Check our list of names from config versus this bdev and if
+ * there's a match, create the pt_node & bdev accordingly.
+ */
+ TAILQ_FOREACH(name, &g_bdev_names, link) {
+ if (strcmp(name->bdev_name, bdev->name) != 0) {
+ continue;
+ }
+
+ SPDK_NOTICELOG("Match on %s\n", bdev->name);
+ pt_node = calloc(1, sizeof(struct vbdev_passthru));
+ if (!pt_node) {
+ rc = -ENOMEM;
+ SPDK_ERRLOG("could not allocate pt_node\n");
+ break;
+ }
+
+ /* The base bdev that we're attaching to. */
+ pt_node->base_bdev = bdev;
+ pt_node->pt_bdev.name = strdup(name->vbdev_name);
+ if (!pt_node->pt_bdev.name) {
+ rc = -ENOMEM;
+ SPDK_ERRLOG("could not allocate pt_bdev name\n");
+ free(pt_node);
+ break;
+ }
+ pt_node->pt_bdev.product_name = "passthru";
+
+ /* Copy some properties from the underlying base bdev. */
+ pt_node->pt_bdev.write_cache = bdev->write_cache;
+ pt_node->pt_bdev.required_alignment = bdev->required_alignment;
+ pt_node->pt_bdev.optimal_io_boundary = bdev->optimal_io_boundary;
+ pt_node->pt_bdev.blocklen = bdev->blocklen;
+ pt_node->pt_bdev.blockcnt = bdev->blockcnt;
+
+ pt_node->pt_bdev.md_interleave = bdev->md_interleave;
+ pt_node->pt_bdev.md_len = bdev->md_len;
+ pt_node->pt_bdev.dif_type = bdev->dif_type;
+ pt_node->pt_bdev.dif_is_head_of_md = bdev->dif_is_head_of_md;
+ pt_node->pt_bdev.dif_check_flags = bdev->dif_check_flags;
+
+ /* This is the context that is passed to us when the bdev
+ * layer calls in so we'll save our pt_bdev node here.
+ */
+ pt_node->pt_bdev.ctxt = pt_node;
+ pt_node->pt_bdev.fn_table = &vbdev_passthru_fn_table;
+ pt_node->pt_bdev.module = &passthru_if;
+ TAILQ_INSERT_TAIL(&g_pt_nodes, pt_node, link);
+
+ spdk_io_device_register(pt_node, pt_bdev_ch_create_cb, pt_bdev_ch_destroy_cb,
+ sizeof(struct pt_io_channel),
+ name->vbdev_name);
+		SPDK_NOTICELOG("io_device created at: %p\n", pt_node);
+
+ rc = spdk_bdev_open(bdev, true, vbdev_passthru_base_bdev_hotremove_cb,
+ bdev, &pt_node->base_desc);
+ if (rc) {
+ SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev));
+ TAILQ_REMOVE(&g_pt_nodes, pt_node, link);
+ spdk_io_device_unregister(pt_node, NULL);
+ free(pt_node->pt_bdev.name);
+ free(pt_node);
+ break;
+ }
+ SPDK_NOTICELOG("bdev opened\n");
+
+ /* Save the thread where the base device is opened */
+ pt_node->thread = spdk_get_thread();
+
+ rc = spdk_bdev_module_claim_bdev(bdev, pt_node->base_desc, pt_node->pt_bdev.module);
+ if (rc) {
+ SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(bdev));
+ spdk_bdev_close(pt_node->base_desc);
+ TAILQ_REMOVE(&g_pt_nodes, pt_node, link);
+ spdk_io_device_unregister(pt_node, NULL);
+ free(pt_node->pt_bdev.name);
+ free(pt_node);
+ break;
+ }
+ SPDK_NOTICELOG("bdev claimed\n");
+
+ rc = spdk_bdev_register(&pt_node->pt_bdev);
+ if (rc) {
+ SPDK_ERRLOG("could not register pt_bdev\n");
+ spdk_bdev_module_release_bdev(&pt_node->pt_bdev);
+ spdk_bdev_close(pt_node->base_desc);
+ TAILQ_REMOVE(&g_pt_nodes, pt_node, link);
+ spdk_io_device_unregister(pt_node, NULL);
+ free(pt_node->pt_bdev.name);
+ free(pt_node);
+ break;
+ }
+ SPDK_NOTICELOG("pt_bdev registered\n");
+ SPDK_NOTICELOG("created pt_bdev for: %s\n", name->vbdev_name);
+ }
+
+ return rc;
+}
+
+/* Create the passthru disk from the given bdev and vbdev name. */
+int
+bdev_passthru_create_disk(const char *bdev_name, const char *vbdev_name)
+{
+ struct spdk_bdev *bdev = NULL;
+ int rc = 0;
+
+	/* Insert the bdev into our global name list even if it doesn't exist yet;
+ * it may show up soon...
+ */
+ rc = vbdev_passthru_insert_name(bdev_name, vbdev_name);
+ if (rc) {
+ return rc;
+ }
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (!bdev) {
+		/* This is not an error; we tracked the name above and it may
+		 * still show up later.
+ */
+ SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n");
+ return 0;
+ }
+
+ return vbdev_passthru_register(bdev);
+}
+
+void
+bdev_passthru_delete_disk(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
+{
+ struct bdev_names *name;
+
+ if (!bdev || bdev->module != &passthru_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ /* Remove the association (vbdev, bdev) from g_bdev_names. This is required so that the
+ * vbdev does not get re-created if the same bdev is constructed at some other time,
+ * unless the underlying bdev was hot-removed.
+ */
+ TAILQ_FOREACH(name, &g_bdev_names, link) {
+ if (strcmp(name->vbdev_name, bdev->name) == 0) {
+ TAILQ_REMOVE(&g_bdev_names, name, link);
+ free(name->bdev_name);
+ free(name->vbdev_name);
+ free(name);
+ break;
+ }
+ }
+
+ /* Additional cleanup happens in the destruct callback. */
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
+/* Because we registered this function as our module's examine callback, we'll
+ * get this call anytime a new bdev shows up. Here we need to decide if we care
+ * about it and, if so, what to do. We parsed the config file at init, so we
+ * check the new bdev against the list we built up at that time; if the user
+ * configured us to attach to this bdev, here's where we do it.
+ */
+static void
+vbdev_passthru_examine(struct spdk_bdev *bdev)
+{
+ vbdev_passthru_register(bdev);
+
+ spdk_bdev_module_examine_done(&passthru_if);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_passthru", SPDK_LOG_VBDEV_PASSTHRU)
diff --git a/src/spdk/module/bdev/passthru/vbdev_passthru.h b/src/spdk/module/bdev/passthru/vbdev_passthru.h
new file mode 100644
index 000000000..716e187c1
--- /dev/null
+++ b/src/spdk/module/bdev/passthru/vbdev_passthru.h
@@ -0,0 +1,61 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_PASSTHRU_H
+#define SPDK_VBDEV_PASSTHRU_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+
+/**
+ * Create new pass through bdev.
+ *
+ * \param bdev_name Bdev on which pass through vbdev will be created.
+ * \param vbdev_name Name of the pass through bdev.
+ * \return 0 on success, other on failure.
+ */
+int bdev_passthru_create_disk(const char *bdev_name, const char *vbdev_name);
+
+/**
+ * Delete passthru bdev.
+ *
+ * \param bdev Pointer to pass through bdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void bdev_passthru_delete_disk(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn,
+ void *cb_arg);
+
+#endif /* SPDK_VBDEV_PASSTHRU_H */
diff --git a/src/spdk/module/bdev/passthru/vbdev_passthru_rpc.c b/src/spdk/module/bdev/passthru/vbdev_passthru_rpc.c
new file mode 100644
index 000000000..ae4014294
--- /dev/null
+++ b/src/spdk/module/bdev/passthru/vbdev_passthru_rpc.c
@@ -0,0 +1,148 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vbdev_passthru.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+/* Structure to hold the parameters for this RPC method. */
+struct rpc_bdev_passthru_create {
+ char *base_bdev_name;
+ char *name;
+};
+
+/* Free the allocated memory resources after RPC handling. */
+static void
+free_rpc_bdev_passthru_create(struct rpc_bdev_passthru_create *r)
+{
+ free(r->base_bdev_name);
+ free(r->name);
+}
+
+/* Structure to decode the input parameters for this RPC method. */
+static const struct spdk_json_object_decoder rpc_bdev_passthru_create_decoders[] = {
+ {"base_bdev_name", offsetof(struct rpc_bdev_passthru_create, base_bdev_name), spdk_json_decode_string},
+ {"name", offsetof(struct rpc_bdev_passthru_create, name), spdk_json_decode_string},
+};
+
+/* Decode the parameters for this RPC method and properly construct the passthru
+ * device. On failure, an error status is returned.
+ */
+static void
+rpc_bdev_passthru_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_passthru_create req = {NULL};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_passthru_create_decoders,
+ SPDK_COUNTOF(rpc_bdev_passthru_create_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_PASSTHRU, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = bdev_passthru_create_disk(req.base_bdev_name, req.name);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, req.name);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_passthru_create(&req);
+}
+SPDK_RPC_REGISTER("bdev_passthru_create", rpc_bdev_passthru_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_passthru_create, construct_passthru_bdev)
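+
+/* Illustrative only (annotation, not in the upstream file): a JSON-RPC request
+ * that would exercise this method; both bdev names are hypothetical examples.
+ * On success, the result is the new vbdev's name:
+ *
+ *   { "jsonrpc": "2.0", "id": 1, "method": "bdev_passthru_create",
+ *     "params": { "base_bdev_name": "Malloc0", "name": "PT0" } }
+ */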
+
+struct rpc_bdev_passthru_delete {
+ char *name;
+};
+
+static void
+free_rpc_bdev_passthru_delete(struct rpc_bdev_passthru_delete *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_passthru_delete_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_passthru_delete, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_passthru_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_passthru_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_passthru_delete req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_bdev_passthru_delete_decoders,
+ SPDK_COUNTOF(rpc_bdev_passthru_delete_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ bdev_passthru_delete_disk(bdev, rpc_bdev_passthru_delete_cb, request);
+
+cleanup:
+ free_rpc_bdev_passthru_delete(&req);
+}
+SPDK_RPC_REGISTER("bdev_passthru_delete", rpc_bdev_passthru_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_passthru_delete, delete_passthru_bdev)
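+
+/* Illustrative only (annotation, not in the upstream file): the matching delete
+ * request; the name is a hypothetical example. The boolean result reflects
+ * bdeverrno == 0 from the unregister callback above:
+ *
+ *   { "jsonrpc": "2.0", "id": 2, "method": "bdev_passthru_delete",
+ *     "params": { "name": "PT0" } }
+ */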
diff --git a/src/spdk/module/bdev/pmem/Makefile b/src/spdk/module/bdev/pmem/Makefile
new file mode 100644
index 000000000..3a918be78
--- /dev/null
+++ b/src/spdk/module/bdev/pmem/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev_pmem.c bdev_pmem_rpc.c
+LIBNAME = bdev_pmem
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/pmem/bdev_pmem.c b/src/spdk/module/bdev/pmem/bdev_pmem.c
new file mode 100644
index 000000000..79ffb960a
--- /dev/null
+++ b/src/spdk/module/bdev/pmem/bdev_pmem.c
@@ -0,0 +1,473 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/conf.h"
+#include "spdk/string.h"
+#include "spdk/likely.h"
+#include "spdk/util.h"
+#include "spdk/rpc.h"
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+#include "spdk/config.h"
+
+#include "bdev_pmem.h"
+#include "libpmemblk.h"
+
+struct pmem_disk {
+ struct spdk_bdev disk;
+ PMEMblkpool *pool;
+ char pmem_file[NAME_MAX];
+ TAILQ_ENTRY(pmem_disk) tailq;
+};
+
+static TAILQ_HEAD(, pmem_disk) g_pmem_disks = TAILQ_HEAD_INITIALIZER(g_pmem_disks);
+
+static int bdev_pmem_initialize(void);
+static void bdev_pmem_finish(void);
+
+static struct spdk_bdev_module pmem_if = {
+ .name = "pmem",
+ .module_init = bdev_pmem_initialize,
+ .module_fini = bdev_pmem_finish,
+ .async_fini = true,
+};
+
+SPDK_BDEV_MODULE_REGISTER(pmem, &pmem_if)
+
+typedef int(*spdk_bdev_pmem_io_request)(PMEMblkpool *pbp, void *buf, long long blockno);
+
+static int
+_bdev_pmem_submit_io_read(PMEMblkpool *pbp, void *buf, long long blockno)
+{
+ return pmemblk_read(pbp, buf, blockno);
+}
+
+static int
+_bdev_pmem_submit_io_write(PMEMblkpool *pbp, void *buf, long long blockno)
+{
+ return pmemblk_write(pbp, buf, blockno);
+}
+
+static int
+bdev_pmem_destruct(void *ctx)
+{
+ struct pmem_disk *pdisk = ctx;
+
+ TAILQ_REMOVE(&g_pmem_disks, pdisk, tailq);
+ free(pdisk->disk.name);
+ pmemblk_close(pdisk->pool);
+ free(pdisk);
+
+ return 0;
+}
+
+static int
+bdev_pmem_check_iov_len(struct iovec *iovs, int iovcnt, size_t num_blocks, uint32_t block_size)
+{
+ size_t nbytes = num_blocks * block_size;
+ int i;
+
+ for (i = 0; i < iovcnt; i++) {
+ if (spdk_unlikely(iovs[i].iov_base == NULL && iovs[i].iov_len != 0)) {
+ return -1;
+ }
+
+ if (nbytes <= iovs[i].iov_len) {
+ return 0;
+ }
+
+ if (spdk_unlikely(iovs[i].iov_len % block_size != 0)) {
+ return -1;
+ }
+
+ nbytes -= iovs[i].iov_len;
+ }
+
+ return -1;
+}
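+
+/* Worked example (annotation, not in the upstream file): for num_blocks = 4 and
+ * block_size = 512, nbytes starts at 2048; iov_len sequences {1024, 1024} and
+ * {512, 1536} pass the check above, while {1000, 1048} fails because a non-final
+ * element is not a multiple of the block size.
+ */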
+
+static void
+bdev_pmem_submit_io(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk,
+ struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt,
+ uint64_t offset_blocks, size_t num_blocks, uint32_t block_size,
+ spdk_bdev_pmem_io_request fn)
+{
+ int rc;
+ size_t nbytes, offset, len;
+ enum spdk_bdev_io_status status;
+
+ rc = bdev_pmem_check_iov_len(iov, iovcnt, num_blocks, block_size);
+ if (rc) {
+ status = SPDK_BDEV_IO_STATUS_FAILED;
+ goto end;
+ }
+
+	SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "io %lu blocks from offset %#lx\n",
+ num_blocks, offset_blocks);
+
+ for (nbytes = num_blocks * block_size; nbytes > 0; iov++) {
+ len = spdk_min(iov->iov_len, nbytes);
+ nbytes -= len;
+
+ offset = 0;
+ while (offset != len) {
+ rc = fn(pdisk->pool, iov->iov_base + offset, offset_blocks);
+ if (rc != 0) {
+ SPDK_ERRLOG("pmemblk io failed: %d (%s)\n", errno, pmemblk_errormsg());
+ status = SPDK_BDEV_IO_STATUS_FAILED;
+ goto end;
+ }
+
+ offset += block_size;
+ offset_blocks++;
+ }
+ }
+
+ assert(num_blocks == offset_blocks - bdev_io->u.bdev.offset_blocks);
+ status = SPDK_BDEV_IO_STATUS_SUCCESS;
+end:
+
+ spdk_bdev_io_complete(bdev_io, status);
+}
+
+static void
+bdev_pmem_write_zeros(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk,
+ struct spdk_io_channel *ch, uint64_t offset_blocks,
+ uint64_t num_blocks, uint32_t block_size)
+{
+ int rc;
+ enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ while (num_blocks > 0) {
+ rc = pmemblk_set_zero(pdisk->pool, offset_blocks);
+ if (rc != 0) {
+ SPDK_ERRLOG("pmemblk_set_zero failed: %d (%s)\n", errno, pmemblk_errormsg());
+ status = SPDK_BDEV_IO_STATUS_FAILED;
+ break;
+ }
+ offset_blocks++;
+ num_blocks--;
+ }
+ spdk_bdev_io_complete(bdev_io, status);
+}
+
+static void
+bdev_pmem_io_get_buf_cb(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ bdev_pmem_submit_io(bdev_io,
+ bdev_io->bdev->ctxt,
+ channel,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->bdev->blocklen,
+ _bdev_pmem_submit_io_read);
+}
+
+static void
+bdev_pmem_submit_request(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, bdev_pmem_io_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ bdev_pmem_submit_io(bdev_io,
+ bdev_io->bdev->ctxt,
+ channel,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->bdev->blocklen,
+ _bdev_pmem_submit_io_write);
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ bdev_pmem_write_zeros(bdev_io,
+ bdev_io->bdev->ctxt,
+ channel,
+ bdev_io->u.bdev.offset_blocks,
+ bdev_io->u.bdev.num_blocks,
+ bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ break;
+ default:
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static bool
+bdev_pmem_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct spdk_io_channel *
+bdev_pmem_get_io_channel(void *ctx)
+{
+ return spdk_get_io_channel(&g_pmem_disks);
+}
+
+static int
+bdev_pmem_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct pmem_disk *pdisk = ctx;
+
+ spdk_json_write_named_object_begin(w, "pmem");
+ spdk_json_write_named_string(w, "pmem_file", pdisk->pmem_file);
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static int
+bdev_pmem_create_cb(void *io_device, void *ctx_buf)
+{
+ return 0;
+}
+
+static void
+bdev_pmem_destroy_cb(void *io_device, void *ctx_buf)
+{
+}
+
+static void
+bdev_pmem_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct pmem_disk *disk = bdev->ctxt;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_pmem_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ spdk_json_write_named_string(w, "pmem_file", disk->pmem_file);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static const struct spdk_bdev_fn_table pmem_fn_table = {
+ .destruct = bdev_pmem_destruct,
+ .submit_request = bdev_pmem_submit_request,
+ .io_type_supported = bdev_pmem_io_type_supported,
+ .get_io_channel = bdev_pmem_get_io_channel,
+ .dump_info_json = bdev_pmem_dump_info_json,
+ .write_config_json = bdev_pmem_write_config_json,
+};
+
+int
+create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev)
+{
+ uint64_t num_blocks;
+ uint32_t block_size;
+ struct pmem_disk *pdisk;
+ int rc;
+
+ *bdev = NULL;
+
+ if (name == NULL) {
+ SPDK_ERRLOG("Missing name parameter for create_pmem_disk()\n");
+ return -EINVAL;
+ }
+
+ if (pmemblk_check(pmem_file, 0) != 1) {
+ SPDK_ERRLOG("Pool '%s' check failed: %s\n", pmem_file, pmemblk_errormsg());
+ return -EIO;
+ }
+
+ pdisk = calloc(1, sizeof(*pdisk));
+ if (!pdisk) {
+ return -ENOMEM;
+ }
+
+ snprintf(pdisk->pmem_file, sizeof(pdisk->pmem_file), "%s", pmem_file);
+ pdisk->pool = pmemblk_open(pmem_file, 0);
+ if (!pdisk->pool) {
+ SPDK_ERRLOG("Opening pmem pool '%s' failed: %d\n", pmem_file, errno);
+ free(pdisk);
+ return -errno;
+ }
+
+ block_size = pmemblk_bsize(pdisk->pool);
+ num_blocks = pmemblk_nblock(pdisk->pool);
+
+ if (block_size == 0) {
+ SPDK_ERRLOG("Block size must be more than 0 bytes\n");
+ pmemblk_close(pdisk->pool);
+ free(pdisk);
+ return -EINVAL;
+ }
+
+ if (num_blocks == 0) {
+ SPDK_ERRLOG("Disk must be more than 0 blocks\n");
+ pmemblk_close(pdisk->pool);
+ free(pdisk);
+ return -EINVAL;
+ }
+
+ pdisk->disk.name = strdup(name);
+ if (!pdisk->disk.name) {
+ pmemblk_close(pdisk->pool);
+ free(pdisk);
+ return -ENOMEM;
+ }
+
+ pdisk->disk.product_name = "pmemblk disk";
+ pdisk->disk.write_cache = 0;
+ pdisk->disk.blocklen = block_size;
+ pdisk->disk.blockcnt = num_blocks;
+
+ pdisk->disk.ctxt = pdisk;
+ pdisk->disk.fn_table = &pmem_fn_table;
+ pdisk->disk.module = &pmem_if;
+
+ rc = spdk_bdev_register(&pdisk->disk);
+ if (rc) {
+ pmemblk_close(pdisk->pool);
+ free(pdisk->disk.name);
+ free(pdisk);
+ return rc;
+ }
+
+ TAILQ_INSERT_TAIL(&g_pmem_disks, pdisk, tailq);
+
+ *bdev = &pdisk->disk;
+
+ return 0;
+}
+
+void
+delete_pmem_disk(struct spdk_bdev *bdev, spdk_delete_pmem_complete cb_fn, void *cb_arg)
+{
+ if (!bdev || bdev->module != &pmem_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
+static void
+bdev_pmem_read_conf(void)
+{
+ struct spdk_conf_section *sp;
+ struct spdk_bdev *bdev;
+ const char *pmem_file;
+ const char *bdev_name;
+ int i;
+
+ sp = spdk_conf_find_section(NULL, "Pmem");
+ if (sp == NULL) {
+ return;
+ }
+
+ for (i = 0; ; i++) {
+ if (!spdk_conf_section_get_nval(sp, "Blk", i)) {
+ break;
+ }
+
+ pmem_file = spdk_conf_section_get_nmval(sp, "Blk", i, 0);
+ if (pmem_file == NULL) {
+ SPDK_ERRLOG("Pmem: missing filename\n");
+ continue;
+ }
+
+ bdev_name = spdk_conf_section_get_nmval(sp, "Blk", i, 1);
+ if (bdev_name == NULL) {
+ SPDK_ERRLOG("Pmem: missing bdev name\n");
+ continue;
+ }
+
+ create_pmem_disk(pmem_file, bdev_name, &bdev);
+ }
+}
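+
+/* Illustrative only (annotation, not in the upstream file): a legacy INI-style
+ * section matched by the loop above; the pool path and bdev name are
+ * hypothetical. Each "Blk" line names a pmem pool file and the bdev to create:
+ *
+ *   [Pmem]
+ *     Blk /mnt/pmem/pool0 Pmem0
+ */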
+
+static int
+bdev_pmem_initialize(void)
+{
+ const char *err = pmemblk_check_version(PMEMBLK_MAJOR_VERSION, PMEMBLK_MINOR_VERSION);
+
+ if (err != NULL) {
+ SPDK_ERRLOG("Invalid libpmemblk version (expected %d.%d): %s\n", PMEMBLK_MAJOR_VERSION,
+ PMEMBLK_MINOR_VERSION, err);
+ return -1;
+ }
+
+#ifdef SPDK_CONFIG_DEBUG
+ setenv("PMEMBLK_LOG_LEVEL", "1", 1);
+#endif
+ spdk_io_device_register(&g_pmem_disks, bdev_pmem_create_cb, bdev_pmem_destroy_cb, 0, "pmem_bdev");
+
+ bdev_pmem_read_conf();
+
+ return 0;
+}
+
+static void
+bdev_pmem_finish_done(void *io_device)
+{
+ spdk_bdev_module_finish_done();
+}
+
+static void
+bdev_pmem_finish(void)
+{
+ spdk_io_device_unregister(&g_pmem_disks, bdev_pmem_finish_done);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_pmem", SPDK_LOG_BDEV_PMEM)
diff --git a/src/spdk/module/bdev/pmem/bdev_pmem.h b/src/spdk/module/bdev/pmem/bdev_pmem.h
new file mode 100644
index 000000000..d9292b114
--- /dev/null
+++ b/src/spdk/module/bdev/pmem/bdev_pmem.h
@@ -0,0 +1,64 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_PMEM_H
+#define SPDK_BDEV_PMEM_H
+
+#include "spdk/bdev.h"
+
+typedef void (*spdk_delete_pmem_complete)(void *cb_arg, int bdeverrno);
+
+/**
+ * Create new pmem bdev.
+ *
+ * \param pmem_file Pointer to pmem pool file.
+ * \param name Bdev name.
+ * \param bdev Output parameter for the created bdev when the operation is successful.
+ * \return 0 on success.
+ *         -EIO if the pool check failed.
+ *         -EINVAL if input parameter validation failed.
+ *         -ENOMEM if a buffer cannot be allocated.
+ */
+int create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev);
+
+/**
+ * Delete pmem bdev.
+ *
+ * \param bdev Pointer to pmem bdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void delete_pmem_disk(struct spdk_bdev *bdev, spdk_delete_pmem_complete cb_fn,
+ void *cb_arg);
+
+#endif /* SPDK_BDEV_PMEM_H */
diff --git a/src/spdk/module/bdev/pmem/bdev_pmem_rpc.c b/src/spdk/module/bdev/pmem/bdev_pmem_rpc.c
new file mode 100644
index 000000000..2af7c1c7a
--- /dev/null
+++ b/src/spdk/module/bdev/pmem/bdev_pmem_rpc.c
@@ -0,0 +1,337 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_pmem.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "libpmemblk.h"
+
+#include "spdk_internal/log.h"
+
+struct rpc_construct_pmem {
+ char *pmem_file;
+ char *name;
+};
+
+static void
+free_rpc_bdev_pmem_create(struct rpc_construct_pmem *req)
+{
+ free(req->pmem_file);
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_construct_pmem_decoders[] = {
+ {"pmem_file", offsetof(struct rpc_construct_pmem, pmem_file), spdk_json_decode_string},
+ {"name", offsetof(struct rpc_construct_pmem, name), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_pmem_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_pmem req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_bdev *bdev;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_pmem_decoders,
+ SPDK_COUNTOF(rpc_construct_pmem_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+ rc = create_pmem_disk(req.pmem_file, req.name, &bdev);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, spdk_bdev_get_name(bdev));
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_pmem_create(&req);
+}
+SPDK_RPC_REGISTER("bdev_pmem_create", rpc_bdev_pmem_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_create, construct_pmem_bdev)
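+
+/* Illustrative only (annotation, not in the upstream file): a JSON-RPC request
+ * for this method; the pool path and bdev name are hypothetical. The pool file
+ * must already exist, e.g. created via bdev_pmem_create_pool below:
+ *
+ *   { "jsonrpc": "2.0", "id": 1, "method": "bdev_pmem_create",
+ *     "params": { "pmem_file": "/mnt/pmem/pool0", "name": "Pmem0" } }
+ */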
+
+struct rpc_delete_pmem {
+ char *name;
+};
+
+static void
+free_rpc_delete_pmem(struct rpc_delete_pmem *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_pmem_decoders[] = {
+ {"name", offsetof(struct rpc_delete_pmem, name), spdk_json_decode_string},
+};
+
+static void
+_rpc_bdev_pmem_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_pmem_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_pmem req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_pmem_decoders,
+ SPDK_COUNTOF(rpc_delete_pmem_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+		goto cleanup;
+	}
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "bdev '%s' does not exist\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ delete_pmem_disk(bdev, _rpc_bdev_pmem_delete_cb, request);
+
+cleanup:
+ free_rpc_delete_pmem(&req);
+}
+SPDK_RPC_REGISTER("bdev_pmem_delete", rpc_bdev_pmem_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_delete, delete_pmem_bdev)
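+
+/* Illustrative only (annotation, not in the upstream file): the matching delete
+ * request; the name is a hypothetical example:
+ *
+ *   { "jsonrpc": "2.0", "id": 2, "method": "bdev_pmem_delete",
+ *     "params": { "name": "Pmem0" } }
+ */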
+
+struct rpc_bdev_pmem_create_pool {
+ char *pmem_file;
+ uint64_t num_blocks;
+ uint32_t block_size;
+};
+
+static const struct spdk_json_object_decoder rpc_bdev_pmem_create_pool_decoders[] = {
+ {"pmem_file", offsetof(struct rpc_bdev_pmem_create_pool, pmem_file), spdk_json_decode_string},
+ {"num_blocks", offsetof(struct rpc_bdev_pmem_create_pool, num_blocks), spdk_json_decode_uint64},
+ {"block_size", offsetof(struct rpc_bdev_pmem_create_pool, block_size), spdk_json_decode_uint32},
+};
+
+static void
+free_rpc_bdev_pmem_create_pool(struct rpc_bdev_pmem_create_pool *req)
+{
+ free(req->pmem_file);
+}
+
+static void
+rpc_bdev_pmem_create_pool(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_pmem_create_pool req = {};
+ struct spdk_json_write_ctx *w;
+ uint64_t pool_size;
+ PMEMblkpool *pbp;
+
+ if (spdk_json_decode_object(params, rpc_bdev_pmem_create_pool_decoders,
+ SPDK_COUNTOF(rpc_bdev_pmem_create_pool_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ /* libpmemblk pool has to contain at least 256 blocks */
+ if (req.num_blocks < 256) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL,
+ "Pmem pool num_blocks must be at least 256");
+ goto cleanup;
+ }
+
+ pool_size = req.num_blocks * req.block_size;
+ if (pool_size < PMEMBLK_MIN_POOL) {
+		spdk_jsonrpc_send_error_response_fmt(request, -EINVAL,
+				"Pmem pool size must be at least %zu", PMEMBLK_MIN_POOL);
+ goto cleanup;
+ }
+
+ pbp = pmemblk_create(req.pmem_file, req.block_size, pool_size, 0666);
+ if (pbp == NULL) {
+ const char *msg = pmemblk_errormsg();
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "pmemblk_create() failed: %s\n", msg ? msg : "(logs disabled)");
+ spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "pmemblk_create failed: %s", msg ? msg : "(logs disabled)");
+ goto cleanup;
+ }
+
+ pmemblk_close(pbp);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_pmem_create_pool(&req);
+}
+SPDK_RPC_REGISTER("bdev_pmem_create_pool", rpc_bdev_pmem_create_pool, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_create_pool, create_pmem_pool)
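+
+/* Illustrative only (annotation, not in the upstream file): a request that
+ * satisfies both checks above (num_blocks >= 256 and num_blocks * block_size
+ * >= PMEMBLK_MIN_POOL); the path and geometry are hypothetical, and the result
+ * is "true" on success:
+ *
+ *   { "jsonrpc": "2.0", "id": 1, "method": "bdev_pmem_create_pool",
+ *     "params": { "pmem_file": "/mnt/pmem/pool0",
+ *                 "num_blocks": 131072, "block_size": 512 } }
+ */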
+
+struct rpc_bdev_pmem_get_pool_info {
+ char *pmem_file;
+};
+
+static const struct spdk_json_object_decoder rpc_bdev_pmem_get_pool_info_decoders[] = {
+ {"pmem_file", offsetof(struct rpc_bdev_pmem_get_pool_info, pmem_file), spdk_json_decode_string},
+};
+
+static void
+free_rpc_bdev_pmem_get_pool_info(struct rpc_bdev_pmem_get_pool_info *req)
+{
+ free(req->pmem_file);
+}
+
+static void
+rpc_bdev_pmem_get_pool_info(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_pmem_get_pool_info req = {};
+ struct spdk_json_write_ctx *w;
+ size_t num_blocks, block_size;
+ PMEMblkpool *pbp;
+
+ if (spdk_json_decode_object(params, rpc_bdev_pmem_get_pool_info_decoders,
+ SPDK_COUNTOF(rpc_bdev_pmem_get_pool_info_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ pbp = pmemblk_open(req.pmem_file, 0);
+ if (pbp == NULL) {
+ const char *msg = pmemblk_errormsg();
+
+ spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "pmemblk_open failed: %s", msg ? msg : "(logs disabled)");
+ goto cleanup;
+ }
+
+ block_size = pmemblk_bsize(pbp);
+ num_blocks = pmemblk_nblock(pbp);
+
+ pmemblk_close(pbp);
+
+ /* Check pmem pool consistency */
+ if (pmemblk_check(req.pmem_file, block_size) != 1) {
+ const char *msg = pmemblk_errormsg();
+
+ spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "pmemblk_check failed: %s", msg ? msg : "(logs disabled)");
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_uint64(w, "num_blocks", num_blocks);
+ spdk_json_write_named_uint64(w, "block_size", block_size);
+ spdk_json_write_object_end(w);
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_pmem_get_pool_info(&req);
+}
+SPDK_RPC_REGISTER("bdev_pmem_get_pool_info", rpc_bdev_pmem_get_pool_info, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_get_pool_info, pmem_pool_info)
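+
+/* Illustrative only (annotation, not in the upstream file): querying a pool
+ * returns its geometry as a one-element array, matching the writer code above;
+ * the path is hypothetical and num_blocks is whatever usable block count
+ * libpmemblk reports:
+ *
+ *   request: { "jsonrpc": "2.0", "id": 2, "method": "bdev_pmem_get_pool_info",
+ *              "params": { "pmem_file": "/mnt/pmem/pool0" } }
+ *   result:  [ { "num_blocks": <usable blocks>, "block_size": 512 } ]
+ */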
+
+struct rpc_bdev_pmem_delete_pool {
+ char *pmem_file;
+};
+
+static const struct spdk_json_object_decoder rpc_bdev_pmem_delete_pool_decoders[] = {
+ {"pmem_file", offsetof(struct rpc_bdev_pmem_delete_pool, pmem_file), spdk_json_decode_string},
+};
+
+static void
+free_rpc_bdev_pmem_delete_pool(struct rpc_bdev_pmem_delete_pool *req)
+{
+ free(req->pmem_file);
+}
+
+static void
+rpc_bdev_pmem_delete_pool(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_pmem_delete_pool req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_pmem_delete_pool_decoders,
+ SPDK_COUNTOF(rpc_bdev_pmem_delete_pool_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+	/* Check that the file is actually a pmem pool */
+ rc = pmemblk_check(req.pmem_file, 0);
+ if (rc != 1) {
+ const char *msg = pmemblk_errormsg();
+
+ spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "pmemblk_check failed: %s", msg ? msg : "(logs disabled)");
+ goto cleanup;
+ }
+
+ unlink(req.pmem_file);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_pmem_delete_pool(&req);
+}
+SPDK_RPC_REGISTER("bdev_pmem_delete_pool", rpc_bdev_pmem_delete_pool, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_pmem_delete_pool, delete_pmem_pool)
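+
+/* Illustrative only (annotation, not in the upstream file): deleting a pool
+ * file; the path is hypothetical. Note that this method unlinks the pool file
+ * itself, not any bdev built on top of it:
+ *
+ *   { "jsonrpc": "2.0", "id": 3, "method": "bdev_pmem_delete_pool",
+ *     "params": { "pmem_file": "/mnt/pmem/pool0" } }
+ */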
diff --git a/src/spdk/module/bdev/raid/Makefile b/src/spdk/module/bdev/raid/Makefile
new file mode 100644
index 000000000..452d32e79
--- /dev/null
+++ b/src/spdk/module/bdev/raid/Makefile
@@ -0,0 +1,51 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/
+C_SRCS = bdev_raid.c bdev_raid_rpc.c raid0.c
+
+ifeq ($(CONFIG_RAID5),y)
+C_SRCS += raid5.c
+endif
+
+LIBNAME = bdev_raid
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/raid/bdev_raid.c b/src/spdk/module/bdev/raid/bdev_raid.c
new file mode 100644
index 000000000..10da1a799
--- /dev/null
+++ b/src/spdk/module/bdev/raid/bdev_raid.c
@@ -0,0 +1,1719 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_raid.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/conf.h"
+#include "spdk_internal/log.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/json.h"
+#include "spdk/string.h"
+
+static bool g_shutdown_started = false;
+
+/* raid bdev config as read from config file */
+struct raid_config g_raid_config = {
+ .raid_bdev_config_head = TAILQ_HEAD_INITIALIZER(g_raid_config.raid_bdev_config_head),
+};
+
+/*
+ * List of raid bdevs in the configured state; these raid bdevs are registered
+ * with the bdev layer
+ */
+struct raid_configured_tailq g_raid_bdev_configured_list = TAILQ_HEAD_INITIALIZER(
+ g_raid_bdev_configured_list);
+
+/* List of raid bdevs in the configuring state */
+struct raid_configuring_tailq g_raid_bdev_configuring_list = TAILQ_HEAD_INITIALIZER(
+ g_raid_bdev_configuring_list);
+
+/* List of all raid bdevs */
+struct raid_all_tailq g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list);
+
+/* List of all raid bdevs that are offline */
+struct raid_offline_tailq g_raid_bdev_offline_list = TAILQ_HEAD_INITIALIZER(
+ g_raid_bdev_offline_list);
+
+static TAILQ_HEAD(, raid_bdev_module) g_raid_modules = TAILQ_HEAD_INITIALIZER(g_raid_modules);
+
+static struct raid_bdev_module *
+raid_bdev_module_find(enum raid_level level)
+{
+ struct raid_bdev_module *raid_module;
+
+ TAILQ_FOREACH(raid_module, &g_raid_modules, link) {
+ if (raid_module->level == level) {
+ return raid_module;
+ }
+ }
+
+ return NULL;
+}
+
+void
+raid_bdev_module_list_add(struct raid_bdev_module *raid_module)
+{
+ if (raid_bdev_module_find(raid_module->level) != NULL) {
+ SPDK_ERRLOG("module for raid level '%s' already registered.\n",
+ raid_bdev_level_to_str(raid_module->level));
+ assert(false);
+ } else {
+ TAILQ_INSERT_TAIL(&g_raid_modules, raid_module, link);
+ }
+}
+
+/* Function declarations */
+static void raid_bdev_examine(struct spdk_bdev *bdev);
+static int raid_bdev_init(void);
+static void raid_bdev_deconfigure(struct raid_bdev *raid_bdev,
+ raid_bdev_destruct_cb cb_fn, void *cb_arg);
+static void raid_bdev_remove_base_bdev(void *ctx);
+
+/*
+ * brief:
+ * raid_bdev_create_cb is the io channel create callback for the raid bdev. It
+ * builds the hierarchy from the raid bdev io channel to the base bdev io
+ * channels. It will be called once per core
+ * params:
+ * io_device - pointer to raid bdev io device represented by raid_bdev
+ * ctx_buf - pointer to context buffer for raid bdev io channel
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_create_cb(void *io_device, void *ctx_buf)
+{
+ struct raid_bdev *raid_bdev = io_device;
+ struct raid_bdev_io_channel *raid_ch = ctx_buf;
+ uint8_t i;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_create_cb, %p\n", raid_ch);
+
+ assert(raid_bdev != NULL);
+ assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE);
+
+ raid_ch->num_channels = raid_bdev->num_base_bdevs;
+
+ raid_ch->base_channel = calloc(raid_ch->num_channels,
+ sizeof(struct spdk_io_channel *));
+ if (!raid_ch->base_channel) {
+ SPDK_ERRLOG("Unable to allocate base bdevs io channel\n");
+ return -ENOMEM;
+ }
+ for (i = 0; i < raid_ch->num_channels; i++) {
+ /*
+ * Get the spdk_io_channel for all the base bdevs. This is used during
+ * split logic to send the respective child bdev ios to respective base
+ * bdev io channel.
+ */
+ raid_ch->base_channel[i] = spdk_bdev_get_io_channel(
+ raid_bdev->base_bdev_info[i].desc);
+ if (!raid_ch->base_channel[i]) {
+ uint8_t j;
+
+ for (j = 0; j < i; j++) {
+ spdk_put_io_channel(raid_ch->base_channel[j]);
+ }
+ free(raid_ch->base_channel);
+ raid_ch->base_channel = NULL;
+ SPDK_ERRLOG("Unable to create io channel for base bdev\n");
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * brief:
+ * raid_bdev_destroy_cb is the io channel destroy callback for the raid bdev. It
+ * tears down the hierarchy from the raid bdev io channel to the base bdev io
+ * channels. It will be called once per core
+ * params:
+ * io_device - pointer to raid bdev io device represented by raid_bdev
+ * ctx_buf - pointer to context buffer for raid bdev io channel
+ * returns:
+ * none
+ */
+static void
+raid_bdev_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct raid_bdev_io_channel *raid_ch = ctx_buf;
+ uint8_t i;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destroy_cb\n");
+
+ assert(raid_ch != NULL);
+ assert(raid_ch->base_channel);
+ for (i = 0; i < raid_ch->num_channels; i++) {
+ /* Free base bdev channels */
+ assert(raid_ch->base_channel[i] != NULL);
+ spdk_put_io_channel(raid_ch->base_channel[i]);
+ }
+ free(raid_ch->base_channel);
+ raid_ch->base_channel = NULL;
+}
+
+/*
+ * brief:
+ * raid_bdev_cleanup is used to cleanup and free raid_bdev related data
+ * structures.
+ * params:
+ * raid_bdev - pointer to raid_bdev
+ * returns:
+ * none
+ */
+static void
+raid_bdev_cleanup(struct raid_bdev *raid_bdev)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_cleanup, %p name %s, state %u, config %p\n",
+ raid_bdev,
+ raid_bdev->bdev.name, raid_bdev->state, raid_bdev->config);
+ if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
+ TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
+ } else if (raid_bdev->state == RAID_BDEV_STATE_OFFLINE) {
+ TAILQ_REMOVE(&g_raid_bdev_offline_list, raid_bdev, state_link);
+ } else {
+ assert(0);
+ }
+ TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link);
+ free(raid_bdev->bdev.name);
+ free(raid_bdev->base_bdev_info);
+ if (raid_bdev->config) {
+ raid_bdev->config->raid_bdev = NULL;
+ }
+ free(raid_bdev);
+}
+
+/*
+ * brief:
+ * wrapper for the bdev close operation, sent as a message to the thread
+ * that opened the descriptor
+ * params:
+ * ctx - pointer to the base bdev descriptor to close
+ * returns:
+ * none
+ */
+static void
+_raid_bdev_free_base_bdev_resource(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+
+/*
+ * brief:
+ * free resource of base bdev for raid bdev
+ * params:
+ * raid_bdev - pointer to raid bdev
+ * base_info - raid base bdev info
+ * returns:
+ * none
+ */
+static void
+raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev,
+ struct raid_base_bdev_info *base_info)
+{
+ spdk_bdev_module_release_bdev(base_info->bdev);
+ if (base_info->thread && base_info->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(base_info->thread, _raid_bdev_free_base_bdev_resource, base_info->desc);
+ } else {
+ spdk_bdev_close(base_info->desc);
+ }
+ base_info->desc = NULL;
+ base_info->bdev = NULL;
+
+ assert(raid_bdev->num_base_bdevs_discovered);
+ raid_bdev->num_base_bdevs_discovered--;
+}
+
+/*
+ * brief:
+ * raid_bdev_destruct is the destruct function table pointer for raid bdev
+ * params:
+ * ctxt - pointer to raid_bdev
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_destruct(void *ctxt)
+{
+ struct raid_bdev *raid_bdev = ctxt;
+ struct raid_base_bdev_info *base_info;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destruct\n");
+
+ raid_bdev->destruct_called = true;
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+		/*
+		 * Close all base bdev descriptors for which the remove callback
+		 * has come from the lower layers. Also close the descriptors if
+		 * shutdown has started.
+		 */
+ if (g_shutdown_started ||
+ ((base_info->remove_scheduled == true) &&
+ (base_info->bdev != NULL))) {
+ raid_bdev_free_base_bdev_resource(raid_bdev, base_info);
+ }
+ }
+
+ if (g_shutdown_started) {
+ TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
+ if (raid_bdev->module->stop != NULL) {
+ raid_bdev->module->stop(raid_bdev);
+ }
+ raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
+ TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
+ }
+
+ spdk_io_device_unregister(raid_bdev, NULL);
+
+ if (raid_bdev->num_base_bdevs_discovered == 0) {
+ /* Free raid_bdev when there are no base bdevs left */
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev base bdevs is 0, going to free all in destruct\n");
+ raid_bdev_cleanup(raid_bdev);
+ }
+
+ return 0;
+}
+
+void
+raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status)
+{
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
+
+ spdk_bdev_io_complete(bdev_io, status);
+}
+
+/*
+ * brief:
+ * raid_bdev_io_complete_part - signal the completion of a part of the expected
+ * base bdev IOs and complete the raid_io if this is the final expected IO.
+ * The caller should first set raid_io->base_bdev_io_remaining. This function
+ * will decrement this counter by the value of the 'completed' parameter and
+ * complete the raid_io if the counter reaches 0. The caller is free to
+ * interpret the 'base_bdev_io_remaining' and 'completed' values as needed,
+ * it can represent e.g. blocks or IOs.
+ * params:
+ * raid_io - pointer to raid_bdev_io
+ * completed - the part of the raid_io that has been completed
+ * status - status of the base IO
+ * returns:
+ * true - if the raid_io is completed
+ * false - otherwise
+ */
+bool
+raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed,
+ enum spdk_bdev_io_status status)
+{
+ assert(raid_io->base_bdev_io_remaining >= completed);
+ raid_io->base_bdev_io_remaining -= completed;
+
+ if (status != SPDK_BDEV_IO_STATUS_SUCCESS) {
+ raid_io->base_bdev_io_status = status;
+ }
+
+ if (raid_io->base_bdev_io_remaining == 0) {
+ raid_bdev_io_complete(raid_io, raid_io->base_bdev_io_status);
+ return true;
+ } else {
+ return false;
+ }
+}
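+
+/*
+ * Illustration (hypothetical numbers): a raid module that splits one raid_io
+ * into three base bdev IOs first sets raid_io->base_bdev_io_remaining = 3.
+ * Each base IO completion then calls
+ * raid_bdev_io_complete_part(raid_io, 1, status); the third call drops the
+ * counter to 0, completes the raid_io with the worst status seen, and
+ * returns true.
+ */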
+
+/*
+ * brief:
+ * raid_bdev_queue_io_wait function processes the IO which failed to submit.
+ * It will try to queue the IOs after storing the context to bdev wait queue logic.
+ * params:
+ * raid_io - pointer to raid_bdev_io
+ * bdev - the block device that the IO is submitted to
+ * ch - io channel
+ * cb_fn - callback when the spdk_bdev_io for bdev becomes available
+ * returns:
+ * none
+ */
+void
+raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev,
+ struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn)
+{
+ raid_io->waitq_entry.bdev = bdev;
+ raid_io->waitq_entry.cb_fn = cb_fn;
+ raid_io->waitq_entry.cb_arg = raid_io;
+ spdk_bdev_queue_io_wait(bdev, ch, &raid_io->waitq_entry);
+}
+
+static void
+raid_base_bdev_reset_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct raid_bdev_io *raid_io = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+
+ raid_bdev_io_complete_part(raid_io, 1, success ?
+ SPDK_BDEV_IO_STATUS_SUCCESS :
+ SPDK_BDEV_IO_STATUS_FAILED);
+}
+
+static void
+raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io);
+
+static void
+_raid_bdev_submit_reset_request(void *_raid_io)
+{
+ struct raid_bdev_io *raid_io = _raid_io;
+
+ raid_bdev_submit_reset_request(raid_io);
+}
+
+/*
+ * brief:
+ * raid_bdev_submit_reset_request function submits reset requests to the
+ * member disks; it submits as many as possible unless a reset fails with
+ * -ENOMEM, in which case the request is queued for later submission
+ * params:
+ * raid_io
+ * returns:
+ * none
+ */
+static void
+raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io)
+{
+ struct raid_bdev *raid_bdev;
+ int ret;
+ uint8_t i;
+ struct raid_base_bdev_info *base_info;
+ struct spdk_io_channel *base_ch;
+
+ raid_bdev = raid_io->raid_bdev;
+
+ if (raid_io->base_bdev_io_remaining == 0) {
+ raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs;
+ }
+
+ while (raid_io->base_bdev_io_submitted < raid_bdev->num_base_bdevs) {
+ i = raid_io->base_bdev_io_submitted;
+ base_info = &raid_bdev->base_bdev_info[i];
+ base_ch = raid_io->raid_ch->base_channel[i];
+ ret = spdk_bdev_reset(base_info->desc, base_ch,
+ raid_base_bdev_reset_complete, raid_io);
+ if (ret == 0) {
+ raid_io->base_bdev_io_submitted++;
+ } else if (ret == -ENOMEM) {
+ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
+ _raid_bdev_submit_reset_request);
+ return;
+ } else {
+ SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
+ assert(false);
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+ }
+}
+
+/*
+ * brief:
+ * Callback function to spdk_bdev_io_get_buf.
+ * params:
+ * ch - pointer to raid bdev io channel
+ * bdev_io - pointer to parent bdev_io on raid bdev device
+ * success - True if buffer is allocated or false otherwise.
+ * returns:
+ * none
+ */
+static void
+raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
+
+ if (!success) {
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ raid_io->raid_bdev->module->submit_rw_request(raid_io);
+}
+
+/*
+ * brief:
+ * raid_bdev_submit_request function is the submit_request function pointer of
+ * raid bdev function table. This is used to submit the io on raid_bdev to below
+ * layers.
+ * params:
+ * ch - pointer to raid bdev io channel
+ * bdev_io - pointer to parent bdev_io on raid bdev device
+ * returns:
+ * none
+ */
+static void
+raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx;
+
+ raid_io->raid_bdev = bdev_io->bdev->ctxt;
+ raid_io->raid_ch = spdk_io_channel_get_ctx(ch);
+ raid_io->base_bdev_io_remaining = 0;
+ raid_io->base_bdev_io_submitted = 0;
+ raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
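+		/* A read may arrive without a data buffer attached; allocate one
+		 * before dispatching the request to the raid level module.
+		 */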
+ spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ raid_io->raid_bdev->module->submit_rw_request(raid_io);
+ break;
+
+ case SPDK_BDEV_IO_TYPE_RESET:
+ raid_bdev_submit_reset_request(raid_io);
+ break;
+
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ raid_io->raid_bdev->module->submit_null_payload_request(raid_io);
+ break;
+
+ default:
+ SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type);
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+ break;
+ }
+}
+
+/*
+ * brief:
+ * _raid_bdev_io_type_supported checks whether io_type is supported by
+ * all base bdevs of the raid bdev. If any one of the base bdevs does not
+ * support it, the raid device does not support it either.
+ *
+ * params:
+ * raid_bdev - pointer to raid bdev context
+ * io_type - io type
+ * returns:
+ * true - io_type is supported
+ * false - io_type is not supported
+ */
+static inline bool
+_raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type)
+{
+ struct raid_base_bdev_info *base_info;
+
+ if (io_type == SPDK_BDEV_IO_TYPE_FLUSH ||
+ io_type == SPDK_BDEV_IO_TYPE_UNMAP) {
+ if (raid_bdev->module->submit_null_payload_request == NULL) {
+ return false;
+ }
+ }
+
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+ if (base_info->bdev == NULL) {
+ assert(false);
+ continue;
+ }
+
+ if (spdk_bdev_io_type_supported(base_info->bdev, io_type) == false) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * brief:
+ * raid_bdev_io_type_supported is the io_supported function for bdev function
+ * table which returns whether the particular io type is supported or not by
+ * raid bdev module
+ * params:
+ * ctx - pointer to raid bdev context
+ * type - io type
+ * returns:
+ * true - io_type is supported
+ * false - io_type is not supported
+ */
+static bool
+raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return true;
+
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ return _raid_bdev_io_type_supported(ctx, io_type);
+
+ default:
+ return false;
+ }
+}
+
+/*
+ * brief:
+ * raid_bdev_get_io_channel is the get_io_channel function table pointer for
+ * raid bdev. This is used to return the io channel for this raid bdev
+ * params:
+ * ctxt - pointer to raid_bdev
+ * returns:
+ * pointer to io channel for raid bdev
+ */
+static struct spdk_io_channel *
+raid_bdev_get_io_channel(void *ctxt)
+{
+ struct raid_bdev *raid_bdev = ctxt;
+
+ return spdk_get_io_channel(raid_bdev);
+}
+
+/*
+ * brief:
+ * raid_bdev_dump_info_json is the function table pointer for raid bdev
+ * params:
+ * ctx - pointer to raid_bdev
+ * w - pointer to json context
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct raid_bdev *raid_bdev = ctx;
+ struct raid_base_bdev_info *base_info;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_dump_config_json\n");
+ assert(raid_bdev != NULL);
+
+ /* Dump the raid bdev configuration related information */
+ spdk_json_write_named_object_begin(w, "raid");
+ spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size);
+ spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb);
+ spdk_json_write_named_uint32(w, "state", raid_bdev->state);
+ spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level));
+ spdk_json_write_named_uint32(w, "destruct_called", raid_bdev->destruct_called);
+ spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs);
+ spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered);
+ spdk_json_write_name(w, "base_bdevs_list");
+ spdk_json_write_array_begin(w);
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+ if (base_info->bdev) {
+ spdk_json_write_string(w, base_info->bdev->name);
+ } else {
+ spdk_json_write_null(w);
+ }
+ }
+ spdk_json_write_array_end(w);
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+/*
+ * brief:
+ * raid_bdev_write_config_json is the function table pointer for raid bdev
+ * params:
+ * bdev - pointer to spdk_bdev
+ * w - pointer to json context
+ * returns:
+ * none
+ */
+static void
+raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct raid_bdev *raid_bdev = bdev->ctxt;
+ struct raid_base_bdev_info *base_info;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_raid_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size_kb);
+ spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level));
+
+ spdk_json_write_named_array_begin(w, "base_bdevs");
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+ if (base_info->bdev) {
+ spdk_json_write_string(w, base_info->bdev->name);
+ }
+ }
+ spdk_json_write_array_end(w);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+/* g_raid_bdev_fn_table is the function table for raid bdev */
+static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = {
+ .destruct = raid_bdev_destruct,
+ .submit_request = raid_bdev_submit_request,
+ .io_type_supported = raid_bdev_io_type_supported,
+ .get_io_channel = raid_bdev_get_io_channel,
+ .dump_info_json = raid_bdev_dump_info_json,
+ .write_config_json = raid_bdev_write_config_json,
+};
+
+/*
+ * brief:
+ * raid_bdev_config_cleanup function is used to free memory for one raid_bdev in configuration
+ * params:
+ * raid_cfg - pointer to raid_bdev_config structure
+ * returns:
+ * none
+ */
+void
+raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg)
+{
+ uint8_t i;
+
+ TAILQ_REMOVE(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
+ g_raid_config.total_raid_bdev--;
+
+ if (raid_cfg->base_bdev) {
+ for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
+ free(raid_cfg->base_bdev[i].name);
+ }
+ free(raid_cfg->base_bdev);
+ }
+ free(raid_cfg->name);
+ free(raid_cfg);
+}
+
+/*
+ * brief:
+ * raid_bdev_free frees all the raid bdev configurations. This is called on
+ * the module exit path
+ * params:
+ * none
+ * returns:
+ * none
+ */
+static void
+raid_bdev_free(void)
+{
+ struct raid_bdev_config *raid_cfg, *tmp;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_free\n");
+ TAILQ_FOREACH_SAFE(raid_cfg, &g_raid_config.raid_bdev_config_head, link, tmp) {
+ raid_bdev_config_cleanup(raid_cfg);
+ }
+}
+
+/*
+ * brief:
+ * raid_bdev_config_find_by_name is a helper function to find a raid bdev
+ * config by name.
+ * params:
+ * raid_name - name for raid bdev.
+ * returns:
+ * pointer to the matching raid bdev config, or NULL if none is found.
+ */
+struct raid_bdev_config *
+raid_bdev_config_find_by_name(const char *raid_name)
+{
+ struct raid_bdev_config *raid_cfg;
+
+ TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
+ if (!strcmp(raid_cfg->name, raid_name)) {
+ return raid_cfg;
+ }
+ }
+
+	return NULL;
+}
+
+/*
+ * brief
+ * raid_bdev_config_add function adds config for newly created raid bdev.
+ *
+ * params:
+ * raid_name - name for raid bdev.
+ * strip_size - strip size in KB
+ * num_base_bdevs - number of base bdevs.
+ * level - raid level.
+ * _raid_cfg - Pointer to newly added configuration
+ */
+int
+raid_bdev_config_add(const char *raid_name, uint32_t strip_size, uint8_t num_base_bdevs,
+ enum raid_level level, struct raid_bdev_config **_raid_cfg)
+{
+ struct raid_bdev_config *raid_cfg;
+
+ raid_cfg = raid_bdev_config_find_by_name(raid_name);
+ if (raid_cfg != NULL) {
+ SPDK_ERRLOG("Duplicate raid bdev name found in config file %s\n",
+ raid_name);
+ return -EEXIST;
+ }
+
+ if (spdk_u32_is_pow2(strip_size) == false) {
+ SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size);
+ return -EINVAL;
+ }
+
+ if (num_base_bdevs == 0) {
+ SPDK_ERRLOG("Invalid base device count %u\n", num_base_bdevs);
+ return -EINVAL;
+ }
+
+ raid_cfg = calloc(1, sizeof(*raid_cfg));
+ if (raid_cfg == NULL) {
+ SPDK_ERRLOG("unable to allocate memory\n");
+ return -ENOMEM;
+ }
+
+ raid_cfg->name = strdup(raid_name);
+ if (!raid_cfg->name) {
+ free(raid_cfg);
+ SPDK_ERRLOG("unable to allocate memory\n");
+ return -ENOMEM;
+ }
+ raid_cfg->strip_size = strip_size;
+ raid_cfg->num_base_bdevs = num_base_bdevs;
+ raid_cfg->level = level;
+
+ raid_cfg->base_bdev = calloc(num_base_bdevs, sizeof(*raid_cfg->base_bdev));
+ if (raid_cfg->base_bdev == NULL) {
+ free(raid_cfg->name);
+ free(raid_cfg);
+ SPDK_ERRLOG("unable to allocate memory\n");
+ return -ENOMEM;
+ }
+
+ TAILQ_INSERT_TAIL(&g_raid_config.raid_bdev_config_head, raid_cfg, link);
+ g_raid_config.total_raid_bdev++;
+
+ *_raid_cfg = raid_cfg;
+ return 0;
+}
+
+/*
+ * brief:
+ * raid_bdev_config_add_base_bdev function adds a base bdev to the raid bdev config.
+ *
+ * params:
+ * raid_cfg - pointer to raid bdev configuration
+ * base_bdev_name - name of base bdev
+ * slot - Position to add base bdev
+ */
+int
+raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, const char *base_bdev_name,
+ uint8_t slot)
+{
+ uint8_t i;
+ struct raid_bdev_config *tmp;
+
+ if (slot >= raid_cfg->num_base_bdevs) {
+ return -EINVAL;
+ }
+
+ TAILQ_FOREACH(tmp, &g_raid_config.raid_bdev_config_head, link) {
+ for (i = 0; i < tmp->num_base_bdevs; i++) {
+ if (tmp->base_bdev[i].name != NULL) {
+ if (!strcmp(tmp->base_bdev[i].name, base_bdev_name)) {
+ SPDK_ERRLOG("duplicate base bdev name %s mentioned\n",
+ base_bdev_name);
+ return -EEXIST;
+ }
+ }
+ }
+ }
+
+ raid_cfg->base_bdev[slot].name = strdup(base_bdev_name);
+ if (raid_cfg->base_bdev[slot].name == NULL) {
+ SPDK_ERRLOG("unable to allocate memory\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
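+
+/*
+ * Usage sketch (illustrative names and values): creating a two-disk RAID0
+ * config and populating its base bdev slots with the two helpers above:
+ *
+ *   struct raid_bdev_config *cfg;
+ *
+ *   if (raid_bdev_config_add("Raid0", 64, 2, RAID0, &cfg) == 0) {
+ *           raid_bdev_config_add_base_bdev(cfg, "Nvme0n1", 0);
+ *           raid_bdev_config_add_base_bdev(cfg, "Nvme1n1", 1);
+ *   }
+ */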
+
+static struct {
+ const char *name;
+ enum raid_level value;
+} g_raid_level_names[] = {
+ { "raid0", RAID0 },
+ { "0", RAID0 },
+ { "raid5", RAID5 },
+ { "5", RAID5 },
+ { }
+};
+
+enum raid_level
+raid_bdev_parse_raid_level(const char *str)
+{
+ unsigned int i;
+
+ assert(str != NULL);
+
+ for (i = 0; g_raid_level_names[i].name != NULL; i++) {
+ if (strcasecmp(g_raid_level_names[i].name, str) == 0) {
+ return g_raid_level_names[i].value;
+ }
+ }
+
+ return INVALID_RAID_LEVEL;
+}
+
+const char *
+raid_bdev_level_to_str(enum raid_level level)
+{
+ unsigned int i;
+
+ for (i = 0; g_raid_level_names[i].name != NULL; i++) {
+ if (g_raid_level_names[i].value == level) {
+ return g_raid_level_names[i].name;
+ }
+ }
+
+ return "";
+}
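+
+/*
+ * Illustrative behavior of the two helpers above (both the "raidN" and the
+ * bare "N" spellings are accepted, case-insensitively):
+ *
+ *   raid_bdev_parse_raid_level("raid0") == RAID0
+ *   raid_bdev_parse_raid_level("5")     == RAID5
+ *   raid_bdev_parse_raid_level("foo")   == INVALID_RAID_LEVEL
+ *   raid_bdev_level_to_str(RAID5)       returns "raid5"
+ */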
+
+/*
+ * brief:
+ * raid_bdev_parse_raid is used to parse the raid bdev from config file based on
+ * pre-defined raid bdev format in config file.
+ * Format of config file:
+ * [RAID1]
+ * Name raid1
+ * StripSize 64
+ * NumDevices 2
+ * RaidLevel 0
+ * Devices Nvme0n1 Nvme1n1
+ *
+ * [RAID2]
+ * Name raid2
+ * StripSize 64
+ * NumDevices 3
+ * RaidLevel 0
+ * Devices Nvme2n1 Nvme3n1 Nvme4n1
+ *
+ * params:
+ * conf_section - pointer to config section
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_parse_raid(struct spdk_conf_section *conf_section)
+{
+ const char *raid_name;
+ uint32_t strip_size;
+ uint8_t num_base_bdevs;
+ const char *raid_level_str;
+ enum raid_level level;
+ const char *base_bdev_name;
+ struct raid_bdev_config *raid_cfg;
+ int rc, i, val;
+
+ raid_name = spdk_conf_section_get_val(conf_section, "Name");
+ if (raid_name == NULL) {
+ SPDK_ERRLOG("raid_name is null\n");
+ return -EINVAL;
+ }
+
+ val = spdk_conf_section_get_intval(conf_section, "StripSize");
+ if (val < 0) {
+ return -EINVAL;
+ }
+ strip_size = val;
+
+ val = spdk_conf_section_get_intval(conf_section, "NumDevices");
+ if (val < 0) {
+ return -EINVAL;
+ }
+ num_base_bdevs = val;
+
+ raid_level_str = spdk_conf_section_get_val(conf_section, "RaidLevel");
+ if (raid_level_str == NULL) {
+ SPDK_ERRLOG("Missing RaidLevel\n");
+ return -EINVAL;
+ }
+ level = raid_bdev_parse_raid_level(raid_level_str);
+ if (level == INVALID_RAID_LEVEL) {
+ SPDK_ERRLOG("Invalid RaidLevel\n");
+ return -EINVAL;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "%s %" PRIu32 " %u %u\n",
+ raid_name, strip_size, num_base_bdevs, level);
+
+ rc = raid_bdev_config_add(raid_name, strip_size, num_base_bdevs, level,
+ &raid_cfg);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to add raid bdev config\n");
+ return rc;
+ }
+
+ for (i = 0; true; i++) {
+ base_bdev_name = spdk_conf_section_get_nmval(conf_section, "Devices", 0, i);
+ if (base_bdev_name == NULL) {
+ break;
+ }
+ if (i >= num_base_bdevs) {
+ raid_bdev_config_cleanup(raid_cfg);
+ SPDK_ERRLOG("Number of devices mentioned is more than count\n");
+ return -EINVAL;
+ }
+
+ rc = raid_bdev_config_add_base_bdev(raid_cfg, base_bdev_name, i);
+ if (rc != 0) {
+ raid_bdev_config_cleanup(raid_cfg);
+ SPDK_ERRLOG("Failed to add base bdev to raid bdev config\n");
+ return rc;
+ }
+ }
+
+ if (i != raid_cfg->num_base_bdevs) {
+ raid_bdev_config_cleanup(raid_cfg);
+ SPDK_ERRLOG("Number of devices mentioned is less than count\n");
+ return -EINVAL;
+ }
+
+ rc = raid_bdev_create(raid_cfg);
+ if (rc != 0) {
+ raid_bdev_config_cleanup(raid_cfg);
+ SPDK_ERRLOG("Failed to create raid bdev\n");
+ return rc;
+ }
+
+ rc = raid_bdev_add_base_devices(raid_cfg);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to add any base bdev to raid bdev\n");
+ /* Config is not removed in this case. */
+ }
+
+ return 0;
+}
+
+/*
+ * brief:
+ * raid_bdev_parse_config is used to find the raid bdev config sections and
+ * parse them
+ * params:
+ * none
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_parse_config(void)
+{
+ int ret;
+ struct spdk_conf_section *conf_section;
+
+ conf_section = spdk_conf_first_section(NULL);
+ while (conf_section != NULL) {
+ if (spdk_conf_section_match_prefix(conf_section, "RAID")) {
+ ret = raid_bdev_parse_raid(conf_section);
+ if (ret < 0) {
+ SPDK_ERRLOG("Unable to parse raid bdev section\n");
+ return ret;
+ }
+ }
+ conf_section = spdk_conf_next_section(conf_section);
+ }
+
+ return 0;
+}
+
+/*
+ * brief:
+ * raid_bdev_fini_start is called when bdev layer is starting the
+ * shutdown process
+ * params:
+ * none
+ * returns:
+ * none
+ */
+static void
+raid_bdev_fini_start(void)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_fini_start\n");
+ g_shutdown_started = true;
+}
+
+/*
+ * brief:
+ * raid_bdev_exit is called on raid bdev module exit time by bdev layer
+ * params:
+ * none
+ * returns:
+ * none
+ */
+static void
+raid_bdev_exit(void)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_exit\n");
+ raid_bdev_free();
+}
+
+/*
+ * brief:
+ * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid
+ * module
+ * params:
+ * none
+ * returns:
+ * size of spdk_bdev_io context for raid
+ */
+static int
+raid_bdev_get_ctx_size(void)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_get_ctx_size\n");
+ return sizeof(struct raid_bdev_io);
+}
+
+/*
+ * brief:
+ * raid_bdev_get_running_config is used to get the configuration options.
+ *
+ * params:
+ * fp - file pointer to which the configuration options will be written.
+ * returns:
+ * none
+ */
+static void
+raid_bdev_get_running_config(FILE *fp)
+{
+ struct raid_bdev *raid_bdev;
+ struct raid_base_bdev_info *base_info;
+ int index = 1;
+
+ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configured_list, state_link) {
+ fprintf(fp,
+ "\n"
+ "[RAID%d]\n"
+ " Name %s\n"
+ " StripSize %" PRIu32 "\n"
+ " NumDevices %u\n"
+ " RaidLevel %s\n",
+ index, raid_bdev->bdev.name, raid_bdev->strip_size_kb,
+ raid_bdev->num_base_bdevs,
+ raid_bdev_level_to_str(raid_bdev->level));
+ fprintf(fp,
+ " Devices ");
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+ if (base_info->bdev) {
+ fprintf(fp,
+ "%s ",
+ base_info->bdev->name);
+ }
+ }
+ fprintf(fp,
+ "\n");
+ index++;
+ }
+}
+
+/*
+ * brief:
+ * raid_bdev_can_claim_bdev is the function to check if this base_bdev can be
+ * claimed by raid bdev or not.
+ * params:
+ * bdev_name - represents base bdev name
+ * _raid_cfg - pointer to raid bdev config parsed from config file
+ * base_bdev_slot - if bdev can be claimed, it represents the base_bdev correct
+ * slot. This field is only valid if return value of this function is true
+ * returns:
+ * true - if bdev can be claimed
+ * false - if bdev can't be claimed
+ */
+static bool
+raid_bdev_can_claim_bdev(const char *bdev_name, struct raid_bdev_config **_raid_cfg,
+ uint8_t *base_bdev_slot)
+{
+ struct raid_bdev_config *raid_cfg;
+ uint8_t i;
+
+ TAILQ_FOREACH(raid_cfg, &g_raid_config.raid_bdev_config_head, link) {
+ for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
+ /*
+ * Check if the base bdev name is part of raid bdev configuration.
+ * If match is found then return true and the slot information where
+ * this base bdev should be inserted in raid bdev
+ */
+ if (!strcmp(bdev_name, raid_cfg->base_bdev[i].name)) {
+ *_raid_cfg = raid_cfg;
+ *base_bdev_slot = i;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+
+static struct spdk_bdev_module g_raid_if = {
+ .name = "raid",
+ .module_init = raid_bdev_init,
+ .fini_start = raid_bdev_fini_start,
+ .module_fini = raid_bdev_exit,
+ .get_ctx_size = raid_bdev_get_ctx_size,
+ .examine_config = raid_bdev_examine,
+ .config_text = raid_bdev_get_running_config,
+ .async_init = false,
+ .async_fini = false,
+};
+SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if)
+
+/*
+ * brief:
+ * raid_bdev_init is the initialization function for raid bdev module
+ * params:
+ * none
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_init(void)
+{
+ int ret;
+
+ /* Parse config file for raids */
+ ret = raid_bdev_parse_config();
+ if (ret < 0) {
+ SPDK_ERRLOG("raid bdev init failed parsing\n");
+ raid_bdev_free();
+ return ret;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_init completed successfully\n");
+
+ return 0;
+}
+
+/*
+ * brief:
+ * raid_bdev_create allocates raid bdev based on passed configuration
+ * params:
+ * raid_cfg - configuration of raid bdev
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+int
+raid_bdev_create(struct raid_bdev_config *raid_cfg)
+{
+ struct raid_bdev *raid_bdev;
+ struct spdk_bdev *raid_bdev_gen;
+ struct raid_bdev_module *module;
+
+ module = raid_bdev_module_find(raid_cfg->level);
+ if (module == NULL) {
+ SPDK_ERRLOG("Unsupported raid level '%d'\n", raid_cfg->level);
+ return -EINVAL;
+ }
+
+ assert(module->base_bdevs_min != 0);
+ if (raid_cfg->num_base_bdevs < module->base_bdevs_min) {
+ SPDK_ERRLOG("At least %u base devices required for %s\n",
+ module->base_bdevs_min,
+ raid_bdev_level_to_str(raid_cfg->level));
+ return -EINVAL;
+ }
+
+ raid_bdev = calloc(1, sizeof(*raid_bdev));
+ if (!raid_bdev) {
+ SPDK_ERRLOG("Unable to allocate memory for raid bdev\n");
+ return -ENOMEM;
+ }
+
+ raid_bdev->module = module;
+ raid_bdev->num_base_bdevs = raid_cfg->num_base_bdevs;
+ raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs,
+ sizeof(struct raid_base_bdev_info));
+ if (!raid_bdev->base_bdev_info) {
+ SPDK_ERRLOG("Unable able to allocate base bdev info\n");
+ free(raid_bdev);
+ return -ENOMEM;
+ }
+
+ /* strip_size_kb is from the rpc param. strip_size is in blocks and used
+ * internally and set later.
+ */
+ raid_bdev->strip_size = 0;
+ raid_bdev->strip_size_kb = raid_cfg->strip_size;
+ raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
+ raid_bdev->config = raid_cfg;
+ raid_bdev->level = raid_cfg->level;
+
+ raid_bdev_gen = &raid_bdev->bdev;
+
+ raid_bdev_gen->name = strdup(raid_cfg->name);
+ if (!raid_bdev_gen->name) {
+ SPDK_ERRLOG("Unable to allocate name for raid\n");
+ free(raid_bdev->base_bdev_info);
+ free(raid_bdev);
+ return -ENOMEM;
+ }
+
+ raid_bdev_gen->product_name = "Raid Volume";
+ raid_bdev_gen->ctxt = raid_bdev;
+ raid_bdev_gen->fn_table = &g_raid_bdev_fn_table;
+ raid_bdev_gen->module = &g_raid_if;
+ raid_bdev_gen->write_cache = 0;
+
+ TAILQ_INSERT_TAIL(&g_raid_bdev_configuring_list, raid_bdev, state_link);
+ TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link);
+
+ raid_cfg->raid_bdev = raid_bdev;
+
+ return 0;
+}
+
+/*
+ * brief
+ * raid_bdev_alloc_base_bdev_resource allocates resource of base bdev.
+ * params:
+ * raid_bdev - pointer to raid bdev
+ * bdev - pointer to base bdev
+ * base_bdev_slot - position to add base bdev
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_alloc_base_bdev_resource(struct raid_bdev *raid_bdev, struct spdk_bdev *bdev,
+ uint8_t base_bdev_slot)
+{
+ struct spdk_bdev_desc *desc;
+ int rc;
+
+ rc = spdk_bdev_open(bdev, true, raid_bdev_remove_base_bdev, bdev, &desc);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", bdev->name);
+ return rc;
+ }
+
+ rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n");
+ spdk_bdev_close(desc);
+ return rc;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s is claimed\n", bdev->name);
+
+ assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE);
+ assert(base_bdev_slot < raid_bdev->num_base_bdevs);
+
+ raid_bdev->base_bdev_info[base_bdev_slot].thread = spdk_get_thread();
+ raid_bdev->base_bdev_info[base_bdev_slot].bdev = bdev;
+ raid_bdev->base_bdev_info[base_bdev_slot].desc = desc;
+ raid_bdev->num_base_bdevs_discovered++;
+ assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
+
+ return 0;
+}
+
+/*
+ * brief:
+ * Register the raid bdev with the bdev layer only once the raid bdev config
+ * is complete, then move this raid bdev from the configuring list to the
+ * configured list
+ * params:
+ * raid_bdev - pointer to raid bdev
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_configure(struct raid_bdev *raid_bdev)
+{
+ uint32_t blocklen = 0;
+ struct spdk_bdev *raid_bdev_gen;
+ struct raid_base_bdev_info *base_info;
+ int rc = 0;
+
+ assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING);
+ assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs);
+
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+		/* Check that the blocklen is the same for all base bdevs */
+ if (blocklen == 0) {
+ blocklen = base_info->bdev->blocklen;
+ } else if (blocklen != base_info->bdev->blocklen) {
+ /*
+ * Assumption is that all the base bdevs for any raid bdev should
+ * have same blocklen
+ */
+ SPDK_ERRLOG("Blocklen of various bdevs not matching\n");
+ return -EINVAL;
+ }
+ }
+ assert(blocklen > 0);
+
+ /* The strip_size_kb is read in from user in KB. Convert to blocks here for
+ * internal use.
+ */
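+	/* For example (hypothetical values): strip_size_kb = 64 with
+	 * blocklen = 512 gives strip_size = 64 * 1024 / 512 = 128 blocks
+	 * and strip_size_shift = 7.
+	 */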
+ raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen;
+ raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size);
+ raid_bdev->blocklen_shift = spdk_u32log2(blocklen);
+
+ raid_bdev_gen = &raid_bdev->bdev;
+ raid_bdev_gen->blocklen = blocklen;
+
+ rc = raid_bdev->module->start(raid_bdev);
+ if (rc != 0) {
+ SPDK_ERRLOG("raid module startup callback failed\n");
+ return rc;
+ }
+ raid_bdev->state = RAID_BDEV_STATE_ONLINE;
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "io device register %p\n", raid_bdev);
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "blockcnt %lu, blocklen %u\n",
+ raid_bdev_gen->blockcnt, raid_bdev_gen->blocklen);
+ spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb,
+ sizeof(struct raid_bdev_io_channel),
+ raid_bdev->bdev.name);
+ rc = spdk_bdev_register(raid_bdev_gen);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n");
+ if (raid_bdev->module->stop != NULL) {
+ raid_bdev->module->stop(raid_bdev);
+ }
+ spdk_io_device_unregister(raid_bdev, NULL);
+ raid_bdev->state = RAID_BDEV_STATE_CONFIGURING;
+ return rc;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev generic %p\n", raid_bdev_gen);
+ TAILQ_REMOVE(&g_raid_bdev_configuring_list, raid_bdev, state_link);
+ TAILQ_INSERT_TAIL(&g_raid_bdev_configured_list, raid_bdev, state_link);
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev is created with name %s, raid_bdev %p\n",
+ raid_bdev_gen->name, raid_bdev);
+
+ return 0;
+}
+
+/*
+ * brief:
+ * If raid bdev is online and registered, change the bdev state to
+ * configuring and unregister this raid device. Queue this raid device
+ * in configuring list
+ * params:
+ * raid_bdev - pointer to raid bdev
+ * cb_fn - callback function
+ * cb_arg - argument to callback function
+ * returns:
+ * none
+ */
+static void
+raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn,
+ void *cb_arg)
+{
+ if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) {
+ if (cb_fn) {
+ cb_fn(cb_arg, 0);
+ }
+ return;
+ }
+
+ assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered);
+ TAILQ_REMOVE(&g_raid_bdev_configured_list, raid_bdev, state_link);
+ if (raid_bdev->module->stop != NULL) {
+ raid_bdev->module->stop(raid_bdev);
+ }
+ raid_bdev->state = RAID_BDEV_STATE_OFFLINE;
+ assert(raid_bdev->num_base_bdevs_discovered);
+ TAILQ_INSERT_TAIL(&g_raid_bdev_offline_list, raid_bdev, state_link);
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev state chaning from online to offline\n");
+
+ spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg);
+}
+
+/*
+ * brief:
+ * raid_bdev_find_by_base_bdev function finds the raid bdev which has
+ * claimed the base bdev.
+ * params:
+ * base_bdev - pointer to the base bdev
+ * _raid_bdev - Reference to pointer to raid bdev
+ * _base_info - Reference to the raid base bdev info.
+ * returns:
+ * true - if the raid bdev is found.
+ * false - if the raid bdev is not found.
+ */
+static bool
+raid_bdev_find_by_base_bdev(struct spdk_bdev *base_bdev, struct raid_bdev **_raid_bdev,
+ struct raid_base_bdev_info **_base_info)
+{
+ struct raid_bdev *raid_bdev;
+ struct raid_base_bdev_info *base_info;
+
+ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+ if (base_info->bdev == base_bdev) {
+ *_raid_bdev = raid_bdev;
+ *_base_info = base_info;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/*
+ * brief:
+ * raid_bdev_remove_base_bdev function is called by below layers when base_bdev
+ * is removed. This function checks if this base bdev is part of any raid bdev
+ * or not. If yes, it takes necessary action on that particular raid bdev.
+ * params:
+ * ctx - pointer to the base bdev which got removed
+ * returns:
+ * none
+ */
+static void
+raid_bdev_remove_base_bdev(void *ctx)
+{
+ struct spdk_bdev *base_bdev = ctx;
+ struct raid_bdev *raid_bdev = NULL;
+ struct raid_base_bdev_info *base_info;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_bdev\n");
+
+ /* Find the raid_bdev which has claimed this base_bdev */
+ if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_info)) {
+ SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name);
+ return;
+ }
+
+ assert(base_info->desc);
+ base_info->remove_scheduled = true;
+
+ if (raid_bdev->destruct_called == true ||
+ raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
+ /*
+		 * The raid bdev is either not registered yet or already
+		 * unregistered, so the cleanup must be done here.
+ */
+ raid_bdev_free_base_bdev_resource(raid_bdev, base_info);
+ if (raid_bdev->num_base_bdevs_discovered == 0) {
+ /* There is no base bdev for this raid, so free the raid device. */
+ raid_bdev_cleanup(raid_bdev);
+ return;
+ }
+ }
+
+ raid_bdev_deconfigure(raid_bdev, NULL, NULL);
+}
+
+/*
+ * brief:
+ * Remove base bdevs from the raid bdev one by one. Skip any base bdev which
+ * doesn't exist.
+ * params:
+ * raid_cfg - pointer to raid bdev config.
+ * cb_fn - callback function
+ * cb_arg - argument to callback function
+ */
+void
+raid_bdev_remove_base_devices(struct raid_bdev_config *raid_cfg,
+ raid_bdev_destruct_cb cb_fn, void *cb_arg)
+{
+ struct raid_bdev *raid_bdev;
+ struct raid_base_bdev_info *base_info;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_devices\n");
+
+ raid_bdev = raid_cfg->raid_bdev;
+ if (raid_bdev == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev %s doesn't exist now\n", raid_cfg->name);
+ if (cb_fn) {
+ cb_fn(cb_arg, 0);
+ }
+ return;
+ }
+
+ if (raid_bdev->destroy_started) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "destroying raid bdev %s is already started\n",
+ raid_cfg->name);
+ if (cb_fn) {
+ cb_fn(cb_arg, -EALREADY);
+ }
+ return;
+ }
+
+ raid_bdev->destroy_started = true;
+
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+ if (base_info->bdev == NULL) {
+ continue;
+ }
+
+ assert(base_info->desc);
+ base_info->remove_scheduled = true;
+
+ if (raid_bdev->destruct_called == true ||
+ raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) {
+ /*
+			 * The raid bdev is either not registered yet or already
+			 * unregistered, so the cleanup must be done here.
+ */
+ raid_bdev_free_base_bdev_resource(raid_bdev, base_info);
+ if (raid_bdev->num_base_bdevs_discovered == 0) {
+ /* There is no base bdev for this raid, so free the raid device. */
+ raid_bdev_cleanup(raid_bdev);
+ if (cb_fn) {
+ cb_fn(cb_arg, 0);
+ }
+ return;
+ }
+ }
+ }
+
+ raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg);
+}
+
+/*
+ * brief:
+ * raid_bdev_add_base_device function adds the base device to an existing raid
+ * bdev and, once all base devices have been discovered, configures the raid
+ * bdev. It also claims the base device and keeps its descriptor open.
+ * params:
+ * raid_cfg - pointer to raid bdev config
+ * bdev - pointer to base bdev
+ * base_bdev_slot - position to add base bdev
+ * returns:
+ * 0 - success
+ * non zero - failure
+ */
+static int
+raid_bdev_add_base_device(struct raid_bdev_config *raid_cfg, struct spdk_bdev *bdev,
+ uint8_t base_bdev_slot)
+{
+ struct raid_bdev *raid_bdev;
+ int rc;
+
+ raid_bdev = raid_cfg->raid_bdev;
+ if (!raid_bdev) {
+ SPDK_ERRLOG("Raid bdev '%s' is not created yet\n", raid_cfg->name);
+ return -ENODEV;
+ }
+
+ rc = raid_bdev_alloc_base_bdev_resource(raid_bdev, bdev, base_bdev_slot);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", bdev->name);
+ return rc;
+ }
+
+ assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs);
+
+ if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) {
+ rc = raid_bdev_configure(raid_bdev);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to configure raid bdev\n");
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * brief:
+ * Add base bdevs to the raid bdev one by one. Skip any base bdev which doesn't
+ * exist or fails to add. If all base bdevs are successfully added, the raid bdev
+ * moves to the configured state and becomes available. Otherwise, the raid bdev
+ * stays at the configuring state with added base bdevs.
+ * params:
+ * raid_cfg - pointer to raid bdev config
+ * returns:
+ * 0 - The raid bdev moves to the configured state or stays at the configuring
+ * state with added base bdevs due to any nonexistent base bdev.
+ * non zero - Failed to add any base bdev and stays at the configuring state with
+ * added base bdevs.
+ */
+int
+raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg)
+{
+ struct spdk_bdev *base_bdev;
+ uint8_t i;
+ int rc = 0, _rc;
+
+ for (i = 0; i < raid_cfg->num_base_bdevs; i++) {
+ base_bdev = spdk_bdev_get_by_name(raid_cfg->base_bdev[i].name);
+ if (base_bdev == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "base bdev %s doesn't exist now\n",
+ raid_cfg->base_bdev[i].name);
+ continue;
+ }
+
+ _rc = raid_bdev_add_base_device(raid_cfg, base_bdev, i);
+ if (_rc != 0) {
+ SPDK_ERRLOG("Failed to add base bdev %s to RAID bdev %s: %s\n",
+ raid_cfg->base_bdev[i].name, raid_cfg->name,
+ spdk_strerror(-_rc));
+ if (rc == 0) {
+ rc = _rc;
+ }
+ }
+ }
+
+ return rc;
+}
+
+/*
+ * brief:
+ * raid_bdev_examine function is the examine callback invoked by the lower
+ * layers, e.g. the bdev_nvme layer. This function checks whether this base
+ * bdev can be claimed by a raid bdev or not.
+ * params:
+ * bdev - pointer to base bdev
+ * returns:
+ * none
+ */
+static void
+raid_bdev_examine(struct spdk_bdev *bdev)
+{
+ struct raid_bdev_config *raid_cfg;
+ uint8_t base_bdev_slot;
+
+ if (raid_bdev_can_claim_bdev(bdev->name, &raid_cfg, &base_bdev_slot)) {
+ raid_bdev_add_base_device(raid_cfg, bdev, base_bdev_slot);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s can't be claimed\n",
+ bdev->name);
+ }
+
+ spdk_bdev_module_examine_done(&g_raid_if);
+}
+
+/* Log component for bdev raid bdev module */
+SPDK_LOG_REGISTER_COMPONENT("bdev_raid", SPDK_LOG_BDEV_RAID)
diff --git a/src/spdk/module/bdev/raid/bdev_raid.h b/src/spdk/module/bdev/raid/bdev_raid.h
new file mode 100644
index 000000000..4acca1da6
--- /dev/null
+++ b/src/spdk/module/bdev/raid/bdev_raid.h
@@ -0,0 +1,319 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_RAID_INTERNAL_H
+#define SPDK_BDEV_RAID_INTERNAL_H
+
+#include "spdk/bdev_module.h"
+
+enum raid_level {
+ INVALID_RAID_LEVEL = -1,
+ RAID0 = 0,
+ RAID5 = 5,
+};
+
+/*
+ * Raid state describes the state of the raid. This raid bdev can be either in
+ * configured list or configuring list
+ */
+enum raid_bdev_state {
+ /* raid bdev is ready and is seen by upper layers */
+ RAID_BDEV_STATE_ONLINE,
+
+ /*
+ * raid bdev is configuring, not all underlying bdevs are present.
+ * And can't be seen by upper layers.
+ */
+ RAID_BDEV_STATE_CONFIGURING,
+
+ /*
+ * In offline state, raid bdev layer will complete all incoming commands without
+ * submitting to underlying base nvme bdevs
+ */
+ RAID_BDEV_STATE_OFFLINE,
+
+ /* raid bdev max, new states should be added before this */
+ RAID_BDEV_MAX
+};
+
+/*
+ * raid_base_bdev_info contains information for the base bdevs which are part of some
+ * raid. This structure contains the per base bdev information. Whatever is
+ * required per base device for raid bdev will be kept here
+ */
+struct raid_base_bdev_info {
+ /* pointer to base spdk bdev */
+ struct spdk_bdev *bdev;
+
+ /* pointer to base bdev descriptor opened by raid bdev */
+ struct spdk_bdev_desc *desc;
+
+ /*
+ * When underlying base device calls the hot plug function on drive removal,
+ * this flag will be set and later after doing some processing, base device
+ * descriptor will be closed
+ */
+ bool remove_scheduled;
+
+ /* thread where base device is opened */
+ struct spdk_thread *thread;
+};
+
+/*
+ * raid_bdev_io is the context part of bdev_io. It contains the information
+ * related to bdev_io for a raid bdev
+ */
+struct raid_bdev_io {
+ /* The raid bdev associated with this IO */
+ struct raid_bdev *raid_bdev;
+
+ /* WaitQ entry, used only in waitq logic */
+ struct spdk_bdev_io_wait_entry waitq_entry;
+
+ /* Context of the original channel for this IO */
+ struct raid_bdev_io_channel *raid_ch;
+
+ /* Used for tracking progress on io requests sent to member disks. */
+ uint64_t base_bdev_io_remaining;
+ uint8_t base_bdev_io_submitted;
+ uint8_t base_bdev_io_status;
+};
+
+/*
+ * raid_bdev is the single entity structure which contains SPDK block device
+ * and the information related to any raid bdev either configured or
+ * in configuring list. io device is created on this.
+ */
+struct raid_bdev {
+ /* raid bdev device, this will get registered in bdev layer */
+ struct spdk_bdev bdev;
+
+ /* link of raid bdev to link it to configured, configuring or offline list */
+ TAILQ_ENTRY(raid_bdev) state_link;
+
+ /* link of raid bdev to link it to global raid bdev list */
+ TAILQ_ENTRY(raid_bdev) global_link;
+
+ /* pointer to config file entry */
+ struct raid_bdev_config *config;
+
+ /* array of base bdev info */
+ struct raid_base_bdev_info *base_bdev_info;
+
+ /* strip size of raid bdev in blocks */
+ uint32_t strip_size;
+
+ /* strip size of raid bdev in KB */
+ uint32_t strip_size_kb;
+
+ /* strip size bit shift for optimized calculation */
+ uint32_t strip_size_shift;
+
+ /* block length bit shift for optimized calculation */
+ uint32_t blocklen_shift;
+
+ /* state of raid bdev */
+ enum raid_bdev_state state;
+
+ /* number of base bdevs comprising raid bdev */
+ uint8_t num_base_bdevs;
+
+ /* number of base bdevs discovered */
+ uint8_t num_base_bdevs_discovered;
+
+ /* Raid Level of this raid bdev */
+ enum raid_level level;
+
+ /* Set to true if destruct is called for this raid bdev */
+ bool destruct_called;
+
+ /* Set to true if destroy of this raid bdev is started. */
+ bool destroy_started;
+
+ /* Module for RAID-level specific operations */
+ struct raid_bdev_module *module;
+
+ /* Private data for the raid module */
+ void *module_private;
+};
+
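+/*
+ * Iterate 'i' (a struct raid_base_bdev_info pointer) over all base bdev info
+ * slots of raid bdev 'r'.
+ */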
+#define RAID_FOR_EACH_BASE_BDEV(r, i) \
+ for (i = r->base_bdev_info; i < r->base_bdev_info + r->num_base_bdevs; i++)
+
+/*
+ * raid_base_bdev_config is the per base bdev data structure which contains
+ * information w.r.t to per base bdev during parsing config
+ */
+struct raid_base_bdev_config {
+ /* base bdev name from config file */
+ char *name;
+};
+
+/*
+ * raid_bdev_config contains the raid bdev config related information after
+ * parsing the config file
+ */
+struct raid_bdev_config {
+ /* base bdev config per underlying bdev */
+ struct raid_base_bdev_config *base_bdev;
+
+ /* Points to already created raid bdev */
+ struct raid_bdev *raid_bdev;
+
+ char *name;
+
+	/* strip size of this raid bdev in kilobytes */
+ uint32_t strip_size;
+
+ /* number of base bdevs */
+ uint8_t num_base_bdevs;
+
+ /* raid level */
+ enum raid_level level;
+
+ TAILQ_ENTRY(raid_bdev_config) link;
+};
+
+/*
+ * raid_config is the top level structure representing the raid bdev config as read
+ * from config file for all raids
+ */
+struct raid_config {
+ /* raid bdev context from config file */
+ TAILQ_HEAD(, raid_bdev_config) raid_bdev_config_head;
+
+ /* total raid bdev from config file */
+ uint8_t total_raid_bdev;
+};
+
+/*
+ * raid_bdev_io_channel is the context of spdk_io_channel for raid bdev device. It
+ * contains the relationship of raid bdev io channel with base bdev io channels.
+ */
+struct raid_bdev_io_channel {
+ /* Array of IO channels of base bdevs */
+ struct spdk_io_channel **base_channel;
+
+ /* Number of IO channels */
+ uint8_t num_channels;
+};
+
+/* TAIL heads for various raid bdev lists */
+TAILQ_HEAD(raid_configured_tailq, raid_bdev);
+TAILQ_HEAD(raid_configuring_tailq, raid_bdev);
+TAILQ_HEAD(raid_all_tailq, raid_bdev);
+TAILQ_HEAD(raid_offline_tailq, raid_bdev);
+
+extern struct raid_configured_tailq g_raid_bdev_configured_list;
+extern struct raid_configuring_tailq g_raid_bdev_configuring_list;
+extern struct raid_all_tailq g_raid_bdev_list;
+extern struct raid_offline_tailq g_raid_bdev_offline_list;
+extern struct raid_config g_raid_config;
+
+typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc);
+
+int raid_bdev_create(struct raid_bdev_config *raid_cfg);
+int raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg);
+void raid_bdev_remove_base_devices(struct raid_bdev_config *raid_cfg,
+ raid_bdev_destruct_cb cb_fn, void *cb_ctx);
+int raid_bdev_config_add(const char *raid_name, uint32_t strip_size, uint8_t num_base_bdevs,
+ enum raid_level level, struct raid_bdev_config **_raid_cfg);
+int raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg,
+ const char *base_bdev_name, uint8_t slot);
+void raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg);
+struct raid_bdev_config *raid_bdev_config_find_by_name(const char *raid_name);
+enum raid_level raid_bdev_parse_raid_level(const char *str);
+const char *raid_bdev_level_to_str(enum raid_level level);
+
+/*
+ * RAID module descriptor
+ */
+struct raid_bdev_module {
+ /* RAID level implemented by this module */
+ enum raid_level level;
+
+ /* Minimum required number of base bdevs. Must be > 0. */
+ uint8_t base_bdevs_min;
+
+ /*
+ * Maximum number of base bdevs that can be removed without failing
+ * the array.
+ */
+ uint8_t base_bdevs_max_degraded;
+
+ /*
+ * Called when the raid is starting, right before changing the state to
+ * online and registering the bdev. Parameters of the bdev like blockcnt
+ * should be set here.
+ *
+ * Non-zero return value will abort the startup process.
+ */
+ int (*start)(struct raid_bdev *raid_bdev);
+
+ /*
+ * Called when the raid is stopping, right before changing the state to
+ * offline and unregistering the bdev. Optional.
+ */
+ void (*stop)(struct raid_bdev *raid_bdev);
+
+ /* Handler for R/W requests */
+ void (*submit_rw_request)(struct raid_bdev_io *raid_io);
+
+ /* Handler for requests without payload (flush, unmap). Optional. */
+ void (*submit_null_payload_request)(struct raid_bdev_io *raid_io);
+
+ TAILQ_ENTRY(raid_bdev_module) link;
+};
+
+void raid_bdev_module_list_add(struct raid_bdev_module *raid_module);
+
+#define __RAID_MODULE_REGISTER(line) __RAID_MODULE_REGISTER_(line)
+#define __RAID_MODULE_REGISTER_(line) raid_module_register_##line
+
+#define RAID_MODULE_REGISTER(_module) \
+__attribute__((constructor)) static void \
+__RAID_MODULE_REGISTER(__LINE__)(void) \
+{ \
+ raid_bdev_module_list_add(_module); \
+}
+
+bool
+raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed,
+ enum spdk_bdev_io_status status);
+void
+raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev,
+ struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn);
+void
+raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status);
+
+#endif /* SPDK_BDEV_RAID_INTERNAL_H */
diff --git a/src/spdk/module/bdev/raid/bdev_raid_rpc.c b/src/spdk/module/bdev/raid/bdev_raid_rpc.c
new file mode 100644
index 000000000..1c2d070c3
--- /dev/null
+++ b/src/spdk/module/bdev/raid/bdev_raid_rpc.c
@@ -0,0 +1,452 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/bdev.h"
+#include "bdev_raid.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+#include "spdk/env.h"
+
+#define RPC_MAX_BASE_BDEVS 255
+
+SPDK_LOG_REGISTER_COMPONENT("raidrpc", SPDK_LOG_RAID_RPC)
+
+/*
+ * Input structure for bdev_raid_get_bdevs RPC
+ */
+struct rpc_bdev_raid_get_bdevs {
+ /* category - all or online or configuring or offline */
+ char *category;
+};
+
+/*
+ * brief:
+ * free_rpc_bdev_raid_get_bdevs function frees RPC bdev_raid_get_bdevs related parameters
+ * params:
+ * req - pointer to RPC request
+ * returns:
+ * none
+ */
+static void
+free_rpc_bdev_raid_get_bdevs(struct rpc_bdev_raid_get_bdevs *req)
+{
+ free(req->category);
+}
+
+/*
+ * Decoder object for RPC bdev_raid_get_bdevs
+ */
+static const struct spdk_json_object_decoder rpc_bdev_raid_get_bdevs_decoders[] = {
+ {"category", offsetof(struct rpc_bdev_raid_get_bdevs, category), spdk_json_decode_string},
+};
+
+/*
+ * brief:
+ * rpc_bdev_raid_get_bdevs function is the handler for the bdev_raid_get_bdevs RPC.
+ * It lists raid bdev names in the requested category, which must be one of
+ * "all", "online", "configuring" or "offline". "all" means every raid bdev,
+ * whether online, configuring or offline. "online" is a raid bdev that is
+ * registered with the bdev layer. "configuring" is a raid bdev whose full
+ * configuration has not been discovered yet. "offline" is a raid bdev that is
+ * currently not registered with the bdev layer because it encountered an error
+ * or the user requested to take it offline.
+ * params:
+ * request - pointer to json rpc request
+ * params - pointer to request parameters
+ * returns:
+ * none
+ */
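+/*
+ * Example request (illustrative; the requested category is hypothetical):
+ *
+ *     {"jsonrpc": "2.0", "method": "bdev_raid_get_bdevs", "id": 1,
+ *      "params": {"category": "online"}}
+ *
+ * The result is a JSON array of raid bdev names in that category.
+ */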
+static void
+rpc_bdev_raid_get_bdevs(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_raid_get_bdevs req = {};
+ struct spdk_json_write_ctx *w;
+ struct raid_bdev *raid_bdev;
+
+ if (spdk_json_decode_object(params, rpc_bdev_raid_get_bdevs_decoders,
+ SPDK_COUNTOF(rpc_bdev_raid_get_bdevs_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ if (!(strcmp(req.category, "all") == 0 ||
+ strcmp(req.category, "online") == 0 ||
+ strcmp(req.category, "configuring") == 0 ||
+ strcmp(req.category, "offline") == 0)) {
+ spdk_jsonrpc_send_error_response(request, -EINVAL, spdk_strerror(EINVAL));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ /* Get raid bdev list based on the category requested */
+ if (strcmp(req.category, "all") == 0) {
+ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) {
+ spdk_json_write_string(w, raid_bdev->bdev.name);
+ }
+ } else if (strcmp(req.category, "online") == 0) {
+ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configured_list, state_link) {
+ spdk_json_write_string(w, raid_bdev->bdev.name);
+ }
+ } else if (strcmp(req.category, "configuring") == 0) {
+ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_configuring_list, state_link) {
+ spdk_json_write_string(w, raid_bdev->bdev.name);
+ }
+ } else {
+ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_offline_list, state_link) {
+ spdk_json_write_string(w, raid_bdev->bdev.name);
+ }
+ }
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_raid_get_bdevs(&req);
+}
+SPDK_RPC_REGISTER("bdev_raid_get_bdevs", rpc_bdev_raid_get_bdevs, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_raid_get_bdevs, get_raid_bdevs)
+
+/*
+ * Base bdevs in RPC bdev_raid_create
+ */
+struct rpc_bdev_raid_create_base_bdevs {
+ /* Number of base bdevs */
+ size_t num_base_bdevs;
+
+ /* List of base bdevs names */
+ char *base_bdevs[RPC_MAX_BASE_BDEVS];
+};
+
+/*
+ * Input structure for RPC rpc_bdev_raid_create
+ */
+struct rpc_bdev_raid_create {
+ /* Raid bdev name */
+ char *name;
+
+ /* RAID strip size in KB; 'strip_size' is deprecated in favor of 'strip_size_kb'. */
+ uint32_t strip_size;
+ uint32_t strip_size_kb;
+
+ /* RAID level */
+ enum raid_level level;
+
+ /* Base bdevs information */
+ struct rpc_bdev_raid_create_base_bdevs base_bdevs;
+};
+
+/*
+ * brief:
+ * free_rpc_bdev_raid_create function frees RPC bdev_raid_create related parameters
+ * params:
+ * req - pointer to RPC request
+ * returns:
+ * none
+ */
+static void
+free_rpc_bdev_raid_create(struct rpc_bdev_raid_create *req)
+{
+ size_t i;
+
+ free(req->name);
+ for (i = 0; i < req->base_bdevs.num_base_bdevs; i++) {
+ free(req->base_bdevs.base_bdevs[i]);
+ }
+}
+
+/*
+ * Decoder function for RPC bdev_raid_create to decode raid level
+ */
+static int
+decode_raid_level(const struct spdk_json_val *val, void *out)
+{
+ int ret;
+ char *str = NULL;
+ enum raid_level level;
+
+ ret = spdk_json_decode_string(val, &str);
+ if (ret == 0 && str != NULL) {
+ level = raid_bdev_parse_raid_level(str);
+ if (level == INVALID_RAID_LEVEL) {
+ ret = -EINVAL;
+ } else {
+ *(enum raid_level *)out = level;
+ }
+ }
+
+ free(str);
+ return ret;
+}
+
+/*
+ * Decoder function for RPC bdev_raid_create to decode base bdevs list
+ */
+static int
+decode_base_bdevs(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_bdev_raid_create_base_bdevs *base_bdevs = out;
+ return spdk_json_decode_array(val, spdk_json_decode_string, base_bdevs->base_bdevs,
+ RPC_MAX_BASE_BDEVS, &base_bdevs->num_base_bdevs, sizeof(char *));
+}
+
+/*
+ * Decoder object for RPC bdev_raid_create
+ */
+/* Note: strip_size is deprecated, one of the two options must be specified but not both. */
+static const struct spdk_json_object_decoder rpc_bdev_raid_create_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_raid_create, name), spdk_json_decode_string},
+ {"strip_size", offsetof(struct rpc_bdev_raid_create, strip_size), spdk_json_decode_uint32, true},
+ {"strip_size_kb", offsetof(struct rpc_bdev_raid_create, strip_size_kb), spdk_json_decode_uint32, true},
+ {"raid_level", offsetof(struct rpc_bdev_raid_create, level), decode_raid_level},
+ {"base_bdevs", offsetof(struct rpc_bdev_raid_create, base_bdevs), decode_base_bdevs},
+};
+
+/*
+ * brief:
+ * rpc_bdev_raid_create function is the RPC for creating RAID bdevs. It takes the
+ * raid bdev name, RAID level, strip size in KB and a list of base bdev names as input.
+ * params:
+ * request - pointer to json rpc request
+ * params - pointer to request parameters
+ * returns:
+ * none
+ */
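+/*
+ * Example request (illustrative; the raid bdev name, base bdev names and strip
+ * size are hypothetical):
+ *
+ *     {"jsonrpc": "2.0", "method": "bdev_raid_create", "id": 1,
+ *      "params": {"name": "Raid0", "raid_level": "raid0", "strip_size_kb": 64,
+ *                 "base_bdevs": ["Nvme0n1", "Nvme1n1"]}}
+ */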
+static void
+rpc_bdev_raid_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_raid_create req = {};
+ struct spdk_json_write_ctx *w;
+ struct raid_bdev_config *raid_cfg;
+ int rc;
+ size_t i;
+
+ if (spdk_json_decode_object(params, rpc_bdev_raid_create_decoders,
+ SPDK_COUNTOF(rpc_bdev_raid_create_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ if (req.strip_size == 0 && req.strip_size_kb == 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "strip size not specified");
+ goto cleanup;
+ } else if (req.strip_size > 0 && req.strip_size_kb > 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "please use only one strip size option");
+ goto cleanup;
+ } else if (req.strip_size > 0 && req.strip_size_kb == 0) {
+ SPDK_ERRLOG("The RPC param strip_size is deprecated, use strip_size_kb instead.\n");
+ req.strip_size_kb = req.strip_size;
+ }
+
+ rc = raid_bdev_config_add(req.name, req.strip_size_kb, req.base_bdevs.num_base_bdevs,
+ req.level,
+ &raid_cfg);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, rc,
+ "Failed to add RAID bdev config %s: %s",
+ req.name, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ for (i = 0; i < req.base_bdevs.num_base_bdevs; i++) {
+ rc = raid_bdev_config_add_base_bdev(raid_cfg, req.base_bdevs.base_bdevs[i], i);
+ if (rc != 0) {
+ raid_bdev_config_cleanup(raid_cfg);
+ spdk_jsonrpc_send_error_response_fmt(request, rc,
+ "Failed to add base bdev %s to RAID bdev config %s: %s",
+ req.base_bdevs.base_bdevs[i], req.name,
+ spdk_strerror(-rc));
+ goto cleanup;
+ }
+ }
+
+ rc = raid_bdev_create(raid_cfg);
+ if (rc != 0) {
+ raid_bdev_config_cleanup(raid_cfg);
+ spdk_jsonrpc_send_error_response_fmt(request, rc,
+ "Failed to create RAID bdev %s: %s",
+ req.name, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ rc = raid_bdev_add_base_devices(raid_cfg);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, rc,
+ "Failed to add any base bdev to RAID bdev %s: %s",
+ req.name, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_raid_create(&req);
+}
+SPDK_RPC_REGISTER("bdev_raid_create", rpc_bdev_raid_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_raid_create, construct_raid_bdev)
+
+/*
+ * Input structure for RPC deleting a raid bdev
+ */
+struct rpc_bdev_raid_delete {
+ /* raid bdev name */
+ char *name;
+};
+
+/*
+ * brief:
+ * free_rpc_bdev_raid_delete function is used to free RPC bdev_raid_delete related parameters
+ * params:
+ * req - pointer to RPC request
+ * returns:
+ * none
+ */
+static void
+free_rpc_bdev_raid_delete(struct rpc_bdev_raid_delete *req)
+{
+ free(req->name);
+}
+
+/*
+ * Decoder object for RPC bdev_raid_delete
+ */
+static const struct spdk_json_object_decoder rpc_bdev_raid_delete_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_raid_delete, name), spdk_json_decode_string},
+};
+
+struct rpc_bdev_raid_delete_ctx {
+ struct rpc_bdev_raid_delete req;
+ struct raid_bdev_config *raid_cfg;
+ struct spdk_jsonrpc_request *request;
+};
+
+/*
+ * brief:
+ * bdev_raid_delete_done function is called after the base bdevs of the raid
+ * bdev have been removed; it completes the bdev_raid_delete RPC and frees the
+ * request context.
+ * params:
+ * cb_arg - pointer to the callback context.
+ * rc - return code of the deletion of the raid bdev.
+ * returns:
+ * none
+ */
+static void
+bdev_raid_delete_done(void *cb_arg, int rc)
+{
+ struct rpc_bdev_raid_delete_ctx *ctx = cb_arg;
+ struct raid_bdev_config *raid_cfg;
+ struct spdk_jsonrpc_request *request = ctx->request;
+ struct spdk_json_write_ctx *w;
+
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to delete raid bdev %s (%d): %s\n",
+ ctx->req.name, rc, spdk_strerror(-rc));
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(-rc));
+ goto exit;
+ }
+
+ raid_cfg = ctx->raid_cfg;
+ assert(raid_cfg->raid_bdev == NULL);
+
+ raid_bdev_config_cleanup(raid_cfg);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+exit:
+ free_rpc_bdev_raid_delete(&ctx->req);
+ free(ctx);
+}
+
+/*
+ * brief:
+ * rpc_bdev_raid_delete function is the RPC for deleting a raid bdev. It takes the
+ * raid bdev name as input and deletes that raid bdev, freeing the base bdev
+ * resources.
+ * params:
+ * request - pointer to json rpc request
+ * params - pointer to request parameters
+ * returns:
+ * none
+ */
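+/*
+ * Example request (illustrative; the raid bdev name is hypothetical):
+ *
+ *     {"jsonrpc": "2.0", "method": "bdev_raid_delete", "id": 1,
+ *      "params": {"name": "Raid0"}}
+ */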
+static void
+rpc_bdev_raid_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_raid_delete_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_bdev_raid_delete_decoders,
+ SPDK_COUNTOF(rpc_bdev_raid_delete_decoders),
+ &ctx->req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ ctx->raid_cfg = raid_bdev_config_find_by_name(ctx->req.name);
+ if (ctx->raid_cfg == NULL) {
+ spdk_jsonrpc_send_error_response_fmt(request, -ENODEV,
+ "raid bdev %s is not found in config",
+ ctx->req.name);
+ goto cleanup;
+ }
+
+ ctx->request = request;
+
+ /* Remove all the base bdevs from this raid bdev before deleting the raid bdev */
+ raid_bdev_remove_base_devices(ctx->raid_cfg, bdev_raid_delete_done, ctx);
+
+ return;
+
+cleanup:
+ free_rpc_bdev_raid_delete(&ctx->req);
+ free(ctx);
+}
+SPDK_RPC_REGISTER("bdev_raid_delete", rpc_bdev_raid_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_raid_delete, destroy_raid_bdev)
diff --git a/src/spdk/module/bdev/raid/raid0.c b/src/spdk/module/bdev/raid/raid0.c
new file mode 100644
index 000000000..5632c5b7c
--- /dev/null
+++ b/src/spdk/module/bdev/raid/raid0.c
@@ -0,0 +1,398 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_raid.h"
+
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+/*
+ * brief:
+ * raid0_bdev_io_completion function is called by the lower layers to notify the
+ * raid module that a particular bdev_io has completed.
+ * params:
+ * bdev_io - pointer to bdev io submitted to lower layers, like child io
+ * success - bdev_io status
+ * cb_arg - function callback context (parent raid_bdev_io)
+ * returns:
+ * none
+ */
+static void
+raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct raid_bdev_io *raid_io = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (success) {
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+raid0_submit_rw_request(struct raid_bdev_io *raid_io);
+
+static void
+_raid0_submit_rw_request(void *_raid_io)
+{
+ struct raid_bdev_io *raid_io = _raid_io;
+
+ raid0_submit_rw_request(raid_io);
+}
+
+/*
+ * brief:
+ * raid0_submit_rw_request function is used to submit I/O to the correct
+ * member disk for raid0 bdevs.
+ * params:
+ * raid_io - pointer to raid_bdev_io
+ * returns:
+ * none
+ */
+static void
+raid0_submit_rw_request(struct raid_bdev_io *raid_io)
+{
+ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io);
+ struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
+ struct raid_bdev *raid_bdev = raid_io->raid_bdev;
+ uint64_t pd_strip;
+ uint32_t offset_in_strip;
+ uint64_t pd_lba;
+ uint64_t pd_blocks;
+ uint8_t pd_idx;
+ int ret = 0;
+ uint64_t start_strip;
+ uint64_t end_strip;
+ struct raid_base_bdev_info *base_info;
+ struct spdk_io_channel *base_ch;
+
+ start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift;
+ end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >>
+ raid_bdev->strip_size_shift;
+ if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
+ assert(false);
+ SPDK_ERRLOG("I/O spans strip boundary!\n");
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ pd_strip = start_strip / raid_bdev->num_base_bdevs;
+ pd_idx = start_strip % raid_bdev->num_base_bdevs;
+ offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1);
+ pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
+ pd_blocks = bdev_io->u.bdev.num_blocks;
+ base_info = &raid_bdev->base_bdev_info[pd_idx];
+ if (base_info->desc == NULL) {
+ SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
+ assert(0);
+ }
+
+ /*
+ * Submit the child I/O to the bdev layer using the base bdev descriptor, base
+ * bdev LBA, child I/O length in blocks, buffer, completion callback and
+ * callback context
+ */
+ assert(raid_ch != NULL);
+ assert(raid_ch->base_channel);
+ base_ch = raid_ch->base_channel[pd_idx];
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+ ret = spdk_bdev_readv_blocks(base_info->desc, base_ch,
+ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ pd_lba, pd_blocks, raid0_bdev_io_completion,
+ raid_io);
+ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+ ret = spdk_bdev_writev_blocks(base_info->desc, base_ch,
+ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ pd_lba, pd_blocks, raid0_bdev_io_completion,
+ raid_io);
+ } else {
+ SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type);
+ assert(0);
+ }
+
+ if (ret == -ENOMEM) {
+ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
+ _raid0_submit_rw_request);
+ } else if (ret != 0) {
+ SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
+ assert(false);
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
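+
+/*
+ * Worked example of the strip mapping above (values chosen for illustration):
+ * with strip_size = 8 blocks (strip_size_shift = 3), num_base_bdevs = 4,
+ * offset_blocks = 25 and num_blocks = 3:
+ *
+ *     start_strip = 25 >> 3 = 3 (same as end_strip, so no boundary crossing)
+ *     pd_idx = 3 % 4 = 3, pd_strip = 3 / 4 = 0
+ *     offset_in_strip = 25 & 7 = 1, pd_lba = (0 << 3) + 1 = 1
+ *
+ * The whole I/O is therefore sent to base bdev 3 at LBA 1 for 3 blocks.
+ */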
+
+/* raid0 IO range */
+struct raid_bdev_io_range {
+ uint64_t strip_size;
+ uint64_t start_strip_in_disk;
+ uint64_t end_strip_in_disk;
+ uint64_t start_offset_in_strip;
+ uint64_t end_offset_in_strip;
+ uint8_t start_disk;
+ uint8_t end_disk;
+ uint8_t n_disks_involved;
+};
+
+static inline void
+_raid0_get_io_range(struct raid_bdev_io_range *io_range,
+ uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
+ uint64_t offset_blocks, uint64_t num_blocks)
+{
+ uint64_t start_strip;
+ uint64_t end_strip;
+
+ io_range->strip_size = strip_size;
+
+ /* The start and end strip index in raid0 bdev scope */
+ start_strip = offset_blocks >> strip_size_shift;
+ end_strip = (offset_blocks + num_blocks - 1) >> strip_size_shift;
+ io_range->start_strip_in_disk = start_strip / num_base_bdevs;
+ io_range->end_strip_in_disk = end_strip / num_base_bdevs;
+
+ /* The first strip may have unaligned start LBA offset.
+ * The end strip may have unaligned end LBA offset.
+ * Strips between them certainly have aligned offset and length to boundaries.
+ */
+ io_range->start_offset_in_strip = offset_blocks % strip_size;
+ io_range->end_offset_in_strip = (offset_blocks + num_blocks - 1) % strip_size;
+
+ /* The base bdev indexes in which start and end strips are located */
+ io_range->start_disk = start_strip % num_base_bdevs;
+ io_range->end_disk = end_strip % num_base_bdevs;
+
+ /* Calculate how many base_bdevs are involved in io operation.
+ * Number of base bdevs involved is between 1 and num_base_bdevs.
+ * It will be 1 if the first strip and last strip are the same one.
+ */
+ io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
+}
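+
+/*
+ * Worked example (values chosen for illustration): with num_base_bdevs = 3,
+ * strip_size = 8 (strip_size_shift = 3), offset_blocks = 10, num_blocks = 20:
+ *
+ *     start_strip = 10 >> 3 = 1, end_strip = 29 >> 3 = 3
+ *     start_strip_in_disk = 1 / 3 = 0, end_strip_in_disk = 3 / 3 = 1
+ *     start_offset_in_strip = 10 % 8 = 2, end_offset_in_strip = 29 % 8 = 5
+ *     start_disk = 1 % 3 = 1, end_disk = 3 % 3 = 0
+ *     n_disks_involved = min(3 - 1 + 1, 3) = 3
+ */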
+
+static inline void
+_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
+ uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
+{
+ uint64_t n_strips_in_disk;
+ uint64_t start_offset_in_disk;
+ uint64_t end_offset_in_disk;
+ uint64_t offset_in_disk;
+ uint64_t nblocks_in_disk;
+ uint64_t start_strip_in_disk;
+ uint64_t end_strip_in_disk;
+
+ start_strip_in_disk = io_range->start_strip_in_disk;
+ if (disk_idx < io_range->start_disk) {
+ start_strip_in_disk += 1;
+ }
+
+ end_strip_in_disk = io_range->end_strip_in_disk;
+ if (disk_idx > io_range->end_disk) {
+ end_strip_in_disk -= 1;
+ }
+
+ assert(end_strip_in_disk >= start_strip_in_disk);
+ n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;
+
+ if (disk_idx == io_range->start_disk) {
+ start_offset_in_disk = io_range->start_offset_in_strip;
+ } else {
+ start_offset_in_disk = 0;
+ }
+
+ if (disk_idx == io_range->end_disk) {
+ end_offset_in_disk = io_range->end_offset_in_strip;
+ } else {
+ end_offset_in_disk = io_range->strip_size - 1;
+ }
+
+ offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
+ nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
+ + end_offset_in_disk - start_offset_in_disk + 1;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0,
+ "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64 ").\n",
+ io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);
+
+ *_offset_in_disk = offset_in_disk;
+ *_nblocks_in_disk = nblocks_in_disk;
+}
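+
+/*
+ * Continuing the worked example above for disk_idx = 0 (the end_disk): since
+ * 0 < start_disk, start_strip_in_disk becomes 0 + 1 = 1 while
+ * end_strip_in_disk stays 1, so n_strips_in_disk = 1; start_offset_in_disk is
+ * 0 (disk 0 is not the start_disk) and end_offset_in_disk is 5 (it is the
+ * end_disk). The split yields offset_in_disk = 0 + 1 * 8 = 8 and
+ * nblocks_in_disk = 0 * 8 + 5 - 0 + 1 = 6, i.e. base bdev 0 services its
+ * local blocks [8, 13], which correspond to global blocks [24, 29].
+ */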
+
+static void
+raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);
+
+static void
+_raid0_submit_null_payload_request(void *_raid_io)
+{
+ struct raid_bdev_io *raid_io = _raid_io;
+
+ raid0_submit_null_payload_request(raid_io);
+}
+
+static void
+raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct raid_bdev_io *raid_io = cb_arg;
+
+ raid_bdev_io_complete_part(raid_io, 1, success ?
+ SPDK_BDEV_IO_STATUS_SUCCESS :
+ SPDK_BDEV_IO_STATUS_FAILED);
+
+ spdk_bdev_free_io(bdev_io);
+}
+
+/*
+ * brief:
+ * raid0_submit_null_payload_request function submits the next batch of
+ * I/O requests with a range but without payload, like FLUSH and UNMAP, to member disks;
+ * it submits as many as possible, and if one base I/O request fails with -ENOMEM
+ * it queues itself for later submission.
+ * params:
+ * raid_io - pointer to raid_bdev_io on the raid bdev device
+ * returns:
+ * none
+ */
+static void
+raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
+{
+ struct spdk_bdev_io *bdev_io;
+ struct raid_bdev *raid_bdev;
+ struct raid_bdev_io_range io_range;
+ int ret;
+ struct raid_base_bdev_info *base_info;
+ struct spdk_io_channel *base_ch;
+
+ bdev_io = spdk_bdev_io_from_ctx(raid_io);
+ raid_bdev = raid_io->raid_bdev;
+
+ _raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
+ raid_bdev->strip_size, raid_bdev->strip_size_shift,
+ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks);
+
+ if (raid_io->base_bdev_io_remaining == 0) {
+ raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
+ }
+
+ while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
+ uint8_t disk_idx;
+ uint64_t offset_in_disk;
+ uint64_t nblocks_in_disk;
+
+ /* Base bdevs are walked from start_disk to end_disk, wrapping around
+ * the array, so the index of start_disk may be larger than end_disk's.
+ */
+ disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
+ base_info = &raid_bdev->base_bdev_info[disk_idx];
+ base_ch = raid_io->raid_ch->base_channel[disk_idx];
+
+ _raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch,
+ offset_in_disk, nblocks_in_disk,
+ raid0_base_io_complete, raid_io);
+ break;
+
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ ret = spdk_bdev_flush_blocks(base_info->desc, base_ch,
+ offset_in_disk, nblocks_in_disk,
+ raid0_base_io_complete, raid_io);
+ break;
+
+ default:
+ SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type);
+ assert(false);
+ ret = -EIO;
+ }
+
+ if (ret == 0) {
+ raid_io->base_bdev_io_submitted++;
+ } else if (ret == -ENOMEM) {
+ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch,
+ _raid0_submit_null_payload_request);
+ return;
+ } else {
+ SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
+ assert(false);
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+ }
+}
+
+static int raid0_start(struct raid_bdev *raid_bdev)
+{
+ uint64_t min_blockcnt = UINT64_MAX;
+ struct raid_base_bdev_info *base_info;
+
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+ /* Calculate minimum block count from all base bdevs */
+ min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
+ }
+
+ /*
+ * Take the minimum block count based approach, where the total block count
+ * of the raid bdev is the number of base bdevs times the strip-aligned
+ * minimum block count of any base bdev.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID0, "min blockcount %lu, numbasedev %u, strip size shift %u\n",
+ min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
+ raid_bdev->bdev.blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) <<
+ raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs;
+
+ if (raid_bdev->num_base_bdevs > 1) {
+ raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
+ raid_bdev->bdev.split_on_optimal_io_boundary = true;
+ } else {
+ /* There is no need to split reads/writes on a raid bdev with a single base bdev. */
+ raid_bdev->bdev.optimal_io_boundary = 0;
+ raid_bdev->bdev.split_on_optimal_io_boundary = false;
+ }
+
+ return 0;
+}
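+
+/*
+ * Numeric example (illustrative): with min_blockcnt = 1001, strip_size = 8
+ * (strip_size_shift = 3) and num_base_bdevs = 4, the per-disk count is
+ * rounded down to a strip multiple, (1001 >> 3) << 3 = 1000, giving
+ * raid_bdev->bdev.blockcnt = 1000 * 4 = 4000; the trailing partial strip of
+ * each base bdev is left unused.
+ */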
+
+static struct raid_bdev_module g_raid0_module = {
+ .level = RAID0,
+ .base_bdevs_min = 1,
+ .start = raid0_start,
+ .submit_rw_request = raid0_submit_rw_request,
+ .submit_null_payload_request = raid0_submit_null_payload_request,
+};
+RAID_MODULE_REGISTER(&g_raid0_module)
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_raid0", SPDK_LOG_BDEV_RAID0)
diff --git a/src/spdk/module/bdev/raid/raid5.c b/src/spdk/module/bdev/raid/raid5.c
new file mode 100644
index 000000000..1e287c863
--- /dev/null
+++ b/src/spdk/module/bdev/raid/raid5.c
@@ -0,0 +1,114 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_raid.h"
+
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+struct raid5_info {
+ /* The parent raid bdev */
+ struct raid_bdev *raid_bdev;
+
+ /* Number of data blocks in a stripe (without parity) */
+ uint64_t stripe_blocks;
+
+ /* Number of stripes on this array */
+ uint64_t total_stripes;
+};
+
+static inline uint8_t
+raid5_stripe_data_chunks_num(const struct raid_bdev *raid_bdev)
+{
+ return raid_bdev->num_base_bdevs - raid_bdev->module->base_bdevs_max_degraded;
+}
+
+static void
+raid5_submit_rw_request(struct raid_bdev_io *raid_io)
+{
+ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
+}
+
+static int
+raid5_start(struct raid_bdev *raid_bdev)
+{
+ uint64_t min_blockcnt = UINT64_MAX;
+ struct raid_base_bdev_info *base_info;
+ struct raid5_info *r5info;
+
+ r5info = calloc(1, sizeof(*r5info));
+ if (!r5info) {
+ SPDK_ERRLOG("Failed to allocate r5info\n");
+ return -ENOMEM;
+ }
+ r5info->raid_bdev = raid_bdev;
+
+ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
+ min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt);
+ }
+
+ r5info->total_stripes = min_blockcnt / raid_bdev->strip_size;
+ r5info->stripe_blocks = raid_bdev->strip_size * raid5_stripe_data_chunks_num(raid_bdev);
+
+ raid_bdev->bdev.blockcnt = r5info->stripe_blocks * r5info->total_stripes;
+ raid_bdev->bdev.optimal_io_boundary = r5info->stripe_blocks;
+ raid_bdev->bdev.split_on_optimal_io_boundary = true;
+
+ raid_bdev->module_private = r5info;
+
+ return 0;
+}
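+
+/*
+ * Numeric example (illustrative): with 5 base bdevs and
+ * base_bdevs_max_degraded = 1 there are 4 data chunks per stripe; with
+ * strip_size = 8 and min_blockcnt = 1001, stripe_blocks = 8 * 4 = 32 and
+ * total_stripes = 1001 / 8 = 125, so the exposed capacity is
+ * blockcnt = 32 * 125 = 4000 blocks.
+ */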
+
+static void
+raid5_stop(struct raid_bdev *raid_bdev)
+{
+ struct raid5_info *r5info = raid_bdev->module_private;
+
+ free(r5info);
+}
+
+static struct raid_bdev_module g_raid5_module = {
+ .level = RAID5,
+ .base_bdevs_min = 3,
+ .base_bdevs_max_degraded = 1,
+ .start = raid5_start,
+ .stop = raid5_stop,
+ .submit_rw_request = raid5_submit_rw_request,
+};
+RAID_MODULE_REGISTER(&g_raid5_module)
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_raid5", SPDK_LOG_BDEV_RAID5)
diff --git a/src/spdk/module/bdev/rbd/Makefile b/src/spdk/module/bdev/rbd/Makefile
new file mode 100644
index 000000000..055e14dac
--- /dev/null
+++ b/src/spdk/module/bdev/rbd/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+C_SRCS = bdev_rbd.c bdev_rbd_rpc.c
+LIBNAME = bdev_rbd
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/rbd/bdev_rbd.c b/src/spdk/module/bdev/rbd/bdev_rbd.c
new file mode 100644
index 000000000..f3b2547c4
--- /dev/null
+++ b/src/spdk/module/bdev/rbd/bdev_rbd.c
@@ -0,0 +1,898 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "bdev_rbd.h"
+
+#include <rbd/librbd.h>
+#include <rados/librados.h>
+#include <sys/eventfd.h>
+
+#include "spdk/conf.h"
+#include "spdk/env.h"
+#include "spdk/bdev.h"
+#include "spdk/thread.h"
+#include "spdk/json.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+#define SPDK_RBD_QUEUE_DEPTH 128
+
+static int bdev_rbd_count = 0;
+
+#define BDEV_RBD_POLL_US 50
+
+struct bdev_rbd {
+ struct spdk_bdev disk;
+ char *rbd_name;
+ char *user_id;
+ char *pool_name;
+ char **config;
+ rbd_image_info_t info;
+ TAILQ_ENTRY(bdev_rbd) tailq;
+ struct spdk_poller *reset_timer;
+ struct spdk_bdev_io *reset_bdev_io;
+};
+
+struct bdev_rbd_io_channel {
+ rados_ioctx_t io_ctx;
+ rados_t cluster;
+ struct pollfd pfd;
+ rbd_image_t image;
+ struct bdev_rbd *disk;
+ struct spdk_poller *poller;
+};
+
+struct bdev_rbd_io {
+ uint64_t remaining_len;
+ int num_segments;
+ bool failed;
+};
+
+static void
+bdev_rbd_free(struct bdev_rbd *rbd)
+{
+ if (!rbd) {
+ return;
+ }
+
+ free(rbd->disk.name);
+ free(rbd->rbd_name);
+ free(rbd->user_id);
+ free(rbd->pool_name);
+ bdev_rbd_free_config(rbd->config);
+ free(rbd);
+}
+
+void
+bdev_rbd_free_config(char **config)
+{
+ char **entry;
+
+ if (config) {
+ for (entry = config; *entry; entry++) {
+ free(*entry);
+ }
+ free(config);
+ }
+}
+
+char **
+bdev_rbd_dup_config(const char *const *config)
+{
+ size_t count;
+ char **copy;
+
+ if (!config) {
+ return NULL;
+ }
+ for (count = 0; config[count]; count++) {}
+ copy = calloc(count + 1, sizeof(*copy));
+ if (!copy) {
+ return NULL;
+ }
+ for (count = 0; config[count]; count++) {
+ if (!(copy[count] = strdup(config[count]))) {
+ bdev_rbd_free_config(copy);
+ return NULL;
+ }
+ }
+ return copy;
+}
+
+static int
+bdev_rados_context_init(const char *user_id, const char *rbd_pool_name, const char *const *config,
+ rados_t *cluster, rados_ioctx_t *io_ctx)
+{
+ int ret;
+
+ ret = rados_create(cluster, user_id);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to create rados_t struct\n");
+ return -1;
+ }
+
+ if (config) {
+ const char *const *entry = config;
+ while (*entry) {
+ ret = rados_conf_set(*cluster, entry[0], entry[1]);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to set %s = %s\n", entry[0], entry[1]);
+ rados_shutdown(*cluster);
+ return -1;
+ }
+ entry += 2;
+ }
+ } else {
+ ret = rados_conf_read_file(*cluster, NULL);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to read conf file\n");
+ rados_shutdown(*cluster);
+ return -1;
+ }
+ }
+
+ ret = rados_connect(*cluster);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to connect to rbd_pool\n");
+ rados_shutdown(*cluster);
+ return -1;
+ }
+
+ ret = rados_ioctx_create(*cluster, rbd_pool_name, io_ctx);
+
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to create ioctx\n");
+ rados_shutdown(*cluster);
+ return -1;
+ }
+
+ return 0;
+}
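+
+/*
+ * The config argument above is a flat, NULL-terminated array of key/value
+ * pairs consumed two entries at a time. An illustrative example (all values
+ * are hypothetical):
+ *
+ *     static const char *const config[] = {
+ *         "mon_host", "192.168.0.1:6789",
+ *         "key", "AQD...base64-key...",
+ *         NULL
+ *     };
+ */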
+
+static int
+bdev_rbd_init(const char *user_id, const char *rbd_pool_name, const char *const *config,
+ const char *rbd_name, rbd_image_info_t *info)
+{
+ int ret;
+ rados_t cluster = NULL;
+ rados_ioctx_t io_ctx = NULL;
+ rbd_image_t image = NULL;
+
+ ret = bdev_rados_context_init(user_id, rbd_pool_name, config, &cluster, &io_ctx);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to create rados context for user_id=%s and rbd_pool=%s\n",
+ user_id ? user_id : "admin (the default)", rbd_pool_name);
+ return -1;
+ }
+
+ ret = rbd_open(io_ctx, rbd_name, &image, NULL);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to open specified rbd device\n");
+ goto err;
+ }
+ ret = rbd_stat(image, info, sizeof(*info));
+ rbd_close(image);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to stat specified rbd device\n");
+ goto err;
+ }
+
+ rados_ioctx_destroy(io_ctx);
+ return 0;
+err:
+ rados_ioctx_destroy(io_ctx);
+ rados_shutdown(cluster);
+ return -1;
+}
+
+static void
+bdev_rbd_exit(rbd_image_t image)
+{
+ rbd_flush(image);
+ rbd_close(image);
+}
+
+static void
+bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg)
+{
+ /* Doing nothing here */
+}
+
+static int
+bdev_rbd_start_aio(rbd_image_t image, struct spdk_bdev_io *bdev_io,
+ void *buf, uint64_t offset, size_t len)
+{
+ int ret;
+ rbd_completion_t comp;
+
+ ret = rbd_aio_create_completion(bdev_io, bdev_rbd_finish_aiocb,
+ &comp);
+ if (ret < 0) {
+ return -1;
+ }
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+ ret = rbd_aio_read(image, offset, len,
+ buf, comp);
+ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+ ret = rbd_aio_write(image, offset, len,
+ buf, comp);
+ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) {
+ ret = rbd_aio_flush(image, comp);
+ }
+
+ if (ret < 0) {
+ rbd_aio_release(comp);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int bdev_rbd_library_init(void);
+
+static int
+bdev_rbd_get_ctx_size(void)
+{
+ return sizeof(struct bdev_rbd_io);
+}
+
+static struct spdk_bdev_module rbd_if = {
+ .name = "rbd",
+ .module_init = bdev_rbd_library_init,
+ .get_ctx_size = bdev_rbd_get_ctx_size,
+
+};
+SPDK_BDEV_MODULE_REGISTER(rbd, &rbd_if)
+
+static int64_t
+bdev_rbd_rw(struct bdev_rbd *disk, struct spdk_io_channel *ch,
+ struct spdk_bdev_io *bdev_io, struct iovec *iov,
+ int iovcnt, size_t len, uint64_t offset)
+{
+ struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx;
+ struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch);
+ size_t remaining = len;
+ int i, rc;
+
+ rbd_io->remaining_len = 0;
+ rbd_io->num_segments = 0;
+ rbd_io->failed = false;
+
+ for (i = 0; i < iovcnt && remaining > 0; i++) {
+ size_t seg_len = spdk_min(remaining, iov[i].iov_len);
+
+ rc = bdev_rbd_start_aio(rbdio_ch->image, bdev_io, iov[i].iov_base, offset, seg_len);
+ if (rc) {
+ /*
+ * This bdev_rbd_start_aio() call failed, but if any previous ones were
+ * submitted, we need to wait for them to finish.
+ */
+ if (rbd_io->num_segments == 0) {
+ /* No previous I/O submitted - return error code immediately. */
+ return rc;
+ }
+
+ /* Return and wait for outstanding I/O to complete. */
+ rbd_io->failed = true;
+ return 0;
+ }
+
+ rbd_io->num_segments++;
+ rbd_io->remaining_len += seg_len;
+
+ offset += seg_len;
+ remaining -= seg_len;
+ }
+
+ return 0;
+}
+
+static int64_t
+bdev_rbd_flush(struct bdev_rbd *disk, struct spdk_io_channel *ch,
+ struct spdk_bdev_io *bdev_io, uint64_t offset, uint64_t nbytes)
+{
+ struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch);
+ struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx;
+
+ rbd_io->num_segments++;
+ return bdev_rbd_start_aio(rbdio_ch->image, bdev_io, NULL, offset, nbytes);
+}
+
+static int
+bdev_rbd_reset_timer(void *arg)
+{
+ struct bdev_rbd *disk = arg;
+
+ /*
+ * TODO: This should check if any I/O is still in flight before completing the reset.
+ * For now, just complete after the timer expires.
+ */
+ spdk_bdev_io_complete(disk->reset_bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ spdk_poller_unregister(&disk->reset_timer);
+ disk->reset_bdev_io = NULL;
+
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+bdev_rbd_reset(struct bdev_rbd *disk, struct spdk_bdev_io *bdev_io)
+{
+ /*
+ * HACK: Since librbd doesn't provide any way to cancel outstanding aio, just kick off a
+ * timer to wait for in-flight I/O to complete.
+ */
+ assert(disk->reset_bdev_io == NULL);
+ disk->reset_bdev_io = bdev_io;
+ disk->reset_timer = SPDK_POLLER_REGISTER(bdev_rbd_reset_timer, disk, 1 * 1000 * 1000);
+
+ return 0;
+}
+
+static int
+bdev_rbd_destruct(void *ctx)
+{
+ struct bdev_rbd *rbd = ctx;
+
+ spdk_io_device_unregister(rbd, NULL);
+
+ bdev_rbd_free(rbd);
+ return 0;
+}
+
+static void
+bdev_rbd_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ int ret;
+
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ ret = bdev_rbd_rw(bdev_io->bdev->ctxt,
+ ch,
+ bdev_io,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
+
+ if (ret != 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static int _bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, bdev_rbd_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ return 0;
+
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return bdev_rbd_rw((struct bdev_rbd *)bdev_io->bdev->ctxt,
+ ch,
+ bdev_io,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
+
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ return bdev_rbd_flush((struct bdev_rbd *)bdev_io->bdev->ctxt,
+ ch,
+ bdev_io,
+ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+
+ case SPDK_BDEV_IO_TYPE_RESET:
+ return bdev_rbd_reset((struct bdev_rbd *)bdev_io->bdev->ctxt,
+ bdev_io);
+
+ default:
+ return -1;
+ }
+ return 0;
+}
+
+static void bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ if (_bdev_rbd_submit_request(ch, bdev_io) < 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static bool
+bdev_rbd_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static int
+bdev_rbd_io_poll(void *arg)
+{
+ struct bdev_rbd_io_channel *ch = arg;
+ int i, io_status, rc;
+ rbd_completion_t comps[SPDK_RBD_QUEUE_DEPTH];
+ struct spdk_bdev_io *bdev_io;
+ struct bdev_rbd_io *rbd_io;
+
+ rc = poll(&ch->pfd, 1, 0);
+
+ /* check the return value of poll since we have only one fd for each channel */
+ if (rc != 1) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ rc = rbd_poll_io_events(ch->image, comps, SPDK_RBD_QUEUE_DEPTH);
+ for (i = 0; i < rc; i++) {
+ bdev_io = rbd_aio_get_arg(comps[i]);
+ rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx;
+ io_status = rbd_aio_get_return_value(comps[i]);
+
+ assert(rbd_io->num_segments > 0);
+ rbd_io->num_segments--;
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+ if (io_status > 0) {
+ /* For reads, io_status is the length */
+ rbd_io->remaining_len -= io_status;
+ }
+
+ if (rbd_io->num_segments == 0 && rbd_io->remaining_len != 0) {
+ rbd_io->failed = true;
+ }
+ } else {
+ /* For others, 0 means success */
+ if (io_status != 0) {
+ rbd_io->failed = true;
+ }
+ }
+
+ rbd_aio_release(comps[i]);
+
+ if (rbd_io->num_segments == 0) {
+ spdk_bdev_io_complete(bdev_io,
+ rbd_io->failed ? SPDK_BDEV_IO_STATUS_FAILED : SPDK_BDEV_IO_STATUS_SUCCESS);
+ }
+ }
+
+ return rc > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
+}
+
+static void
+bdev_rbd_free_channel(struct bdev_rbd_io_channel *ch)
+{
+ if (!ch) {
+ return;
+ }
+
+ if (ch->image) {
+ bdev_rbd_exit(ch->image);
+ }
+
+ if (ch->io_ctx) {
+ rados_ioctx_destroy(ch->io_ctx);
+ }
+
+ if (ch->cluster) {
+ rados_shutdown(ch->cluster);
+ }
+
+ if (ch->pfd.fd >= 0) {
+ close(ch->pfd.fd);
+ }
+}
+
+static void *
+bdev_rbd_handle(void *arg)
+{
+ struct bdev_rbd_io_channel *ch = arg;
+ void *ret = arg;
+
+ if (rbd_open(ch->io_ctx, ch->disk->rbd_name, &ch->image, NULL) < 0) {
+ SPDK_ERRLOG("Failed to open specified rbd device\n");
+ ret = NULL;
+ }
+
+ return ret;
+}
+
+static int
+bdev_rbd_create_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_rbd_io_channel *ch = ctx_buf;
+ int ret;
+
+ ch->disk = io_device;
+ ch->image = NULL;
+ ch->io_ctx = NULL;
+ ch->pfd.fd = -1;
+
+ ret = bdev_rados_context_init(ch->disk->user_id, ch->disk->pool_name,
+ (const char *const *)ch->disk->config,
+ &ch->cluster, &ch->io_ctx);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to create rados context for user_id %s and rbd_pool=%s\n",
+ ch->disk->user_id ? ch->disk->user_id : "admin (the default)", ch->disk->pool_name);
+ goto err;
+ }
+
+ if (spdk_call_unaffinitized(bdev_rbd_handle, ch) == NULL) {
+ goto err;
+ }
+
+ ch->pfd.fd = eventfd(0, EFD_NONBLOCK);
+ if (ch->pfd.fd < 0) {
+ SPDK_ERRLOG("Failed to get eventfd\n");
+ goto err;
+ }
+
+ ch->pfd.events = POLLIN;
+ ret = rbd_set_image_notification(ch->image, ch->pfd.fd, EVENT_TYPE_EVENTFD);
+ if (ret < 0) {
+ SPDK_ERRLOG("Failed to set rbd image notification\n");
+ goto err;
+ }
+
+ ch->poller = SPDK_POLLER_REGISTER(bdev_rbd_io_poll, ch, BDEV_RBD_POLL_US);
+
+ return 0;
+
+err:
+ bdev_rbd_free_channel(ch);
+ return -1;
+}
+
+static void
+bdev_rbd_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_rbd_io_channel *io_channel = ctx_buf;
+
+ bdev_rbd_free_channel(io_channel);
+
+ spdk_poller_unregister(&io_channel->poller);
+}
+
+static struct spdk_io_channel *
+bdev_rbd_get_io_channel(void *ctx)
+{
+ struct bdev_rbd *rbd_bdev = ctx;
+
+ return spdk_get_io_channel(rbd_bdev);
+}
+
+static int
+bdev_rbd_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct bdev_rbd *rbd_bdev = ctx;
+
+ spdk_json_write_named_object_begin(w, "rbd");
+
+ spdk_json_write_named_string(w, "pool_name", rbd_bdev->pool_name);
+
+ spdk_json_write_named_string(w, "rbd_name", rbd_bdev->rbd_name);
+
+ if (rbd_bdev->user_id) {
+ spdk_json_write_named_string(w, "user_id", rbd_bdev->user_id);
+ }
+
+ if (rbd_bdev->config) {
+ char **entry = rbd_bdev->config;
+
+ spdk_json_write_named_object_begin(w, "config");
+ while (*entry) {
+ spdk_json_write_named_string(w, entry[0], entry[1]);
+ entry += 2;
+ }
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static void
+bdev_rbd_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct bdev_rbd *rbd = bdev->ctxt;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_rbd_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ spdk_json_write_named_string(w, "pool_name", rbd->pool_name);
+ spdk_json_write_named_string(w, "rbd_name", rbd->rbd_name);
+ spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
+ if (rbd->user_id) {
+ spdk_json_write_named_string(w, "user_id", rbd->user_id);
+ }
+
+ if (rbd->config) {
+ char **entry = rbd->config;
+
+ spdk_json_write_named_object_begin(w, "config");
+ while (*entry) {
+ spdk_json_write_named_string(w, entry[0], entry[1]);
+ entry += 2;
+ }
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static const struct spdk_bdev_fn_table rbd_fn_table = {
+ .destruct = bdev_rbd_destruct,
+ .submit_request = bdev_rbd_submit_request,
+ .io_type_supported = bdev_rbd_io_type_supported,
+ .get_io_channel = bdev_rbd_get_io_channel,
+ .dump_info_json = bdev_rbd_dump_info_json,
+ .write_config_json = bdev_rbd_write_config_json,
+};
+
+int
+bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id,
+ const char *pool_name,
+ const char *const *config,
+ const char *rbd_name,
+ uint32_t block_size)
+{
+ struct bdev_rbd *rbd;
+ int ret;
+
+ if ((pool_name == NULL) || (rbd_name == NULL)) {
+ return -EINVAL;
+ }
+
+ rbd = calloc(1, sizeof(struct bdev_rbd));
+ if (rbd == NULL) {
+ SPDK_ERRLOG("Failed to allocate bdev_rbd struct\n");
+ return -ENOMEM;
+ }
+
+ rbd->rbd_name = strdup(rbd_name);
+ if (!rbd->rbd_name) {
+ bdev_rbd_free(rbd);
+ return -ENOMEM;
+ }
+
+ if (user_id) {
+ rbd->user_id = strdup(user_id);
+ if (!rbd->user_id) {
+ bdev_rbd_free(rbd);
+ return -ENOMEM;
+ }
+ }
+
+ rbd->pool_name = strdup(pool_name);
+ if (!rbd->pool_name) {
+ bdev_rbd_free(rbd);
+ return -ENOMEM;
+ }
+
+ if (config && !(rbd->config = bdev_rbd_dup_config(config))) {
+ bdev_rbd_free(rbd);
+ return -ENOMEM;
+ }
+
+ ret = bdev_rbd_init(rbd->user_id, rbd->pool_name,
+ (const char *const *)rbd->config,
+ rbd_name, &rbd->info);
+ if (ret < 0) {
+ bdev_rbd_free(rbd);
+ SPDK_ERRLOG("Failed to init rbd device\n");
+ return ret;
+ }
+
+ if (name) {
+ rbd->disk.name = strdup(name);
+ } else {
+ rbd->disk.name = spdk_sprintf_alloc("Ceph%d", bdev_rbd_count);
+ }
+ if (!rbd->disk.name) {
+ bdev_rbd_free(rbd);
+ return -ENOMEM;
+ }
+ rbd->disk.product_name = "Ceph Rbd Disk";
+ bdev_rbd_count++;
+
+ rbd->disk.write_cache = 0;
+ rbd->disk.blocklen = block_size;
+ rbd->disk.blockcnt = rbd->info.size / rbd->disk.blocklen;
+ rbd->disk.ctxt = rbd;
+ rbd->disk.fn_table = &rbd_fn_table;
+ rbd->disk.module = &rbd_if;
+
+ SPDK_NOTICELOG("Add %s rbd disk to lun\n", rbd->disk.name);
+
+ spdk_io_device_register(rbd, bdev_rbd_create_cb,
+ bdev_rbd_destroy_cb,
+ sizeof(struct bdev_rbd_io_channel),
+ rbd_name);
+ ret = spdk_bdev_register(&rbd->disk);
+ if (ret) {
+ spdk_io_device_unregister(rbd, NULL);
+ bdev_rbd_free(rbd);
+ return ret;
+ }
+
+ *bdev = &(rbd->disk);
+
+ return ret;
+}
+
+void
+bdev_rbd_delete(struct spdk_bdev *bdev, spdk_delete_rbd_complete cb_fn, void *cb_arg)
+{
+ if (!bdev || bdev->module != &rbd_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
+int
+bdev_rbd_resize(struct spdk_bdev *bdev, const uint64_t new_size_in_mb)
+{
+ struct spdk_io_channel *ch;
+ struct bdev_rbd_io_channel *rbd_io_ch;
+ int rc;
+ uint64_t new_size_in_byte;
+ uint64_t current_size_in_mb;
+
+ if (bdev->module != &rbd_if) {
+ return -EINVAL;
+ }
+
+ current_size_in_mb = bdev->blocklen * bdev->blockcnt / (1024 * 1024);
+ if (current_size_in_mb > new_size_in_mb) {
+ SPDK_ERRLOG("The new bdev size must be lager than current bdev size.\n");
+ return -EINVAL;
+ }
+
+ ch = bdev_rbd_get_io_channel(bdev);
+ rbd_io_ch = spdk_io_channel_get_ctx(ch);
+ new_size_in_byte = new_size_in_mb * 1024 * 1024;
+
+ rc = rbd_resize(rbd_io_ch->image, new_size_in_byte);
+ /* Drop the channel reference taken above; it was only needed for rbd_resize(). */
+ spdk_put_io_channel(ch);
+ if (rc != 0) {
+ SPDK_ERRLOG("failed to resize the ceph bdev.\n");
+ return rc;
+ }
+
+ rc = spdk_bdev_notify_blockcnt_change(bdev, new_size_in_byte / bdev->blocklen);
+ if (rc != 0) {
+ SPDK_ERRLOG("failed to notify block cnt change.\n");
+ return rc;
+ }
+
+ return rc;
+}
+
+static int
+bdev_rbd_library_init(void)
+{
+ int i, rc = 0;
+ const char *val;
+ const char *pool_name;
+ const char *rbd_name;
+ struct spdk_bdev *bdev;
+ uint32_t block_size;
+ long int tmp;
+
+ struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Ceph");
+
+ if (sp == NULL) {
+ /*
+ * Ceph section not found. Do not initialize any rbd LUNs.
+ */
+ goto end;
+ }
+
+ /* Init rbd block devices */
+ for (i = 0; ; i++) {
+ val = spdk_conf_section_get_nval(sp, "Ceph", i);
+ if (val == NULL) {
+ break;
+ }
+
+ /* Get the rbd pool name */
+ pool_name = spdk_conf_section_get_nmval(sp, "Ceph", i, 0);
+ if (pool_name == NULL) {
+ SPDK_ERRLOG("Ceph%d: rbd pool name needs to be provided\n", i);
+ rc = -1;
+ goto end;
+ }
+
+ rbd_name = spdk_conf_section_get_nmval(sp, "Ceph", i, 1);
+ if (rbd_name == NULL) {
+ SPDK_ERRLOG("Ceph%d: format error\n", i);
+ rc = -1;
+ goto end;
+ }
+
+ val = spdk_conf_section_get_nmval(sp, "Ceph", i, 2);
+
+ if (val == NULL) {
+ block_size = 512; /* default value */
+ } else {
+ tmp = spdk_strtol(val, 10);
+ if (tmp <= 0) {
+ SPDK_ERRLOG("Invalid block size\n");
+ rc = -1;
+ goto end;
+ } else if (tmp & 0x1ff) {
+ SPDK_ERRLOG("current block_size = %ld, it should be multiple of 512\n",
+ tmp);
+ rc = -1;
+ goto end;
+ }
+ block_size = (uint32_t)tmp;
+ }
+
+ /* TODO(?): user_id and rbd config values */
+ rc = bdev_rbd_create(&bdev, NULL, NULL, pool_name, NULL, rbd_name, block_size);
+ if (rc) {
+ goto end;
+ }
+ }
+
+end:
+ return rc;
+}
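+
+/*
+ * Illustrative legacy config file section parsed by bdev_rbd_library_init
+ * above: each "Ceph" line carries the pool name, the image name and an
+ * optional block size (the names here are hypothetical):
+ *
+ *     [Ceph]
+ *       Ceph rbd_pool rbd_image 512
+ */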
+
+SPDK_LOG_REGISTER_COMPONENT("bdev_rbd", SPDK_LOG_BDEV_RBD)
diff --git a/src/spdk/module/bdev/rbd/bdev_rbd.h b/src/spdk/module/bdev/rbd/bdev_rbd.h
new file mode 100644
index 000000000..1d16a02db
--- /dev/null
+++ b/src/spdk/module/bdev/rbd/bdev_rbd.h
@@ -0,0 +1,68 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_RBD_H
+#define SPDK_BDEV_RBD_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+
+void bdev_rbd_free_config(char **config);
+char **bdev_rbd_dup_config(const char *const *config);
+
+typedef void (*spdk_delete_rbd_complete)(void *cb_arg, int bdeverrno);
+
+int bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id,
+ const char *pool_name,
+ const char *const *config,
+ const char *rbd_name, uint32_t block_size);
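+
+/*
+ * Illustrative call (names hypothetical): create a 512-byte-block bdev
+ * backed by image "image1" in pool "rbd", with default bdev name, user
+ * and cluster config:
+ *
+ * struct spdk_bdev *bdev;
+ * int rc = bdev_rbd_create(&bdev, NULL, NULL, "rbd", NULL, "image1", 512);
+ */
+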
+/**
+ * Delete rbd bdev.
+ *
+ * \param bdev Pointer to rbd bdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Argument to pass to cb_fn.
+ */
+void bdev_rbd_delete(struct spdk_bdev *bdev, spdk_delete_rbd_complete cb_fn,
+ void *cb_arg);
+
+/**
+ * Resize rbd bdev.
+ *
+ * \param bdev Pointer to rbd bdev.
+ * \param new_size_in_mb The new size in MiB for this bdev.
+ *
+ * \return 0 on success, negative errno on failure.
+ */
+int bdev_rbd_resize(struct spdk_bdev *bdev, const uint64_t new_size_in_mb);
+
+#endif /* SPDK_BDEV_RBD_H */
diff --git a/src/spdk/module/bdev/rbd/bdev_rbd_rpc.c b/src/spdk/module/bdev/rbd/bdev_rbd_rpc.c
new file mode 100644
index 000000000..c60c83a58
--- /dev/null
+++ b/src/spdk/module/bdev/rbd/bdev_rbd_rpc.c
@@ -0,0 +1,252 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_rbd.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+struct rpc_create_rbd {
+ char *name;
+ char *user_id;
+ char *pool_name;
+ char *rbd_name;
+ uint32_t block_size;
+ char **config;
+};
+
+static void
+free_rpc_create_rbd(struct rpc_create_rbd *req)
+{
+ free(req->name);
+ free(req->user_id);
+ free(req->pool_name);
+ free(req->rbd_name);
+ bdev_rbd_free_config(req->config);
+}
+
+static int
+bdev_rbd_decode_config(const struct spdk_json_val *values, void *out)
+{
+ char ***map = out;
+ char **entry;
+ uint32_t i;
+
+ if (values->type == SPDK_JSON_VAL_NULL) {
+ /* treated like empty object: empty config */
+ *map = calloc(1, sizeof(**map));
+ if (!*map) {
+ return -1;
+ }
+ return 0;
+ }
+
+ if (values->type != SPDK_JSON_VAL_OBJECT_BEGIN) {
+ return -1;
+ }
+
+ *map = calloc(values->len + 1, sizeof(**map));
+ if (!*map) {
+ return -1;
+ }
+
+ for (i = 0, entry = *map; i < values->len;) {
+ const struct spdk_json_val *name = &values[i + 1];
+ const struct spdk_json_val *v = &values[i + 2];
+ /* Here we catch errors like invalid types. */
+ if (!(entry[0] = spdk_json_strdup(name)) ||
+ !(entry[1] = spdk_json_strdup(v))) {
+ bdev_rbd_free_config(*map);
+ *map = NULL;
+ return -1;
+ }
+ i += 1 + spdk_json_val_len(v);
+ entry += 2;
+ }
+
+ return 0;
+}
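+
+/*
+ * For example (illustrative), the JSON object {"mon_host": "10.0.0.1"}
+ * decodes into the NULL-terminated flat array {"mon_host", "10.0.0.1", NULL},
+ * i.e. alternating key/value strings.
+ */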
+
+static const struct spdk_json_object_decoder rpc_create_rbd_decoders[] = {
+ {"name", offsetof(struct rpc_create_rbd, name), spdk_json_decode_string, true},
+ {"user_id", offsetof(struct rpc_create_rbd, user_id), spdk_json_decode_string, true},
+ {"pool_name", offsetof(struct rpc_create_rbd, pool_name), spdk_json_decode_string},
+ {"rbd_name", offsetof(struct rpc_create_rbd, rbd_name), spdk_json_decode_string},
+ {"block_size", offsetof(struct rpc_create_rbd, block_size), spdk_json_decode_uint32},
+ {"config", offsetof(struct rpc_create_rbd, config), bdev_rbd_decode_config, true}
+};
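+
+/*
+ * Example request (values illustrative; "name", "user_id" and "config" are
+ * optional):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "bdev_rbd_create",
+ * "params": {"pool_name": "rbd", "rbd_name": "image1", "block_size": 512,
+ * "config": {"mon_host": "10.0.0.1"}}}
+ */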
+
+static void
+rpc_bdev_rbd_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_create_rbd req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_bdev *bdev;
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_create_rbd_decoders,
+ SPDK_COUNTOF(rpc_create_rbd_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV_RBD, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ rc = bdev_rbd_create(&bdev, req.name, req.user_id, req.pool_name,
+ (const char *const *)req.config,
+ req.rbd_name,
+ req.block_size);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, spdk_bdev_get_name(bdev));
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_create_rbd(&req);
+}
+SPDK_RPC_REGISTER("bdev_rbd_create", rpc_bdev_rbd_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_rbd_create, construct_rbd_bdev)
+
+struct rpc_bdev_rbd_delete {
+ char *name;
+};
+
+static void
+free_rpc_bdev_rbd_delete(struct rpc_bdev_rbd_delete *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_rbd_delete_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_rbd_delete, name), spdk_json_decode_string},
+};
+
+static void
+_rpc_bdev_rbd_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_rbd_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_rbd_delete req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_bdev_rbd_delete_decoders,
+ SPDK_COUNTOF(rpc_bdev_rbd_delete_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ bdev_rbd_delete(bdev, _rpc_bdev_rbd_delete_cb, request);
+
+cleanup:
+ free_rpc_bdev_rbd_delete(&req);
+}
+SPDK_RPC_REGISTER("bdev_rbd_delete", rpc_bdev_rbd_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_rbd_delete, delete_rbd_bdev)
+
+struct rpc_bdev_rbd_resize {
+ char *name;
+ uint64_t new_size;
+};
+
+static const struct spdk_json_object_decoder rpc_bdev_rbd_resize_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_rbd_resize, name), spdk_json_decode_string},
+ {"new_size", offsetof(struct rpc_bdev_rbd_resize, new_size), spdk_json_decode_uint64}
+};
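+
+/*
+ * Example request (values illustrative); "new_size" is the new bdev size
+ * in MiB:
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "bdev_rbd_resize",
+ * "params": {"name": "Ceph0", "new_size": 2048}}
+ */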
+
+static void
+free_rpc_bdev_rbd_resize(struct rpc_bdev_rbd_resize *req)
+{
+ free(req->name);
+}
+
+static void
+rpc_bdev_rbd_resize(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_rbd_resize req = {};
+ struct spdk_bdev *bdev;
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_bdev_rbd_resize_decoders,
+ SPDK_COUNTOF(rpc_bdev_rbd_resize_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ rc = bdev_rbd_resize(bdev, req.new_size);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+cleanup:
+ free_rpc_bdev_rbd_resize(&req);
+}
+SPDK_RPC_REGISTER("bdev_rbd_resize", rpc_bdev_rbd_resize, SPDK_RPC_RUNTIME)
diff --git a/src/spdk/module/bdev/rpc/Makefile b/src/spdk/module/bdev/rpc/Makefile
new file mode 100644
index 000000000..15de4fef9
--- /dev/null
+++ b/src/spdk/module/bdev/rpc/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev_rpc.c
+LIBNAME = bdev_rpc
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/rpc/bdev_rpc.c b/src/spdk/module/bdev/rpc/bdev_rpc.c
new file mode 100644
index 000000000..166ab1a42
--- /dev/null
+++ b/src/spdk/module/bdev/rpc/bdev_rpc.c
@@ -0,0 +1,676 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/env.h"
+#include "spdk/log.h"
+#include "spdk/rpc.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/histogram_data.h"
+#include "spdk/base64.h"
+
+#include "spdk/bdev_module.h"
+
+struct rpc_bdev_get_iostat_ctx {
+ int bdev_count;
+ struct spdk_jsonrpc_request *request;
+ struct spdk_json_write_ctx *w;
+};
+
+static void
+rpc_bdev_get_iostat_cb(struct spdk_bdev *bdev,
+ struct spdk_bdev_io_stat *stat, void *cb_arg, int rc)
+{
+ struct rpc_bdev_get_iostat_ctx *ctx = cb_arg;
+ struct spdk_json_write_ctx *w = ctx->w;
+ const char *bdev_name;
+
+ if (rc != 0) {
+ goto done;
+ }
+
+ bdev_name = spdk_bdev_get_name(bdev);
+ if (bdev_name != NULL) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "name", bdev_name);
+
+ spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read);
+
+ spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops);
+
+ spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written);
+
+ spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops);
+
+ spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped);
+
+ spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops);
+
+ spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks);
+
+ spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks);
+
+ spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks);
+
+ if (spdk_bdev_get_qd_sampling_period(bdev)) {
+ spdk_json_write_named_uint64(w, "queue_depth_polling_period",
+ spdk_bdev_get_qd_sampling_period(bdev));
+
+ spdk_json_write_named_uint64(w, "queue_depth", spdk_bdev_get_qd(bdev));
+
+ spdk_json_write_named_uint64(w, "io_time", spdk_bdev_get_io_time(bdev));
+
+ spdk_json_write_named_uint64(w, "weighted_io_time",
+ spdk_bdev_get_weighted_io_time(bdev));
+ }
+
+ spdk_json_write_object_end(w);
+ }
+
+done:
+ free(stat);
+ if (--ctx->bdev_count == 0) {
+ spdk_json_write_array_end(ctx->w);
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(ctx->request, ctx->w);
+ free(ctx);
+ }
+}
+
+struct rpc_bdev_get_iostat {
+ char *name;
+};
+
+static void
+free_rpc_bdev_get_iostat(struct rpc_bdev_get_iostat *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_get_iostat_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_get_iostat, name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_bdev_get_iostat(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_get_iostat req = {};
+ struct spdk_bdev *bdev = NULL;
+ struct spdk_json_write_ctx *w;
+ struct spdk_bdev_io_stat *stat;
+ struct rpc_bdev_get_iostat_ctx *ctx;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_bdev_get_iostat_decoders,
+ SPDK_COUNTOF(rpc_bdev_get_iostat_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ free_rpc_bdev_get_iostat(&req);
+ return;
+ }
+
+ if (req.name) {
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev '%s' does not exist\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ free_rpc_bdev_get_iostat(&req);
+ return;
+ }
+ }
+ }
+
+ free_rpc_bdev_get_iostat(&req);
+
+ ctx = calloc(1, sizeof(struct rpc_bdev_get_iostat_ctx));
+ if (ctx == NULL) {
+ SPDK_ERRLOG("Failed to allocate rpc_bdev_get_iostat_ctx struct\n");
+ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ /*
+ * Increment initial bdev_count so that it will never reach 0 in the middle
+ * of iterating.
+ */
+ ctx->bdev_count++;
+ ctx->request = request;
+ ctx->w = w;
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_uint64(w, "tick_rate", spdk_get_ticks_hz());
+ spdk_json_write_named_uint64(w, "ticks", spdk_get_ticks());
+
+ spdk_json_write_named_array_begin(w, "bdevs");
+
+ if (bdev != NULL) {
+ stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
+ if (stat == NULL) {
+ SPDK_ERRLOG("Failed to allocate rpc_bdev_get_iostat_ctx struct\n");
+ } else {
+ ctx->bdev_count++;
+ spdk_bdev_get_device_stat(bdev, stat, rpc_bdev_get_iostat_cb, ctx);
+ }
+ } else {
+ for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) {
+ stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
+ if (stat == NULL) {
+ SPDK_ERRLOG("Failed to allocate spdk_bdev_io_stat struct\n");
+ break;
+ }
+ ctx->bdev_count++;
+ spdk_bdev_get_device_stat(bdev, stat, rpc_bdev_get_iostat_cb, ctx);
+ }
+ }
+
+ if (--ctx->bdev_count == 0) {
+ spdk_json_write_array_end(w);
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+ free(ctx);
+ }
+}
+SPDK_RPC_REGISTER("bdev_get_iostat", rpc_bdev_get_iostat, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_get_iostat, get_bdevs_iostat)
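+
+/*
+ * Shape of the result (values illustrative): global tick counters plus one
+ * object per bdev with byte, op and latency-tick counters:
+ *
+ * {"tick_rate": 2100000000, "ticks": 123456789, "bdevs":
+ * [{"name": "Malloc0", "bytes_read": 4096, "num_read_ops": 1, ...}]}
+ */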
+
+static void
+rpc_dump_bdev_info(struct spdk_json_write_ctx *w,
+ struct spdk_bdev *bdev)
+{
+ struct spdk_bdev_alias *tmp;
+ uint64_t qos_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
+ int i;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(bdev));
+
+ spdk_json_write_named_array_begin(w, "aliases");
+
+ TAILQ_FOREACH(tmp, spdk_bdev_get_aliases(bdev), tailq) {
+ spdk_json_write_string(w, tmp->alias);
+ }
+
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_named_string(w, "product_name", spdk_bdev_get_product_name(bdev));
+
+ spdk_json_write_named_uint32(w, "block_size", spdk_bdev_get_block_size(bdev));
+
+ spdk_json_write_named_uint64(w, "num_blocks", spdk_bdev_get_num_blocks(bdev));
+
+ if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) {
+ char uuid_str[SPDK_UUID_STRING_LEN];
+
+ spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
+ spdk_json_write_named_string(w, "uuid", uuid_str);
+ }
+
+ if (spdk_bdev_get_md_size(bdev) != 0) {
+ spdk_json_write_named_uint32(w, "md_size", spdk_bdev_get_md_size(bdev));
+ spdk_json_write_named_bool(w, "md_interleave", spdk_bdev_is_md_interleaved(bdev));
+ spdk_json_write_named_uint32(w, "dif_type", spdk_bdev_get_dif_type(bdev));
+ if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
+ spdk_json_write_named_bool(w, "dif_is_head_of_md", spdk_bdev_is_dif_head_of_md(bdev));
+ spdk_json_write_named_object_begin(w, "enabled_dif_check_types");
+ spdk_json_write_named_bool(w, "reftag",
+ spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG));
+ spdk_json_write_named_bool(w, "apptag",
+ spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_APPTAG));
+ spdk_json_write_named_bool(w, "guard",
+ spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD));
+ spdk_json_write_object_end(w);
+ }
+ }
+
+ spdk_json_write_named_object_begin(w, "assigned_rate_limits");
+ spdk_bdev_get_qos_rate_limits(bdev, qos_limits);
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ spdk_json_write_named_uint64(w, spdk_bdev_get_qos_rpc_type(i), qos_limits[i]);
+ }
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_bool(w, "claimed", (bdev->internal.claim_module != NULL));
+
+ spdk_json_write_named_bool(w, "zoned", bdev->zoned);
+ if (bdev->zoned) {
+ spdk_json_write_named_uint64(w, "zone_size", bdev->zone_size);
+ spdk_json_write_named_uint64(w, "max_open_zones", bdev->max_open_zones);
+ spdk_json_write_named_uint64(w, "optimal_open_zones", bdev->optimal_open_zones);
+ }
+
+ spdk_json_write_named_object_begin(w, "supported_io_types");
+ spdk_json_write_named_bool(w, "read",
+ spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ));
+ spdk_json_write_named_bool(w, "write",
+ spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE));
+ spdk_json_write_named_bool(w, "unmap",
+ spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP));
+ spdk_json_write_named_bool(w, "write_zeroes",
+ spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES));
+ spdk_json_write_named_bool(w, "flush",
+ spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH));
+ spdk_json_write_named_bool(w, "reset",
+ spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_RESET));
+ spdk_json_write_named_bool(w, "nvme_admin",
+ spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN));
+ spdk_json_write_named_bool(w, "nvme_io",
+ spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_named_object_begin(w, "driver_specific");
+ spdk_bdev_dump_info_json(bdev, w);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+struct rpc_bdev_get_bdevs {
+ char *name;
+};
+
+static void
+free_rpc_bdev_get_bdevs(struct rpc_bdev_get_bdevs *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_get_bdevs_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_get_bdevs, name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_bdev_get_bdevs(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_get_bdevs req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_bdev *bdev = NULL;
+
+ if (params && spdk_json_decode_object(params, rpc_bdev_get_bdevs_decoders,
+ SPDK_COUNTOF(rpc_bdev_get_bdevs_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ free_rpc_bdev_get_bdevs(&req);
+ return;
+ }
+
+ if (req.name) {
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev '%s' does not exist\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ free_rpc_bdev_get_bdevs(&req);
+ return;
+ }
+ }
+
+ free_rpc_bdev_get_bdevs(&req);
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ if (bdev != NULL) {
+ rpc_dump_bdev_info(w, bdev);
+ } else {
+ for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) {
+ rpc_dump_bdev_info(w, bdev);
+ }
+ }
+
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("bdev_get_bdevs", rpc_bdev_get_bdevs, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_get_bdevs, get_bdevs)
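+
+/*
+ * Example request (values illustrative); "name" is optional and, when
+ * omitted, all registered bdevs are dumped:
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "bdev_get_bdevs",
+ * "params": {"name": "Malloc0"}}
+ */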
+
+struct rpc_bdev_set_qd_sampling_period {
+ char *name;
+ uint64_t period;
+};
+
+static void
+free_rpc_bdev_set_qd_sampling_period(struct rpc_bdev_set_qd_sampling_period *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder
+ rpc_bdev_set_qd_sampling_period_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_set_qd_sampling_period, name), spdk_json_decode_string},
+ {"period", offsetof(struct rpc_bdev_set_qd_sampling_period, period), spdk_json_decode_uint64},
+};
+
+static void
+rpc_bdev_set_qd_sampling_period(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_set_qd_sampling_period req = {0};
+ struct spdk_bdev *bdev;
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_bdev_set_qd_sampling_period_decoders,
+ SPDK_COUNTOF(rpc_bdev_set_qd_sampling_period_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev '%s' does not exist\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_bdev_set_qd_sampling_period(bdev, req.period);
+
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_bdev_set_qd_sampling_period(&req);
+}
+SPDK_RPC_REGISTER("bdev_set_qd_sampling_period",
+ rpc_bdev_set_qd_sampling_period,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_set_qd_sampling_period,
+ set_bdev_qd_sampling_period)
+
+struct rpc_bdev_set_qos_limit {
+ char *name;
+ uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
+};
+
+static void
+free_rpc_bdev_set_qos_limit(struct rpc_bdev_set_qos_limit *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_set_qos_limit_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_set_qos_limit, name), spdk_json_decode_string},
+ {
+ "rw_ios_per_sec", offsetof(struct rpc_bdev_set_qos_limit,
+ limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT]),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "rw_mbytes_per_sec", offsetof(struct rpc_bdev_set_qos_limit,
+ limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT]),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "r_mbytes_per_sec", offsetof(struct rpc_bdev_set_qos_limit,
+ limits[SPDK_BDEV_QOS_R_BPS_RATE_LIMIT]),
+ spdk_json_decode_uint64, true
+ },
+ {
+ "w_mbytes_per_sec", offsetof(struct rpc_bdev_set_qos_limit,
+ limits[SPDK_BDEV_QOS_W_BPS_RATE_LIMIT]),
+ spdk_json_decode_uint64, true
+ },
+};
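+
+/*
+ * Example request (values illustrative); any subset of the four limits may
+ * be supplied, but at least one is required:
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "bdev_set_qos_limit",
+ * "params": {"name": "Malloc0", "rw_ios_per_sec": 20000}}
+ */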
+
+static void
+rpc_bdev_set_qos_limit_complete(void *cb_arg, int status)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ if (status != 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Failed to configure rate limit: %s",
+ spdk_strerror(-status));
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_set_qos_limit(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_set_qos_limit req = {NULL, {UINT64_MAX, UINT64_MAX, UINT64_MAX, UINT64_MAX}};
+ struct spdk_bdev *bdev;
+ int i;
+
+ if (spdk_json_decode_object(params, rpc_bdev_set_qos_limit_decoders,
+ SPDK_COUNTOF(rpc_bdev_set_qos_limit_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev '%s' does not exist\n", req.name);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (req.limits[i] != UINT64_MAX) {
+ break;
+ }
+ }
+ if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) {
+ SPDK_ERRLOG("no rate limits specified\n");
+ spdk_jsonrpc_send_error_response(request, -EINVAL, "No rate limits specified");
+ goto cleanup;
+ }
+
+ spdk_bdev_set_qos_rate_limits(bdev, req.limits, rpc_bdev_set_qos_limit_complete, request);
+
+cleanup:
+ free_rpc_bdev_set_qos_limit(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_set_qos_limit", rpc_bdev_set_qos_limit, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_set_qos_limit, set_bdev_qos_limit)
+
+/* SPDK_RPC_ENABLE_BDEV_HISTOGRAM */
+
+struct rpc_bdev_enable_histogram_request {
+ char *name;
+ bool enable;
+};
+
+static void
+free_rpc_bdev_enable_histogram_request(struct rpc_bdev_enable_histogram_request *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_bdev_enable_histogram_request_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_enable_histogram_request, name), spdk_json_decode_string},
+ {"enable", offsetof(struct rpc_bdev_enable_histogram_request, enable), spdk_json_decode_bool},
+};
+
+static void
+bdev_histogram_status_cb(void *cb_arg, int status)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, status == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_enable_histogram(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_enable_histogram_request req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_bdev_enable_histogram_request_decoders,
+ SPDK_COUNTOF(rpc_bdev_enable_histogram_request_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ spdk_bdev_histogram_enable(bdev, bdev_histogram_status_cb, request, req.enable);
+
+cleanup:
+ free_rpc_bdev_enable_histogram_request(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_enable_histogram", rpc_bdev_enable_histogram, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_enable_histogram, enable_bdev_histogram)
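+
+/*
+ * Example request (values illustrative):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "bdev_enable_histogram",
+ * "params": {"name": "Malloc0", "enable": true}}
+ */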
+
+/* SPDK_RPC_GET_BDEV_HISTOGRAM */
+
+struct rpc_bdev_get_histogram_request {
+ char *name;
+};
+
+static const struct spdk_json_object_decoder rpc_bdev_get_histogram_request_decoders[] = {
+ {"name", offsetof(struct rpc_bdev_get_histogram_request, name), spdk_json_decode_string}
+};
+
+static void
+free_rpc_bdev_get_histogram_request(struct rpc_bdev_get_histogram_request *r)
+{
+ free(r->name);
+}
+
+static void
+_rpc_bdev_histogram_data_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+ int rc;
+ char *encoded_histogram;
+ size_t src_len, dst_len;
+
+ if (status != 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(-status));
+ goto invalid;
+ }
+
+ src_len = SPDK_HISTOGRAM_NUM_BUCKETS(histogram) * sizeof(uint64_t);
+ dst_len = spdk_base64_get_encoded_strlen(src_len) + 1;
+
+ encoded_histogram = malloc(dst_len);
+ if (encoded_histogram == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(ENOMEM));
+ goto invalid;
+ }
+
+ rc = spdk_base64_encode(encoded_histogram, histogram->bucket, src_len);
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(-rc));
+ goto free_encoded_histogram;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "histogram", encoded_histogram);
+ spdk_json_write_named_int64(w, "bucket_shift", histogram->bucket_shift);
+ spdk_json_write_named_int64(w, "tsc_rate", spdk_get_ticks_hz());
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+
+free_encoded_histogram:
+ free(encoded_histogram);
+invalid:
+ spdk_histogram_data_free(histogram);
+}
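+
+/*
+ * Shape of the result (values illustrative): the raw uint64_t bucket array
+ * is base64-encoded, with the shift and TSC rate a client needs to convert
+ * bucket indices back into time:
+ *
+ * {"histogram": "AAAA...", "bucket_shift": 7, "tsc_rate": 2100000000}
+ */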
+
+static void
+rpc_bdev_get_histogram(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_get_histogram_request req = {NULL};
+ struct spdk_histogram_data *histogram;
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_bdev_get_histogram_request_decoders,
+ SPDK_COUNTOF(rpc_bdev_get_histogram_request_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ histogram = spdk_histogram_data_alloc();
+ if (histogram == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
+ goto cleanup;
+ }
+
+ spdk_bdev_histogram_get(bdev, histogram, _rpc_bdev_histogram_data_cb, request);
+
+cleanup:
+ free_rpc_bdev_get_histogram_request(&req);
+}
+
+SPDK_RPC_REGISTER("bdev_get_histogram", rpc_bdev_get_histogram, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_get_histogram, get_bdev_histogram)
diff --git a/src/spdk/module/bdev/split/Makefile b/src/spdk/module/bdev/split/Makefile
new file mode 100644
index 000000000..830224c62
--- /dev/null
+++ b/src/spdk/module/bdev/split/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = vbdev_split.c vbdev_split_rpc.c
+LIBNAME = bdev_split
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/split/vbdev_split.c b/src/spdk/module/bdev/split/vbdev_split.c
new file mode 100644
index 000000000..fd175d339
--- /dev/null
+++ b/src/spdk/module/bdev/split/vbdev_split.c
@@ -0,0 +1,582 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is a simple example of a virtual block device that takes a single
+ * bdev and slices it into multiple smaller bdevs.
+ */
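+
+/*
+ * For example (illustrative), splitting a 1 GiB base bdev "Nvme0n1" into
+ * split_count = 4 parts with no explicit split size yields four 256 MiB
+ * bdevs named "Nvme0n1p0" .. "Nvme0n1p3" at consecutive offsets.
+ */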
+
+#include "vbdev_split.h"
+
+#include "spdk/rpc.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/util.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
+struct spdk_vbdev_split_config {
+ char *base_bdev;
+ unsigned split_count;
+ uint64_t split_size_mb;
+
+ SPDK_BDEV_PART_TAILQ splits;
+ struct spdk_bdev_part_base *split_base;
+
+ TAILQ_ENTRY(spdk_vbdev_split_config) tailq;
+};
+
+static TAILQ_HEAD(, spdk_vbdev_split_config) g_split_config = TAILQ_HEAD_INITIALIZER(
+ g_split_config);
+
+struct vbdev_split_channel {
+ struct spdk_bdev_part_channel part_ch;
+};
+
+struct vbdev_split_bdev_io {
+ struct spdk_io_channel *ch;
+ struct spdk_bdev_io *bdev_io;
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+};
+
+static void vbdev_split_del_config(struct spdk_vbdev_split_config *cfg);
+
+static int vbdev_split_init(void);
+static void vbdev_split_fini(void);
+static void vbdev_split_examine(struct spdk_bdev *bdev);
+static int vbdev_split_config_json(struct spdk_json_write_ctx *w);
+static int vbdev_split_get_ctx_size(void);
+
+static void
+_vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io);
+
+static struct spdk_bdev_module split_if = {
+ .name = "split",
+ .module_init = vbdev_split_init,
+ .module_fini = vbdev_split_fini,
+ .get_ctx_size = vbdev_split_get_ctx_size,
+ .examine_config = vbdev_split_examine,
+ .config_json = vbdev_split_config_json,
+};
+
+SPDK_BDEV_MODULE_REGISTER(split, &split_if)
+
+static void
+vbdev_split_base_free(void *ctx)
+{
+ struct spdk_vbdev_split_config *cfg = ctx;
+
+ vbdev_split_del_config(cfg);
+}
+
+static int
+_vbdev_split_destruct(void *ctx)
+{
+ struct spdk_bdev_part *part = ctx;
+
+ return spdk_bdev_part_free(part);
+}
+
+static void
+vbdev_split_base_bdev_hotremove_cb(void *_part_base)
+{
+ struct spdk_bdev_part_base *part_base = _part_base;
+ struct spdk_vbdev_split_config *cfg = spdk_bdev_part_base_get_ctx(part_base);
+
+ spdk_bdev_part_base_hotremove(part_base, &cfg->splits);
+}
+
+static void
+vbdev_split_resubmit_io(void *arg)
+{
+ struct vbdev_split_bdev_io *split_io = (struct vbdev_split_bdev_io *)arg;
+
+ _vbdev_split_submit_request(split_io->ch, split_io->bdev_io);
+}
+
+static void
+vbdev_split_queue_io(struct vbdev_split_bdev_io *split_io)
+{
+ struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(split_io->ch);
+ int rc;
+
+ split_io->bdev_io_wait.bdev = split_io->bdev_io->bdev;
+ split_io->bdev_io_wait.cb_fn = vbdev_split_resubmit_io;
+ split_io->bdev_io_wait.cb_arg = split_io;
+
+ rc = spdk_bdev_queue_io_wait(split_io->bdev_io->bdev,
+ ch->part_ch.base_ch, &split_io->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vbdev_split_queue_io, rc=%d\n", rc);
+ spdk_bdev_io_complete(split_io->bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+_vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
+{
+ struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct vbdev_split_bdev_io *io_ctx = (struct vbdev_split_bdev_io *)bdev_io->driver_ctx;
+ int rc;
+
+ rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "split: no memory, queue io.\n");
+ io_ctx->ch = _ch;
+ io_ctx->bdev_io = bdev_io;
+ vbdev_split_queue_io(io_ctx);
+ } else {
+ SPDK_ERRLOG("split: error on io submission, rc=%d.\n", rc);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+static void
+vbdev_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ _vbdev_split_submit_request(ch, bdev_io);
+}
+
+static void
+vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, vbdev_split_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ break;
+ default:
+ _vbdev_split_submit_request(_ch, bdev_io);
+ break;
+ }
+}
+
+static int
+vbdev_split_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct spdk_bdev_part *part = ctx;
+ struct spdk_bdev *split_base_bdev = spdk_bdev_part_get_base_bdev(part);
+ uint64_t offset_blocks = spdk_bdev_part_get_offset_blocks(part);
+
+ spdk_json_write_named_object_begin(w, "split");
+
+ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(split_base_bdev));
+ spdk_json_write_named_uint64(w, "offset_blocks", offset_blocks);
+
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static void
+vbdev_split_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ /* No config per bdev needed */
+}
+
+static struct spdk_bdev_fn_table vbdev_split_fn_table = {
+ .destruct = _vbdev_split_destruct,
+ .submit_request = vbdev_split_submit_request,
+ .dump_info_json = vbdev_split_dump_info_json,
+ .write_config_json = vbdev_split_write_config_json
+};
+
+static int
+vbdev_split_create(struct spdk_vbdev_split_config *cfg)
+{
+ uint64_t split_size_blocks, offset_blocks;
+ uint64_t split_count, max_split_count;
+ uint64_t mb = 1024 * 1024;
+ uint64_t i;
+ int rc;
+ char *name;
+ struct spdk_bdev *base_bdev;
+ struct bdev_part_tailq *split_base_tailq;
+
+ assert(cfg->split_count > 0);
+
+ base_bdev = spdk_bdev_get_by_name(cfg->base_bdev);
+ if (!base_bdev) {
+ return -ENODEV;
+ }
+
+ if (cfg->split_size_mb) {
+ if (((cfg->split_size_mb * mb) % base_bdev->blocklen) != 0) {
+ SPDK_ERRLOG("Split size %" PRIu64 " MB is not possible with block size "
+ "%" PRIu32 "\n",
+ cfg->split_size_mb, base_bdev->blocklen);
+ return -EINVAL;
+ }
+ split_size_blocks = (cfg->split_size_mb * mb) / base_bdev->blocklen;
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "Split size %" PRIu64 " MB specified by user\n",
+ cfg->split_size_mb);
+ } else {
+ split_size_blocks = base_bdev->blockcnt / cfg->split_count;
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "Split size not specified by user\n");
+ }
+
+ max_split_count = base_bdev->blockcnt / split_size_blocks;
+ split_count = cfg->split_count;
+ if (split_count > max_split_count) {
+ SPDK_WARNLOG("Split count %" PRIu64 " is greater than maximum possible split count "
+ "%" PRIu64 " - clamping\n", split_count, max_split_count);
+ split_count = max_split_count;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "base_bdev: %s split_count: %" PRIu64
+ " split_size_blocks: %" PRIu64 "\n",
+ spdk_bdev_get_name(base_bdev), split_count, split_size_blocks);
+
+ TAILQ_INIT(&cfg->splits);
+ cfg->split_base = spdk_bdev_part_base_construct(base_bdev,
+ vbdev_split_base_bdev_hotremove_cb,
+ &split_if, &vbdev_split_fn_table,
+ &cfg->splits, vbdev_split_base_free, cfg,
+ sizeof(struct vbdev_split_channel), NULL, NULL);
+ if (!cfg->split_base) {
+ SPDK_ERRLOG("Cannot construct bdev part base\n");
+ return -ENOMEM;
+ }
+
+ offset_blocks = 0;
+ for (i = 0; i < split_count; i++) {
+ struct spdk_bdev_part *d;
+
+ d = calloc(1, sizeof(*d));
+ if (d == NULL) {
+ SPDK_ERRLOG("could not allocate bdev part\n");
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ name = spdk_sprintf_alloc("%sp%" PRIu64, cfg->base_bdev, i);
+ if (!name) {
+ SPDK_ERRLOG("could not allocate name\n");
+ free(d);
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ rc = spdk_bdev_part_construct(d, cfg->split_base, name, offset_blocks, split_size_blocks,
+ "Split Disk");
+ free(name);
+ if (rc) {
+ SPDK_ERRLOG("could not construct bdev part\n");
+ /* 'name' was already freed above, regardless of rc */
+ free(d);
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ offset_blocks += split_size_blocks;
+ }
+
+ return 0;
+err:
+ split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base);
+ spdk_bdev_part_base_hotremove(cfg->split_base, split_base_tailq);
+ return rc;
+}
+
+static void
+vbdev_split_del_config(struct spdk_vbdev_split_config *cfg)
+{
+ TAILQ_REMOVE(&g_split_config, cfg, tailq);
+ free(cfg->base_bdev);
+ free(cfg);
+}
+
+static void
+vbdev_split_destruct_config(struct spdk_vbdev_split_config *cfg)
+{
+ struct bdev_part_tailq *split_base_tailq;
+
+ if (cfg->split_base != NULL) {
+ split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base);
+ spdk_bdev_part_base_hotremove(cfg->split_base, split_base_tailq);
+ } else {
+ vbdev_split_del_config(cfg);
+ }
+}
+
+static void
+vbdev_split_clear_config(void)
+{
+ struct spdk_vbdev_split_config *cfg, *tmp_cfg;
+
+ TAILQ_FOREACH_SAFE(cfg, &g_split_config, tailq, tmp_cfg) {
+ vbdev_split_destruct_config(cfg);
+ }
+}
+
+static struct spdk_vbdev_split_config *
+vbdev_split_config_find_by_base_name(const char *base_bdev_name)
+{
+ struct spdk_vbdev_split_config *cfg;
+
+ TAILQ_FOREACH(cfg, &g_split_config, tailq) {
+ if (strcmp(cfg->base_bdev, base_bdev_name) == 0) {
+ return cfg;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+vbdev_split_add_config(const char *base_bdev_name, unsigned split_count, uint64_t split_size,
+ struct spdk_vbdev_split_config **config)
+{
+ struct spdk_vbdev_split_config *cfg;
+ assert(base_bdev_name);
+
+ if (base_bdev_name == NULL) {
+ SPDK_ERRLOG("Split bdev config: no base bdev provided.");
+ return -EINVAL;
+ }
+
+ if (split_count == 0) {
+ SPDK_ERRLOG("Split bdev config: split_count can't be 0.");
+ return -EINVAL;
+ }
+
+ /* Check if we already have 'base_bdev_name' registered in config */
+ cfg = vbdev_split_config_find_by_base_name(base_bdev_name);
+ if (cfg) {
+ SPDK_ERRLOG("Split bdev config for base bdev '%s' already exist.", base_bdev_name);
+ return -EEXIST;
+ }
+
+ cfg = calloc(1, sizeof(*cfg));
+ if (!cfg) {
+ SPDK_ERRLOG("calloc(): Out of memory");
+ return -ENOMEM;
+ }
+
+ cfg->base_bdev = strdup(base_bdev_name);
+ if (!cfg->base_bdev) {
+ SPDK_ERRLOG("strdup(): Out of memory");
+ free(cfg);
+ return -ENOMEM;
+ }
+
+ cfg->split_count = split_count;
+ cfg->split_size_mb = split_size;
+ TAILQ_INSERT_TAIL(&g_split_config, cfg, tailq);
+ if (config) {
+ *config = cfg;
+ }
+
+ return 0;
+}
+
+static int
+vbdev_split_init(void)
+{
+ struct spdk_conf_section *sp;
+ const char *base_bdev_name;
+ const char *split_count_str;
+ const char *split_size_str;
+ int rc, i, split_count, split_size;
+
+ sp = spdk_conf_find_section(NULL, "Split");
+ if (sp == NULL) {
+ return 0;
+ }
+
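+ /*
+ * Example (illustrative) of the legacy config section parsed below; each
+ * "Split" line gives a base bdev, a split count and an optional per-split
+ * size in MB:
+ *
+ * [Split]
+ * Split Nvme0n1 4
+ * Split Malloc0 2 64
+ */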
+ for (i = 0; ; i++) {
+ if (!spdk_conf_section_get_nval(sp, "Split", i)) {
+ break;
+ }
+
+ base_bdev_name = spdk_conf_section_get_nmval(sp, "Split", i, 0);
+ if (!base_bdev_name) {
+ SPDK_ERRLOG("Split configuration missing bdev name\n");
+ rc = -EINVAL;
+ goto err;
+ }
+
+ split_count_str = spdk_conf_section_get_nmval(sp, "Split", i, 1);
+ if (!split_count_str) {
+ SPDK_ERRLOG("Split configuration missing split count\n");
+ rc = -EINVAL;
+ goto err;
+ }
+
+ split_count = spdk_strtol(split_count_str, 10);
+ if (split_count < 1) {
+ SPDK_ERRLOG("Invalid Split count %d\n", split_count);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ /* Optional split size in MB */
+ split_size = 0;
+ split_size_str = spdk_conf_section_get_nmval(sp, "Split", i, 2);
+ if (split_size_str) {
+ split_size = spdk_strtol(split_size_str, 10);
+ if (split_size <= 0) {
+ SPDK_ERRLOG("Invalid Split size %d\n", split_size);
+ rc = -EINVAL;
+ goto err;
+ }
+ }
+
+ rc = vbdev_split_add_config(base_bdev_name, split_count, split_size, NULL);
+ if (rc != 0) {
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ vbdev_split_clear_config();
+ return rc;
+}
+
+static void
+vbdev_split_fini(void)
+{
+ vbdev_split_clear_config();
+}
+
+static void
+vbdev_split_examine(struct spdk_bdev *bdev)
+{
+ struct spdk_vbdev_split_config *cfg = vbdev_split_config_find_by_base_name(bdev->name);
+
+ if (cfg != NULL) {
+ assert(cfg->split_base == NULL);
+
+ if (vbdev_split_create(cfg)) {
+ SPDK_ERRLOG("could not split bdev %s\n", bdev->name);
+ }
+ }
+ spdk_bdev_module_examine_done(&split_if);
+}
+
+static int
+vbdev_split_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_vbdev_split_config *cfg;
+
+ TAILQ_FOREACH(cfg, &g_split_config, tailq) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_split_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "base_bdev", cfg->base_bdev);
+ spdk_json_write_named_uint32(w, "split_count", cfg->split_count);
+ spdk_json_write_named_uint64(w, "split_size_mb", cfg->split_size_mb);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+
+ return 0;
+}
+
+int
+create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb)
+{
+ int rc;
+ struct spdk_vbdev_split_config *cfg;
+
+ rc = vbdev_split_add_config(base_bdev_name, split_count, split_size_mb, &cfg);
+ if (rc) {
+ return rc;
+ }
+
+ rc = vbdev_split_create(cfg);
+ if (rc == -ENODEV) {
+ /* It is ok if base bdev does not exist yet. */
+ rc = 0;
+ }
+
+ return rc;
+}
+
+int
+vbdev_split_destruct(const char *base_bdev_name)
+{
+ struct spdk_vbdev_split_config *cfg = vbdev_split_config_find_by_base_name(base_bdev_name);
+
+ if (!cfg) {
+ SPDK_ERRLOG("Split configuration for '%s' not found\n", base_bdev_name);
+ return -ENOENT;
+ }
+
+ vbdev_split_destruct_config(cfg);
+ return 0;
+}
+
+struct spdk_bdev_part_base *
+vbdev_split_get_part_base(struct spdk_bdev *bdev)
+{
+ struct spdk_vbdev_split_config *cfg;
+
+ cfg = vbdev_split_config_find_by_base_name(spdk_bdev_get_name(bdev));
+
+ if (cfg == NULL) {
+ return NULL;
+ }
+
+ return cfg->split_base;
+}
+
+/*
+ * During init we'll be asked how much memory we'd like passed to us
+ * in bdev_io structures as context. Here's where we specify how
+ * much context we want per IO.
+ */
+static int
+vbdev_split_get_ctx_size(void)
+{
+ return sizeof(struct vbdev_split_bdev_io);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_split", SPDK_LOG_VBDEV_SPLIT)
diff --git a/src/spdk/module/bdev/split/vbdev_split.h b/src/spdk/module/bdev/split/vbdev_split.h
new file mode 100644
index 000000000..f468f2414
--- /dev/null
+++ b/src/spdk/module/bdev/split/vbdev_split.h
@@ -0,0 +1,68 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_SPLIT_H
+#define SPDK_VBDEV_SPLIT_H
+
+#include "spdk/bdev_module.h"
+
+/**
+ * Add the given disk name to the split config. If a bdev named \c base_bdev_name
+ * already exists, the split bdevs are created right away; otherwise they are
+ * created when the base bdev becomes available (during the examination process).
+ *
+ * \param base_bdev_name Base bdev name
+ * \param split_count Number of splits to be created.
+ * \param split_size_mb Size of each split bdev in MB. If 0, use base bdev size / split_count.
+ * \return 0 on success. Negative errno code on error.
+ */
+int create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb);
+
+/**
+ * Remove all created split bdevs and split config.
+ *
+ * \param base_bdev_name base bdev name
+ * \return 0 on success or negative errno value.
+ */
+int vbdev_split_destruct(const char *base_bdev_name);
+
+/**
+ * Get the spdk_bdev_part_base associated with the given split base_bdev.
+ *
+ * \param base_bdev Bdev to get the part_base from
+ * \return pointer to the associated spdk_bdev_part_base
+ * \return NULL if the base_bdev is not being split by the split module
+ */
+struct spdk_bdev_part_base *vbdev_split_get_part_base(struct spdk_bdev *base_bdev);
+
+#endif /* SPDK_VBDEV_SPLIT_H */
diff --git a/src/spdk/module/bdev/split/vbdev_split_rpc.c b/src/spdk/module/bdev/split/vbdev_split_rpc.c
new file mode 100644
index 000000000..a8c6f3be0
--- /dev/null
+++ b/src/spdk/module/bdev/split/vbdev_split_rpc.c
@@ -0,0 +1,145 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+
+#include "vbdev_split.h"
+#include "spdk_internal/log.h"
+
+struct rpc_construct_split {
+ char *base_bdev;
+ uint32_t split_count;
+ uint64_t split_size_mb;
+};
+
+static const struct spdk_json_object_decoder rpc_construct_split_decoders[] = {
+ {"base_bdev", offsetof(struct rpc_construct_split, base_bdev), spdk_json_decode_string},
+ {"split_count", offsetof(struct rpc_construct_split, split_count), spdk_json_decode_uint32},
+ {"split_size_mb", offsetof(struct rpc_construct_split, split_size_mb), spdk_json_decode_uint64, true},
+};
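+
+/*
+ * Example request (values illustrative); "split_size_mb" is optional and
+ * defaults to base bdev size / split_count:
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "bdev_split_create",
+ * "params": {"base_bdev": "Nvme0n1", "split_count": 4}}
+ */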
+
+static void
+rpc_bdev_split_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_split req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_bdev *base_bdev;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_split_decoders,
+ SPDK_COUNTOF(rpc_construct_split_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ rc = create_vbdev_split(req.base_bdev, req.split_count, req.split_size_mb);
+ if (rc < 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Failed to create %"PRIu32" split bdevs from '%s': %s",
+ req.split_count, req.base_bdev, spdk_strerror(-rc));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ base_bdev = spdk_bdev_get_by_name(req.base_bdev);
+ if (base_bdev != NULL) {
+ struct spdk_bdev_part_base *split_base;
+ struct bdev_part_tailq *split_base_tailq;
+ struct spdk_bdev_part *split_part;
+ struct spdk_bdev *split_bdev;
+
+ split_base = vbdev_split_get_part_base(base_bdev);
+
+ assert(split_base != NULL);
+
+ split_base_tailq = spdk_bdev_part_base_get_tailq(split_base);
+ TAILQ_FOREACH(split_part, split_base_tailq, tailq) {
+ split_bdev = spdk_bdev_part_get_bdev(split_part);
+ spdk_json_write_string(w, spdk_bdev_get_name(split_bdev));
+ }
+ }
+
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+
+out:
+ free(req.base_bdev);
+}
+SPDK_RPC_REGISTER("bdev_split_create", rpc_bdev_split_create, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_split_create, construct_split_vbdev)
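+
+/*
+ * Illustrative JSON-RPC request (parameter values are hypothetical):
+ *
+ * {"jsonrpc": "2.0", "method": "bdev_split_create", "id": 1,
+ * "params": {"base_bdev": "Nvme0n1", "split_count": 2}}
+ *
+ * On success the result is an array with the names of the created split bdevs.
+ */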
+
+struct rpc_delete_split {
+ char *base_bdev;
+};
+
+static const struct spdk_json_object_decoder rpc_delete_split_decoders[] = {
+ {"base_bdev", offsetof(struct rpc_delete_split, base_bdev), spdk_json_decode_string},
+};
+
+static void
+rpc_bdev_split_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_split req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_delete_split_decoders,
+ SPDK_COUNTOF(rpc_delete_split_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto out;
+ }
+
+ rc = vbdev_split_destruct(req.base_bdev);
+ if (rc < 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc));
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+out:
+ free(req.base_bdev);
+}
+SPDK_RPC_REGISTER("bdev_split_delete", rpc_bdev_split_delete, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_split_delete, destruct_split_vbdev)
diff --git a/src/spdk/module/bdev/uring/Makefile b/src/spdk/module/bdev/uring/Makefile
new file mode 100644
index 000000000..2a97f1564
--- /dev/null
+++ b/src/spdk/module/bdev/uring/Makefile
@@ -0,0 +1,51 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev_uring.c bdev_uring_rpc.c
+LIBNAME = bdev_uring
+LOCAL_SYS_LIBS = -luring
+
+ifneq ($(strip $(CONFIG_URING_PATH)),)
+CFLAGS += -I$(CONFIG_URING_PATH)
+LDFLAGS += -L$(CONFIG_URING_PATH)
+endif
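+
+# Note: CONFIG_URING_PATH is normally set by the top-level configure script
+# when liburing is not installed system-wide, e.g. (illustrative):
+# ./configure --with-uring=/path/to/liburing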
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/uring/bdev_uring.c b/src/spdk/module/bdev/uring/bdev_uring.c
new file mode 100644
index 000000000..494cc4794
--- /dev/null
+++ b/src/spdk/module/bdev/uring/bdev_uring.c
@@ -0,0 +1,676 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_uring.h"
+
+#include "spdk/stdinc.h"
+
+#include "spdk/barrier.h"
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+#include "spdk/env.h"
+#include "spdk/fd.h"
+#include "spdk/likely.h"
+#include "spdk/thread.h"
+#include "spdk/json.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+#include "spdk_internal/uring.h"
+
+struct bdev_uring_io_channel {
+ struct bdev_uring_group_channel *group_ch;
+};
+
+struct bdev_uring_group_channel {
+ uint64_t io_inflight;
+ uint64_t io_pending;
+ struct spdk_poller *poller;
+ struct io_uring uring;
+};
+
+struct bdev_uring_task {
+ uint64_t len;
+ struct bdev_uring_io_channel *ch;
+ TAILQ_ENTRY(bdev_uring_task) link;
+};
+
+struct bdev_uring {
+ struct spdk_bdev bdev;
+ char *filename;
+ int fd;
+ TAILQ_ENTRY(bdev_uring) link;
+};
+
+static int bdev_uring_init(void);
+static void bdev_uring_fini(void);
+static void uring_free_bdev(struct bdev_uring *uring);
+static void bdev_uring_get_spdk_running_config(FILE *fp);
+static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head;
+
+#define SPDK_URING_QUEUE_DEPTH 512
+#define MAX_EVENTS_PER_POLL 32
+
+static int
+bdev_uring_get_ctx_size(void)
+{
+ return sizeof(struct bdev_uring_task);
+}
+
+static struct spdk_bdev_module uring_if = {
+ .name = "uring",
+ .module_init = bdev_uring_init,
+ .module_fini = bdev_uring_fini,
+ .config_text = bdev_uring_get_spdk_running_config,
+ .get_ctx_size = bdev_uring_get_ctx_size,
+};
+
+SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
+
+static int
+bdev_uring_open(struct bdev_uring *bdev)
+{
+ int fd;
+
+ fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
+ if (fd < 0) {
+ /* Try without O_DIRECT for non-disk files */
+ fd = open(bdev->filename, O_RDWR | O_NOATIME);
+ if (fd < 0) {
+ SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
+ bdev->filename, errno, spdk_strerror(errno));
+ bdev->fd = -1;
+ return -1;
+ }
+ }
+
+ bdev->fd = fd;
+
+ return 0;
+}
+
+static int
+bdev_uring_close(struct bdev_uring *bdev)
+{
+ int rc;
+
+ if (bdev->fd == -1) {
+ return 0;
+ }
+
+ rc = close(bdev->fd);
+ if (rc < 0) {
+ SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
+ bdev->fd, errno, spdk_strerror(errno));
+ return -1;
+ }
+
+ bdev->fd = -1;
+
+ return 0;
+}
+
+static int64_t
+bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
+ struct bdev_uring_task *uring_task,
+ struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
+{
+ struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
+ struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
+ struct io_uring_sqe *sqe;
+
+ sqe = io_uring_get_sqe(&group_ch->uring);
+ io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
+ io_uring_sqe_set_data(sqe, uring_task);
+ uring_task->len = nbytes;
+ uring_task->ch = uring_ch;
+
+ SPDK_DEBUGLOG(SPDK_LOG_URING, "read %d iovs size %lu to off: %#lx\n",
+ iovcnt, nbytes, offset);
+
+ group_ch->io_pending++;
+ return nbytes;
+}
+
+static int64_t
+bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
+ struct bdev_uring_task *uring_task,
+ struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
+{
+ struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
+ struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
+ struct io_uring_sqe *sqe;
+
+ sqe = io_uring_get_sqe(&group_ch->uring);
+ io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
+ io_uring_sqe_set_data(sqe, uring_task);
+ uring_task->len = nbytes;
+ uring_task->ch = uring_ch;
+
+ SPDK_DEBUGLOG(SPDK_LOG_URING, "write %d iovs size %lu from off: %#lx\n",
+ iovcnt, nbytes, offset);
+
+ group_ch->io_pending++;
+ return nbytes;
+}
+
+static int
+bdev_uring_destruct(void *ctx)
+{
+ struct bdev_uring *uring = ctx;
+ int rc = 0;
+
+ TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
+ rc = bdev_uring_close(uring);
+ if (rc < 0) {
+ SPDK_ERRLOG("bdev_uring_close() failed\n");
+ }
+ spdk_io_device_unregister(uring, NULL);
+ uring_free_bdev(uring);
+ return rc;
+}
+
+static int
+bdev_uring_reap(struct io_uring *ring, int max)
+{
+ int i, count, ret;
+ struct io_uring_cqe *cqe;
+ struct bdev_uring_task *uring_task;
+ enum spdk_bdev_io_status status;
+
+ count = 0;
+ for (i = 0; i < max; i++) {
+ ret = io_uring_peek_cqe(ring, &cqe);
+ if (ret != 0) {
+ return ret;
+ }
+
+ if (cqe == NULL) {
+ return count;
+ }
+
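+ /* user_data carries the bdev_uring_task pointer attached with
+ * io_uring_sqe_set_data() at submission time. */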
+ uring_task = (struct bdev_uring_task *)cqe->user_data;
+ if (cqe->res != (signed)uring_task->len) {
+ status = SPDK_BDEV_IO_STATUS_FAILED;
+ } else {
+ status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ }
+
+ uring_task->ch->group_ch->io_inflight--;
+ io_uring_cqe_seen(ring, cqe);
+ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
+ count++;
+ }
+
+ return count;
+}
+
+static int
+bdev_uring_group_poll(void *arg)
+{
+ struct bdev_uring_group_channel *group_ch = arg;
+ int to_complete, to_submit;
+ int count, ret;
+
+ to_submit = group_ch->io_pending;
+ to_complete = group_ch->io_inflight;
+
+ ret = 0;
+ if (to_submit > 0) {
+ /* If there are I/Os to submit, use io_uring_submit here.
+ * It will automatically call spdk_io_uring_enter appropriately. */
+ ret = io_uring_submit(&group_ch->uring);
+ group_ch->io_pending = 0;
+ group_ch->io_inflight += to_submit;
+ } else if (to_complete > 0) {
+ /* If there are I/Os in flight but none to submit, we need to
+ * call io_uring_enter ourselves. */
+ ret = spdk_io_uring_enter(group_ch->uring.ring_fd, 0, 0,
+ IORING_ENTER_GETEVENTS);
+ }
+
+ if (ret < 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ count = 0;
+ if (to_complete > 0) {
+ count = bdev_uring_reap(&group_ch->uring, to_complete);
+ }
+
+ if (count + to_submit > 0) {
+ return SPDK_POLLER_BUSY;
+ } else {
+ return SPDK_POLLER_IDLE;
+ }
+}
+
+static void bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
+ ch,
+ (struct bdev_uring_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
+ ch,
+ (struct bdev_uring_task *)bdev_io->driver_ctx,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
+ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
+ break;
+ default:
+ SPDK_ERRLOG("Wrong io type\n");
+ break;
+ }
+}
+
+static int _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ /* Read and write operations must be performed on buffers aligned to
+ * bdev->required_alignment. If user specified unaligned buffers,
+ * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ return 0;
+ default:
+ return -1;
+ }
+}
+
+static void bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static bool
+bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int
+bdev_uring_create_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_uring_io_channel *ch = ctx_buf;
+
+ ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));
+
+ return 0;
+}
+
+static void
+bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_uring_io_channel *ch = ctx_buf;
+
+ spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
+}
+
+static struct spdk_io_channel *
+bdev_uring_get_io_channel(void *ctx)
+{
+ struct bdev_uring *uring = ctx;
+
+ return spdk_get_io_channel(uring);
+}
+
+static int
+bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct bdev_uring *uring = ctx;
+
+ spdk_json_write_named_object_begin(w, "uring");
+
+ spdk_json_write_named_string(w, "filename", uring->filename);
+
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+static void
+bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct bdev_uring *uring = bdev->ctxt;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_uring_create");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
+ spdk_json_write_named_string(w, "filename", uring->filename);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static const struct spdk_bdev_fn_table uring_fn_table = {
+ .destruct = bdev_uring_destruct,
+ .submit_request = bdev_uring_submit_request,
+ .io_type_supported = bdev_uring_io_type_supported,
+ .get_io_channel = bdev_uring_get_io_channel,
+ .dump_info_json = bdev_uring_dump_info_json,
+ .write_config_json = bdev_uring_write_json_config,
+};
+
+static void uring_free_bdev(struct bdev_uring *uring)
+{
+ if (uring == NULL) {
+ return;
+ }
+ free(uring->filename);
+ free(uring->bdev.name);
+ free(uring);
+}
+
+static int
+bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_uring_group_channel *ch = ctx_buf;
+
+ if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, IORING_SETUP_IOPOLL) < 0) {
+ SPDK_ERRLOG("uring I/O context setup failure\n");
+ return -1;
+ }
+
+ ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
+ return 0;
+}
+
+static void
+bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_uring_group_channel *ch = ctx_buf;
+
+ io_uring_queue_exit(&ch->uring);
+
+ spdk_poller_unregister(&ch->poller);
+}
+
+struct spdk_bdev *
+create_uring_bdev(const char *name, const char *filename, uint32_t block_size)
+{
+ struct bdev_uring *uring;
+ uint32_t detected_block_size;
+ uint64_t bdev_size;
+ int rc;
+
+ uring = calloc(1, sizeof(*uring));
+ if (!uring) {
+ SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
+ return NULL;
+ }
+
+ uring->filename = strdup(filename);
+ if (!uring->filename) {
+ goto error_return;
+ }
+
+ if (bdev_uring_open(uring)) {
+ SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno);
+ goto error_return;
+ }
+
+ bdev_size = spdk_fd_get_size(uring->fd);
+
+ uring->bdev.name = strdup(name);
+ if (!uring->bdev.name) {
+ goto error_return;
+ }
+ uring->bdev.product_name = "URING bdev";
+ uring->bdev.module = &uring_if;
+
+ uring->bdev.write_cache = 1;
+
+ detected_block_size = spdk_fd_get_blocklen(uring->fd);
+ if (block_size == 0) {
+ /* User did not specify block size - use autodetected block size. */
+ if (detected_block_size == 0) {
+ SPDK_ERRLOG("Block size could not be auto-detected\n");
+ goto error_return;
+ }
+ block_size = detected_block_size;
+ } else {
+ if (block_size < detected_block_size) {
+ SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
+ "auto-detected block size %" PRIu32 "\n",
+ block_size, detected_block_size);
+ goto error_return;
+ } else if (detected_block_size != 0 && block_size != detected_block_size) {
+ SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
+ "auto-detected block size %" PRIu32 "\n",
+ block_size, detected_block_size);
+ }
+ }
+
+ if (block_size < 512) {
+ SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
+ goto error_return;
+ }
+
+ if (!spdk_u32_is_pow2(block_size)) {
+ SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
+ goto error_return;
+ }
+
+ uring->bdev.blocklen = block_size;
+ uring->bdev.required_alignment = spdk_u32log2(block_size);
+
+ if (bdev_size % uring->bdev.blocklen != 0) {
+ SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
+ bdev_size, uring->bdev.blocklen);
+ goto error_return;
+ }
+
+ uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
+ uring->bdev.ctxt = uring;
+
+ uring->bdev.fn_table = &uring_fn_table;
+
+ spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
+ sizeof(struct bdev_uring_io_channel),
+ uring->bdev.name);
+ rc = spdk_bdev_register(&uring->bdev);
+ if (rc) {
+ spdk_io_device_unregister(uring, NULL);
+ goto error_return;
+ }
+
+ TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
+ return &uring->bdev;
+
+error_return:
+ bdev_uring_close(uring);
+ uring_free_bdev(uring);
+ return NULL;
+}
+
+struct delete_uring_bdev_ctx {
+ spdk_delete_uring_complete cb_fn;
+ void *cb_arg;
+};
+
+static void
+uring_bdev_unregister_cb(void *arg, int bdeverrno)
+{
+ struct delete_uring_bdev_ctx *ctx = arg;
+
+ ctx->cb_fn(ctx->cb_arg, bdeverrno);
+ free(ctx);
+}
+
+void
+delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg)
+{
+ struct delete_uring_bdev_ctx *ctx;
+
+ if (!bdev || bdev->module != &uring_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ spdk_bdev_unregister(bdev, uring_bdev_unregister_cb, ctx);
+}
+
+static int
+bdev_uring_init(void)
+{
+ size_t i;
+ struct spdk_conf_section *sp;
+ struct spdk_bdev *bdev;
+
+ TAILQ_INIT(&g_uring_bdev_head);
+ spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
+ sizeof(struct bdev_uring_group_channel),
+ "uring_module");
+
+ sp = spdk_conf_find_section(NULL, "URING");
+ if (!sp) {
+ return 0;
+ }
+
+ i = 0;
+ while (true) {
+ const char *file;
+ const char *name;
+ const char *block_size_str;
+ uint32_t block_size = 0;
+ long int tmp;
+
+ file = spdk_conf_section_get_nmval(sp, "URING", i, 0);
+ if (!file) {
+ break;
+ }
+
+ name = spdk_conf_section_get_nmval(sp, "URING", i, 1);
+ if (!name) {
+ SPDK_ERRLOG("No name provided for URING bdev with file %s\n", file);
+ i++;
+ continue;
+ }
+
+ block_size_str = spdk_conf_section_get_nmval(sp, "URING", i, 2);
+ if (block_size_str) {
+ tmp = spdk_strtol(block_size_str, 10);
+ if (tmp < 0) {
+ SPDK_ERRLOG("Invalid block size for URING bdev with file %s\n", file);
+ i++;
+ continue;
+ }
+ block_size = (uint32_t)tmp;
+ }
+
+ bdev = create_uring_bdev(name, file, block_size);
+ if (!bdev) {
+ SPDK_ERRLOG("Unable to create URING bdev from file %s\n", file);
+ i++;
+ continue;
+ }
+
+ i++;
+ }
+
+ return 0;
+}
+
+static void
+bdev_uring_fini(void)
+{
+ spdk_io_device_unregister(&uring_if, NULL);
+}
+
+static void
+bdev_uring_get_spdk_running_config(FILE *fp)
+{
+ char *file;
+ char *name;
+ uint32_t block_size;
+ struct bdev_uring *uring;
+
+ fprintf(fp,
+ "\n"
+ "# Users must change this section to match the /dev/sdX devices to be\n"
+ "# exported as iSCSI LUNs. The devices are accessed using io_uring.\n"
+ "# The format is:\n"
+ "# URING <file name> <bdev name> [<block size>]\n"
+ "# The file name is the backing device\n"
+ "# The bdev name can be referenced from elsewhere in the configuration file.\n"
+ "# Block size may be omitted to automatically detect the block size of a bdev.\n"
+ "[URING]\n");
+
+ TAILQ_FOREACH(uring, &g_uring_bdev_head, link) {
+ file = uring->filename;
+ name = uring->bdev.name;
+ block_size = uring->bdev.blocklen;
+ fprintf(fp, " URING %s %s %d\n", file, name, block_size);
+ }
+ fprintf(fp, "\n");
+}
+
+SPDK_LOG_REGISTER_COMPONENT("uring", SPDK_LOG_URING)
diff --git a/src/spdk/module/bdev/uring/bdev_uring.h b/src/spdk/module/bdev/uring/bdev_uring.h
new file mode 100644
index 000000000..a35681832
--- /dev/null
+++ b/src/spdk/module/bdev/uring/bdev_uring.h
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_URING_H
+#define SPDK_BDEV_URING_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/queue.h"
+#include "spdk/bdev.h"
+
+#include "spdk/bdev_module.h"
+
+typedef void (*spdk_delete_uring_complete)(void *cb_arg, int bdeverrno);
+
+struct spdk_bdev *create_uring_bdev(const char *name, const char *filename, uint32_t block_size);
+
+void delete_uring_bdev(struct spdk_bdev *bdev, spdk_delete_uring_complete cb_fn, void *cb_arg);
+
+#endif /* SPDK_BDEV_URING_H */
diff --git a/src/spdk/module/bdev/uring/bdev_uring_rpc.c b/src/spdk/module/bdev/uring/bdev_uring_rpc.c
new file mode 100644
index 000000000..e65751002
--- /dev/null
+++ b/src/spdk/module/bdev/uring/bdev_uring_rpc.c
@@ -0,0 +1,150 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "bdev_uring.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+/* Structure to hold the parameters for this RPC method. */
+struct rpc_create_uring {
+ char *name;
+ char *filename;
+ uint32_t block_size;
+};
+
+/* Free the allocated memory resource after the RPC handling. */
+static void
+free_rpc_create_uring(struct rpc_create_uring *r)
+{
+ free(r->name);
+ free(r->filename);
+}
+
+/* Structure to decode the input parameters for this RPC method. */
+static const struct spdk_json_object_decoder rpc_create_uring_decoders[] = {
+ {"name", offsetof(struct rpc_create_uring, name), spdk_json_decode_string},
+ {"filename", offsetof(struct rpc_create_uring, filename), spdk_json_decode_string},
+ {"block_size", offsetof(struct rpc_create_uring, block_size), spdk_json_decode_uint32, true},
+};
+
+/* Decode the parameters for this RPC method and properly create the uring
+ * device. An error response is sent back on failure.
+ */
+static void
+rpc_bdev_uring_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_create_uring req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_create_uring_decoders,
+ SPDK_COUNTOF(rpc_create_uring_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = create_uring_bdev(req.name, req.filename, req.block_size);
+ if (!bdev) {
+ SPDK_ERRLOG("Unable to create URING bdev from file %s\n", req.filename);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to create URING bdev.");
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, req.name);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_create_uring(&req);
+}
+SPDK_RPC_REGISTER("bdev_uring_create", rpc_bdev_uring_create, SPDK_RPC_RUNTIME)
+
+struct rpc_delete_uring {
+ char *name;
+};
+
+static void
+free_rpc_delete_uring(struct rpc_delete_uring *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_uring_decoders[] = {
+ {"name", offsetof(struct rpc_delete_uring, name), spdk_json_decode_string},
+};
+
+static void
+_rpc_bdev_uring_delete_cb(void *cb_arg, int bdeverrno)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_bool(w, bdeverrno == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_uring_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_uring req = {NULL};
+ struct spdk_bdev *bdev;
+
+ if (spdk_json_decode_object(params, rpc_delete_uring_decoders,
+ SPDK_COUNTOF(rpc_delete_uring_decoders),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ bdev = spdk_bdev_get_by_name(req.name);
+ if (bdev == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto cleanup;
+ }
+
+ delete_uring_bdev(bdev, _rpc_bdev_uring_delete_cb, request);
+
+cleanup:
+ free_rpc_delete_uring(&req);
+}
+SPDK_RPC_REGISTER("bdev_uring_delete", rpc_bdev_uring_delete, SPDK_RPC_RUNTIME)
diff --git a/src/spdk/module/bdev/virtio/Makefile b/src/spdk/module/bdev/virtio/Makefile
new file mode 100644
index 000000000..602927afe
--- /dev/null
+++ b/src/spdk/module/bdev/virtio/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev_virtio_scsi.c bdev_virtio_blk.c bdev_virtio_rpc.c
+LIBNAME = bdev_virtio
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/virtio/bdev_virtio.h b/src/spdk/module/bdev/virtio/bdev_virtio.h
new file mode 100644
index 000000000..538fab8f6
--- /dev/null
+++ b/src/spdk/module/bdev/virtio/bdev_virtio.h
@@ -0,0 +1,164 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_VIRTIO_H
+#define SPDK_BDEV_VIRTIO_H
+
+#include "spdk/bdev.h"
+#include "spdk/env.h"
+
+/**
+ * Callback for creating virtio bdevs.
+ *
+ * \param ctx opaque context set by the user
+ * \param errnum error code. 0 on success, negative errno on error.
+ * \param bdevs contiguous array of created bdevs
+ * \param bdev_cnt number of bdevs in the `bdevs` array
+ */
+typedef void (*bdev_virtio_create_cb)(void *ctx, int errnum,
+ struct spdk_bdev **bdevs, size_t bdev_cnt);
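+
+/*
+ * Minimal illustrative callback (my_virtio_create_cb is a hypothetical name):
+ *
+ * static void
+ * my_virtio_create_cb(void *ctx, int errnum, struct spdk_bdev **bdevs, size_t bdev_cnt)
+ * {
+ * size_t i;
+ *
+ * if (errnum != 0) {
+ * return;
+ * }
+ * for (i = 0; i < bdev_cnt; i++) {
+ * printf("created %s\n", spdk_bdev_get_name(bdevs[i]));
+ * }
+ * }
+ */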
+
+/**
+ * Callback for removing virtio devices.
+ *
+ * \param ctx opaque context set by the user
+ * \param errnum error code. 0 on success, negative errno on error.
+ */
+typedef void (*bdev_virtio_remove_cb)(void *ctx, int errnum);
+
+/**
+ * Connect to a vhost-user Unix domain socket and create a Virtio SCSI device.
+ * If the connection is successful, the device will be automatically scanned.
+ * The scan consists of probing the targets on the device and will result in
+ * creating possibly multiple Virtio SCSI bdevs - one for each target. Currently
+ * only one LUN per target is detected - LUN0. Note that the bdev creation is
+ * run asynchronously in the background. After it's finished, the `cb_fn`
+ * callback is called.
+ *
+ * \param name name for the virtio device. It will be inherited by all created
+ * bdevs, which are named in the following format: <name>t<target_id>
+ * \param path path to the socket
+ * \param num_queues max number of request virtqueues to use. `vdev` will be
+ * started successfully even if the host device supports fewer queues than requested.
+ * \param queue_size depth of each queue
+ * \param cb_fn function to be called after scanning all targets on the virtio
+ * device. It's optional, can be NULL. See \c bdev_virtio_create_cb.
+ * \param cb_arg argument for the `cb_fn`
+ * \return zero on success (device scan is started) or negative error code.
+ * In case of error the \c cb_fn is not called.
+ */
+int bdev_virtio_user_scsi_dev_create(const char *name, const char *path,
+ unsigned num_queues, unsigned queue_size,
+ bdev_virtio_create_cb cb_fn, void *cb_arg);
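+
+/*
+ * Illustrative call (socket path and callback name are hypothetical):
+ *
+ * rc = bdev_virtio_user_scsi_dev_create("VirtioScsi0", "/tmp/vhost.0",
+ * 4, 512, my_virtio_create_cb, NULL);
+ */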
+
+/**
+ * Attach virtio-pci device. This creates a Virtio SCSI device with the same
+ * capabilities as the vhost-user equivalent. The device will be automatically
+ * scanned for exposed SCSI targets. This will result in creating possibly multiple
+ * Virtio SCSI bdevs - one for each target. Currently only one LUN per target is
+ * detected - LUN0. Note that the bdev creation is run asynchronously in the
+ * background. After it's finished, the `cb_fn` callback is called.
+ *
+ * \param name name for the virtio device. It will be inherited by all created
+ * bdevs, which are named in the following format: <name>t<target_id>
+ * \param pci_addr PCI address of the device to attach
+ * \param cb_fn function to be called after scanning all targets on the virtio
+ * device. It's optional, can be NULL. See \c bdev_virtio_create_cb.
+ * \param cb_arg argument for the `cb_fn`
+ * \return zero on success (device scan is started) or negative error code.
+ * In case of error the \c cb_fn is not called.
+ */
+int bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr,
+ bdev_virtio_create_cb cb_fn, void *cb_arg);
+
+/**
+ * Remove a Virtio device with given name. This will destroy all bdevs exposed
+ * by this device.
+ *
+ * \param name virtio device name
+ * \param cb_fn function to be called after the device has been removed. It's
+ * optional, can be NULL. See \c bdev_virtio_remove_cb. Possible
+ * error codes are:
+ * * ENODEV - couldn't find device with given name
+ * * EBUSY - device is already being removed
+ * \param cb_arg argument for the `cb_fn`
+ * \return zero on success or -ENODEV if scsi dev does not exist
+ */
+int bdev_virtio_scsi_dev_remove(const char *name,
+ bdev_virtio_remove_cb cb_fn, void *cb_arg);
+
+/**
+ * Remove a Virtio BLK device with the given name.
+ *
+ * \param name name of the virtio blk bdev to remove
+ * \param cb_fn function to be called after removing bdev
+ * \param cb_arg argument for the `cb_fn`
+ * \return zero on success, -ENODEV if bdev with 'name' does not exist or
+ * -EINVAL if bdev with 'name' is not a virtio blk device.
+ */
+int bdev_virtio_blk_dev_remove(const char *name,
+ bdev_virtio_remove_cb cb_fn, void *cb_arg);
+
+/**
+ * List all created Virtio-SCSI devices.
+ *
+ * \param write_ctx JSON context to write into
+ */
+void bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *write_ctx);
+
+/**
+ * Connect to a vhost-user Unix domain socket and create a Virtio BLK bdev.
+ *
+ * \param name name for the virtio bdev
+ * \param path path to the socket
+ * \param num_queues max number of request virtqueues to use. `vdev` will be
+ * started successfully even if the host device supports fewer queues than requested.
+ * \param queue_size depth of each queue
+ * \return virtio-blk bdev or NULL
+ */
+struct spdk_bdev *bdev_virtio_user_blk_dev_create(const char *name, const char *path,
+ unsigned num_queues, unsigned queue_size);
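+
+/*
+ * Illustrative call (socket path is hypothetical):
+ *
+ * struct spdk_bdev *bdev = bdev_virtio_user_blk_dev_create("VirtioBlk0",
+ * "/tmp/virtio.0", 1, 512);
+ * if (bdev == NULL) {
+ * ... creation failed ...
+ * }
+ */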
+
+/**
+ * Attach virtio-pci device. This creates a Virtio BLK device with the same
+ * capabilities as the vhost-user equivalent.
+ *
+ * \param name name for the virtio device. The created blk bdev inherits this
+ * name directly.
+ * \param pci_addr PCI address of the device to attach
+ * \return virtio-blk bdev or NULL
+ */
+struct spdk_bdev *bdev_virtio_pci_blk_dev_create(const char *name,
+ struct spdk_pci_addr *pci_addr);
+
+#endif /* SPDK_BDEV_VIRTIO_H */
diff --git a/src/spdk/module/bdev/virtio/bdev_virtio_blk.c b/src/spdk/module/bdev/virtio/bdev_virtio_blk.c
new file mode 100644
index 000000000..99653e238
--- /dev/null
+++ b/src/spdk/module/bdev/virtio/bdev_virtio_blk.c
@@ -0,0 +1,756 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/json.h"
+
+#include "spdk_internal/assert.h"
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+#include "spdk_internal/virtio.h"
+#include "spdk_internal/vhost_user.h"
+
+#include <linux/virtio_blk.h>
+
+#include "bdev_virtio.h"
+
+struct virtio_blk_dev {
+ struct virtio_dev vdev;
+ struct spdk_bdev bdev;
+ bool readonly;
+ bool unmap;
+};
+
+struct virtio_blk_io_ctx {
+ struct iovec iov_req;
+ struct iovec iov_resp;
+ struct iovec iov_unmap;
+ struct virtio_blk_outhdr req;
+ struct virtio_blk_discard_write_zeroes unmap;
+ uint8_t resp;
+};
+
+struct bdev_virtio_blk_io_channel {
+ struct virtio_dev *vdev;
+
+ /** Virtqueue exclusively assigned to this channel. */
+ struct virtqueue *vq;
+
+ /** Virtio response poller. */
+ struct spdk_poller *poller;
+};
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_BLK_DEV_SUPPORTED_FEATURES \
+ (1ULL << VIRTIO_BLK_F_BLK_SIZE | \
+ 1ULL << VIRTIO_BLK_F_TOPOLOGY | \
+ 1ULL << VIRTIO_BLK_F_MQ | \
+ 1ULL << VIRTIO_BLK_F_RO | \
+ 1ULL << VIRTIO_BLK_F_DISCARD | \
+ 1ULL << VIRTIO_RING_F_EVENT_IDX | \
+ 1ULL << VHOST_USER_F_PROTOCOL_FEATURES)
+
+static int bdev_virtio_initialize(void);
+static int bdev_virtio_blk_get_ctx_size(void);
+
+static struct spdk_bdev_module virtio_blk_if = {
+ .name = "virtio_blk",
+ .module_init = bdev_virtio_initialize,
+ .get_ctx_size = bdev_virtio_blk_get_ctx_size,
+};
+
+SPDK_BDEV_MODULE_REGISTER(virtio_blk, &virtio_blk_if)
+
+static int bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf);
+static void bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf);
+
+static struct virtio_blk_io_ctx *
+bdev_virtio_blk_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_blk_outhdr *req;
+ uint8_t *resp;
+ struct virtio_blk_discard_write_zeroes *desc;
+
+ struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx;
+
+ req = &io_ctx->req;
+ resp = &io_ctx->resp;
+ desc = &io_ctx->unmap;
+
+ io_ctx->iov_req.iov_base = req;
+ io_ctx->iov_req.iov_len = sizeof(*req);
+
+ io_ctx->iov_resp.iov_base = resp;
+ io_ctx->iov_resp.iov_len = sizeof(*resp);
+
+ io_ctx->iov_unmap.iov_base = desc;
+ io_ctx->iov_unmap.iov_len = sizeof(*desc);
+
+ memset(req, 0, sizeof(*req));
+ return io_ctx;
+}
+
+static void
+bdev_virtio_blk_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct bdev_virtio_blk_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch);
+ struct virtqueue *vq = virtio_channel->vq;
+ struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx;
+ int rc;
+
+ rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2);
+ if (rc == -ENOMEM) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ return;
+ } else if (rc != 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO);
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) {
+ virtqueue_req_add_iovs(vq, &io_ctx->iov_unmap, 1, SPDK_VIRTIO_DESC_RO);
+ } else {
+ virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->type == SPDK_BDEV_IO_TYPE_READ ?
+ SPDK_VIRTIO_DESC_WR : SPDK_VIRTIO_DESC_RO);
+ }
+ virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR);
+
+ virtqueue_req_flush(vq);
+}
+
+static void
+bdev_virtio_command(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_blk_io_ctx *io_ctx = bdev_virtio_blk_init_io_vreq(ch, bdev_io);
+ struct virtio_blk_outhdr *req = &io_ctx->req;
+ struct virtio_blk_discard_write_zeroes *desc = &io_ctx->unmap;
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+ req->type = VIRTIO_BLK_T_IN;
+ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+ req->type = VIRTIO_BLK_T_OUT;
+ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) {
+ req->type = VIRTIO_BLK_T_DISCARD;
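+ /* virtio-blk sector units are always 512 bytes, regardless of the
+ * bdev block size, hence the division below. */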
+ desc->sector = bdev_io->u.bdev.offset_blocks *
+ spdk_bdev_get_block_size(bdev_io->bdev) / 512;
+ desc->num_sectors = bdev_io->u.bdev.num_blocks *
+ spdk_bdev_get_block_size(bdev_io->bdev) / 512;
+ desc->flags = 0;
+ }
+
+ req->sector = bdev_io->u.bdev.offset_blocks *
+ spdk_bdev_get_block_size(bdev_io->bdev) / 512;
+
+ bdev_virtio_blk_send_io(ch, bdev_io);
+}
+
+static void
+bdev_virtio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ bdev_virtio_command(ch, bdev_io);
+}
+
+static int
+_bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_blk_dev *bvdev = bdev_io->bdev->ctxt;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, bdev_virtio_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ if (bvdev->readonly) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ } else {
+ bdev_virtio_command(ch, bdev_io);
+ }
+ return 0;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ if (bvdev->unmap) {
+ bdev_virtio_command(ch, bdev_io);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ return 0;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ default:
+ return -1;
+ }
+
+ SPDK_UNREACHABLE();
+}
+
+static void
+bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ if (_bdev_virtio_submit_request(ch, bdev_io) < 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static bool
+bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct virtio_blk_dev *bvdev = ctx;
+
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ return true;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return !bvdev->readonly;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ return bvdev->unmap;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ default:
+ return false;
+ }
+}
+
+static struct spdk_io_channel *
+bdev_virtio_get_io_channel(void *ctx)
+{
+ struct virtio_blk_dev *bvdev = ctx;
+
+ return spdk_get_io_channel(bvdev);
+}
+
+static void
+virtio_blk_dev_unregister_cb(void *io_device)
+{
+ struct virtio_blk_dev *bvdev = io_device;
+ struct virtio_dev *vdev = &bvdev->vdev;
+
+ virtio_dev_stop(vdev);
+ virtio_dev_destruct(vdev);
+ spdk_bdev_destruct_done(&bvdev->bdev, 0);
+ free(bvdev);
+}
+
+static int
+bdev_virtio_disk_destruct(void *ctx)
+{
+ struct virtio_blk_dev *bvdev = ctx;
+
+ spdk_io_device_unregister(bvdev, virtio_blk_dev_unregister_cb);
+ return 1;
+}
+
+int
+bdev_virtio_blk_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg)
+{
+ struct spdk_bdev *bdev;
+
+ bdev = spdk_bdev_get_by_name(name);
+ if (bdev == NULL) {
+ return -ENODEV;
+ }
+
+ if (bdev->module != &virtio_blk_if) {
+ return -ENODEV;
+ }
+
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+
+ return 0;
+}
+
+static int
+bdev_virtio_dump_json_config(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct virtio_blk_dev *bvdev = ctx;
+
+ virtio_dev_dump_json_info(&bvdev->vdev, w);
+ return 0;
+}
+
+static void
+bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_blk_dev *bvdev = bdev->ctxt;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_virtio_attach_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bvdev->vdev.name);
+ spdk_json_write_named_string(w, "dev_type", "blk");
+
+ /* Write transport specific parameters. */
+ bvdev->vdev.backend_ops->write_json_config(&bvdev->vdev, w);
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static const struct spdk_bdev_fn_table virtio_fn_table = {
+ .destruct = bdev_virtio_disk_destruct,
+ .submit_request = bdev_virtio_submit_request,
+ .io_type_supported = bdev_virtio_io_type_supported,
+ .get_io_channel = bdev_virtio_get_io_channel,
+ .dump_info_json = bdev_virtio_dump_json_config,
+ .write_config_json = bdev_virtio_write_config_json,
+};
+
+static void
+bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx;
+
+ spdk_bdev_io_complete(bdev_io, io_ctx->resp == VIRTIO_BLK_S_OK ?
+ SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
+}
+
+static int
+bdev_virtio_poll(void *arg)
+{
+ struct bdev_virtio_blk_io_channel *ch = arg;
+ void *io[32];
+ uint32_t io_len[32];
+ uint16_t i, cnt;
+
+ cnt = virtio_recv_pkts(ch->vq, io, io_len, SPDK_COUNTOF(io));
+ for (i = 0; i < cnt; ++i) {
+ bdev_virtio_io_cpl(io[i]);
+ }
+
+ return cnt;
+}
+
+static int
+bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf)
+{
+ struct virtio_blk_dev *bvdev = io_device;
+ struct virtio_dev *vdev = &bvdev->vdev;
+ struct bdev_virtio_blk_io_channel *ch = ctx_buf;
+ struct virtqueue *vq;
+ int32_t queue_idx;
+
+ queue_idx = virtio_dev_find_and_acquire_queue(vdev, 0);
+ if (queue_idx < 0) {
+ SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n");
+ return -1;
+ }
+
+ vq = vdev->vqs[queue_idx];
+
+ ch->vdev = vdev;
+ ch->vq = vq;
+
+ ch->poller = SPDK_POLLER_REGISTER(bdev_virtio_poll, ch, 0);
+ return 0;
+}
+
+static void
+bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct virtio_blk_dev *bvdev = io_device;
+ struct virtio_dev *vdev = &bvdev->vdev;
+ struct bdev_virtio_blk_io_channel *ch = ctx_buf;
+ struct virtqueue *vq = ch->vq;
+
+ spdk_poller_unregister(&ch->poller);
+ virtio_dev_release_queue(vdev, vq->vq_queue_index);
+}
+
+static int
+virtio_blk_dev_init(struct virtio_blk_dev *bvdev, uint16_t max_queues)
+{
+ struct virtio_dev *vdev = &bvdev->vdev;
+ struct spdk_bdev *bdev = &bvdev->bdev;
+ uint64_t capacity, num_blocks;
+ uint32_t block_size;
+ uint16_t host_max_queues;
+ int rc;
+
+ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE)) {
+ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, blk_size),
+ &block_size, sizeof(block_size));
+ if (rc) {
+ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc));
+ return rc;
+ }
+
+ if (block_size == 0 || block_size % 512 != 0) {
+ SPDK_ERRLOG("%s: invalid block size (%"PRIu32"). Must be "
+ "a multiple of 512.\n", vdev->name, block_size);
+ return -EIO;
+ }
+ } else {
+ block_size = 512;
+ }
+
+ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, capacity),
+ &capacity, sizeof(capacity));
+ if (rc) {
+ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc));
+ return rc;
+ }
+
+ /* `capacity` is a number of 512-byte sectors. */
+ num_blocks = capacity * 512 / block_size;
+ if (num_blocks == 0) {
+ SPDK_ERRLOG("%s: size too small (size: %"PRIu64", blocksize: %"PRIu32").\n",
+ vdev->name, capacity * 512, block_size);
+ return -EIO;
+ }
+
+ if ((capacity * 512) % block_size != 0) {
+ SPDK_WARNLOG("%s: size has been rounded down to the nearest block size boundary. "
+ "(block size: %"PRIu32", previous size: %"PRIu64", new size: %"PRIu64")\n",
+ vdev->name, block_size, capacity * 512, num_blocks * block_size);
+ }
+
+ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) {
+ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues),
+ &host_max_queues, sizeof(host_max_queues));
+ if (rc) {
+ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc));
+ return rc;
+ }
+ } else {
+ host_max_queues = 1;
+ }
+
+ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_RO)) {
+ bvdev->readonly = true;
+ }
+
+ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
+ bvdev->unmap = true;
+ }
+
+ if (max_queues == 0) {
+ SPDK_ERRLOG("%s: requested 0 request queues (%"PRIu16" available).\n",
+ vdev->name, host_max_queues);
+ return -EINVAL;
+ }
+
+ if (max_queues > host_max_queues) {
+ SPDK_WARNLOG("%s: requested %"PRIu16" request queues "
+ "but only %"PRIu16" available.\n",
+ vdev->name, max_queues, host_max_queues);
+ max_queues = host_max_queues;
+ }
+
+ /* bdev is tied with the virtio device; we can reuse the name */
+ bdev->name = vdev->name;
+ rc = virtio_dev_start(vdev, max_queues, 0);
+ if (rc != 0) {
+ return rc;
+ }
+
+ bdev->product_name = "VirtioBlk Disk";
+ bdev->write_cache = 0;
+ bdev->blocklen = block_size;
+ bdev->blockcnt = num_blocks;
+
+ bdev->ctxt = bvdev;
+ bdev->fn_table = &virtio_fn_table;
+ bdev->module = &virtio_blk_if;
+
+ spdk_io_device_register(bvdev, bdev_virtio_blk_ch_create_cb,
+ bdev_virtio_blk_ch_destroy_cb,
+ sizeof(struct bdev_virtio_blk_io_channel),
+ vdev->name);
+
+ rc = spdk_bdev_register(bdev);
+ if (rc) {
+ SPDK_ERRLOG("Failed to register bdev name=%s\n", bdev->name);
+ spdk_io_device_unregister(bvdev, NULL);
+ virtio_dev_stop(vdev);
+ return rc;
+ }
+
+ return 0;
+}
+
+static struct virtio_blk_dev *
+virtio_pci_blk_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx)
+{
+ static int pci_dev_counter = 0;
+ struct virtio_blk_dev *bvdev;
+ struct virtio_dev *vdev;
+ char *default_name = NULL;
+ uint16_t num_queues;
+ int rc;
+
+ bvdev = calloc(1, sizeof(*bvdev));
+ if (bvdev == NULL) {
+ SPDK_ERRLOG("virtio device calloc failed\n");
+ return NULL;
+ }
+ vdev = &bvdev->vdev;
+
+ if (name == NULL) {
+ default_name = spdk_sprintf_alloc("VirtioBlk%"PRIu32, pci_dev_counter++);
+ if (default_name == NULL) {
+ free(bvdev);
+ return NULL;
+ }
+ name = default_name;
+ }
+
+ rc = virtio_pci_dev_init(vdev, name, pci_ctx);
+ free(default_name);
+
+ if (rc != 0) {
+ free(bvdev);
+ return NULL;
+ }
+
+ rc = virtio_dev_reset(vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES);
+ if (rc != 0) {
+ virtio_dev_destruct(vdev);
+ free(bvdev);
+ return NULL;
+ }
+
+ /* TODO: add a way to limit usable virtqueues */
+ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) {
+ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues),
+ &num_queues, sizeof(num_queues));
+ if (rc) {
+ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc));
+ virtio_dev_destruct(vdev);
+ free(bvdev);
+ return NULL;
+ }
+ } else {
+ num_queues = 1;
+ }
+
+ rc = virtio_blk_dev_init(bvdev, num_queues);
+ if (rc != 0) {
+ virtio_dev_destruct(vdev);
+ free(bvdev);
+ return NULL;
+ }
+
+ return bvdev;
+}
+
+static struct virtio_blk_dev *
+virtio_user_blk_dev_create(const char *name, const char *path,
+ uint16_t num_queues, uint32_t queue_size)
+{
+ struct virtio_blk_dev *bvdev;
+ int rc;
+
+ bvdev = calloc(1, sizeof(*bvdev));
+ if (bvdev == NULL) {
+ SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path);
+ return NULL;
+ }
+
+ rc = virtio_user_dev_init(&bvdev->vdev, name, path, queue_size);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to create virito device %s: %s\n", name, path);
+ free(bvdev);
+ return NULL;
+ }
+
+ rc = virtio_dev_reset(&bvdev->vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES);
+ if (rc != 0) {
+ virtio_dev_destruct(&bvdev->vdev);
+ free(bvdev);
+ return NULL;
+ }
+
+ rc = virtio_blk_dev_init(bvdev, num_queues);
+ if (rc != 0) {
+ virtio_dev_destruct(&bvdev->vdev);
+ free(bvdev);
+ return NULL;
+ }
+
+ return bvdev;
+}
+
+struct bdev_virtio_pci_dev_create_ctx {
+ const char *name;
+ struct virtio_blk_dev *ret;
+};
+
+static int
+bdev_virtio_pci_blk_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx)
+{
+ struct bdev_virtio_pci_dev_create_ctx *create_ctx = ctx;
+
+ create_ctx->ret = virtio_pci_blk_dev_create(create_ctx->name, pci_ctx);
+ if (create_ctx->ret == NULL) {
+ return -1;
+ }
+
+ return 0;
+}
+
+struct spdk_bdev *
+bdev_virtio_pci_blk_dev_create(const char *name, struct spdk_pci_addr *pci_addr)
+{
+ struct bdev_virtio_pci_dev_create_ctx create_ctx;
+
+ create_ctx.name = name;
+ create_ctx.ret = NULL;
+
+ virtio_pci_dev_attach(bdev_virtio_pci_blk_dev_create_cb, &create_ctx,
+ PCI_DEVICE_ID_VIRTIO_BLK_MODERN, pci_addr);
+
+ if (create_ctx.ret == NULL) {
+ return NULL;
+ }
+
+ return &create_ctx.ret->bdev;
+}
+
+static int
+virtio_pci_blk_dev_enumerate_cb(struct virtio_pci_ctx *pci_ctx, void *ctx)
+{
+ struct virtio_blk_dev *bvdev;
+
+ bvdev = virtio_pci_blk_dev_create(NULL, pci_ctx);
+ return bvdev == NULL ? -1 : 0;
+}
+
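+/*
+ * A sketch of the legacy INI-style config section consumed below; the
+ * values are illustrative only:
+ *
+ *   [VirtioUser0]
+ *     Path /tmp/vhost.0
+ *     Type Blk
+ *     Queues 4
+ *     Name VirtioBlk0
+ */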
+static int
+bdev_virtio_initialize(void)
+{
+ struct spdk_conf_section *sp;
+ struct virtio_blk_dev *bvdev;
+ char *default_name = NULL;
+ char *path, *type, *name;
+ unsigned vdev_num;
+ int num_queues;
+ bool enable_pci;
+ int rc = 0;
+
+ for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
+ if (!spdk_conf_section_match_prefix(sp, "VirtioUser")) {
+ continue;
+ }
+
+ if (sscanf(spdk_conf_section_get_name(sp), "VirtioUser%u", &vdev_num) != 1) {
+ SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+ spdk_conf_section_get_name(sp));
+ return -1;
+ }
+
+ path = spdk_conf_section_get_val(sp, "Path");
+ if (path == NULL) {
+			SPDK_ERRLOG("VirtioUser%u: missing Path\n", vdev_num);
+ return -1;
+ }
+
+ type = spdk_conf_section_get_val(sp, "Type");
+ if (type == NULL || strcmp(type, "Blk") != 0) {
+ continue;
+ }
+
+ num_queues = spdk_conf_section_get_intval(sp, "Queues");
+ if (num_queues < 1) {
+ num_queues = 1;
+ }
+
+ name = spdk_conf_section_get_val(sp, "Name");
+ if (name == NULL) {
+ default_name = spdk_sprintf_alloc("VirtioBlk%u", vdev_num);
+ name = default_name;
+ }
+
+ bvdev = virtio_user_blk_dev_create(name, path, num_queues, 512);
+ free(default_name);
+ default_name = NULL;
+
+ if (bvdev == NULL) {
+ return -1;
+ }
+ }
+
+ sp = spdk_conf_find_section(NULL, "VirtioPci");
+ if (sp == NULL) {
+ return 0;
+ }
+
+ enable_pci = spdk_conf_section_get_boolval(sp, "Enable", false);
+ if (enable_pci) {
+ rc = virtio_pci_dev_enumerate(virtio_pci_blk_dev_enumerate_cb, NULL,
+ PCI_DEVICE_ID_VIRTIO_BLK_MODERN);
+ }
+
+ return rc;
+}
+
+struct spdk_bdev *
+bdev_virtio_user_blk_dev_create(const char *name, const char *path,
+ unsigned num_queues, unsigned queue_size)
+{
+ struct virtio_blk_dev *bvdev;
+
+ bvdev = virtio_user_blk_dev_create(name, path, num_queues, queue_size);
+ if (bvdev == NULL) {
+ return NULL;
+ }
+
+ return &bvdev->bdev;
+}
+
+static int
+bdev_virtio_blk_get_ctx_size(void)
+{
+ return sizeof(struct virtio_blk_io_ctx);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("virtio_blk", SPDK_LOG_VIRTIO_BLK)
diff --git a/src/spdk/module/bdev/virtio/bdev_virtio_rpc.c b/src/spdk/module/bdev/virtio/bdev_virtio_rpc.c
new file mode 100644
index 000000000..3c3c276eb
--- /dev/null
+++ b/src/spdk/module/bdev/virtio/bdev_virtio_rpc.c
@@ -0,0 +1,264 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk_internal/log.h"
+
+#include "bdev_virtio.h"
+
+#define SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT 1
+#define SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE 512
+
+struct rpc_remove_virtio_dev {
+ char *name;
+};
+
+static const struct spdk_json_object_decoder rpc_remove_virtio_dev[] = {
+ {"name", offsetof(struct rpc_remove_virtio_dev, name), spdk_json_decode_string },
+};
+
+static void
+rpc_bdev_virtio_detach_controller_cb(void *ctx, int errnum)
+{
+ struct spdk_jsonrpc_request *request = ctx;
+ struct spdk_json_write_ctx *w;
+
+ if (errnum != 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-errnum));
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_bdev_virtio_detach_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_remove_virtio_dev req = {NULL};
+ int rc = 0;
+
+ if (spdk_json_decode_object(params, rpc_remove_virtio_dev,
+ SPDK_COUNTOF(rpc_remove_virtio_dev),
+ &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
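+	/* The controller type behind the given name is unknown here, so try
+	 * the blk removal path first and fall back to scsi on -ENODEV. */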
+ rc = bdev_virtio_blk_dev_remove(req.name, rpc_bdev_virtio_detach_controller_cb, request);
+ if (rc == -ENODEV) {
+ rc = bdev_virtio_scsi_dev_remove(req.name, rpc_bdev_virtio_detach_controller_cb, request);
+ }
+
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ }
+
+cleanup:
+ free(req.name);
+}
+SPDK_RPC_REGISTER("bdev_virtio_detach_controller",
+ rpc_bdev_virtio_detach_controller, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_virtio_detach_controller, remove_virtio_bdev)
+
+static void
+rpc_bdev_virtio_scsi_get_devices(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "bdev_virtio_scsi_get_devices requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ bdev_virtio_scsi_dev_list(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("bdev_virtio_scsi_get_devices",
+ rpc_bdev_virtio_scsi_get_devices, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_virtio_scsi_get_devices, get_virtio_scsi_devs)
+
+struct rpc_bdev_virtio_attach_controller_ctx {
+ char *name;
+ char *trtype;
+ char *traddr;
+ char *dev_type;
+ uint32_t vq_count;
+ uint32_t vq_size;
+ struct spdk_jsonrpc_request *request;
+};
+
+static const struct spdk_json_object_decoder rpc_bdev_virtio_attach_controller_ctx[] = {
+ {"name", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, name), spdk_json_decode_string },
+ {"trtype", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, trtype), spdk_json_decode_string },
+ {"traddr", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, traddr), spdk_json_decode_string },
+ {"dev_type", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, dev_type), spdk_json_decode_string },
+ {"vq_count", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, vq_count), spdk_json_decode_uint32, true },
+ {"vq_size", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, vq_size), spdk_json_decode_uint32, true },
+};
+
+static void
+free_rpc_bdev_virtio_attach_controller_ctx(struct rpc_bdev_virtio_attach_controller_ctx *req)
+{
+ free(req->name);
+ free(req->trtype);
+ free(req->traddr);
+ free(req->dev_type);
+ free(req);
+}
+
+static void
+rpc_create_virtio_dev_cb(void *ctx, int result, struct spdk_bdev **bdevs, size_t cnt)
+{
+ struct rpc_bdev_virtio_attach_controller_ctx *req = ctx;
+ struct spdk_json_write_ctx *w;
+ size_t i;
+
+ if (result) {
+ spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-result));
+ free_rpc_bdev_virtio_attach_controller_ctx(req);
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(req->request);
+ spdk_json_write_array_begin(w);
+
+ for (i = 0; i < cnt; i++) {
+ spdk_json_write_string(w, spdk_bdev_get_name(bdevs[i]));
+ }
+
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(req->request, w);
+
+ free_rpc_bdev_virtio_attach_controller_ctx(ctx);
+}
+
+static void
+rpc_bdev_virtio_attach_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_bdev_virtio_attach_controller_ctx *req;
+ struct spdk_bdev *bdev;
+ struct spdk_pci_addr pci_addr;
+ bool pci;
+ int rc;
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("calloc() failed\n");
+ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_bdev_virtio_attach_controller_ctx,
+ SPDK_COUNTOF(rpc_bdev_virtio_attach_controller_ctx),
+ req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto cleanup;
+ }
+
+ if (strcmp(req->trtype, "pci") == 0) {
+ if (req->vq_count != 0 || req->vq_size != 0) {
+ SPDK_ERRLOG("VQ count or size is not allowed for PCI transport type\n");
+ spdk_jsonrpc_send_error_response(request, EINVAL,
+ "vq_count or vq_size is not allowed for PCI transport type.");
+ goto cleanup;
+ }
+
+ if (spdk_pci_addr_parse(&pci_addr, req->traddr) != 0) {
+ SPDK_ERRLOG("Invalid PCI address '%s'\n", req->traddr);
+ spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid PCI address '%s'", req->traddr);
+ goto cleanup;
+ }
+
+ pci = true;
+ } else if (strcmp(req->trtype, "user") == 0) {
+ req->vq_count = req->vq_count == 0 ? SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT : req->vq_count;
+ req->vq_size = req->vq_size == 0 ? SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE : req->vq_size;
+ pci = false;
+ } else {
+ SPDK_ERRLOG("Invalid trtype '%s'\n", req->trtype);
+ spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid trtype '%s'", req->trtype);
+ goto cleanup;
+ }
+
+ req->request = request;
+ if (strcmp(req->dev_type, "blk") == 0) {
+ if (pci) {
+ bdev = bdev_virtio_pci_blk_dev_create(req->name, &pci_addr);
+ } else {
+ bdev = bdev_virtio_user_blk_dev_create(req->name, req->traddr, req->vq_count, req->vq_size);
+ }
+
+		/* Virtio blk doesn't use the callback, so call it manually to
+		 * send the result. */
+ rc = bdev ? 0 : -EINVAL;
+ rpc_create_virtio_dev_cb(req, rc, &bdev, bdev ? 1 : 0);
+ } else if (strcmp(req->dev_type, "scsi") == 0) {
+ if (pci) {
+ rc = bdev_virtio_pci_scsi_dev_create(req->name, &pci_addr, rpc_create_virtio_dev_cb, req);
+ } else {
+ rc = bdev_virtio_user_scsi_dev_create(req->name, req->traddr, req->vq_count, req->vq_size,
+ rpc_create_virtio_dev_cb, req);
+ }
+
+ if (rc < 0) {
+			/* The callback is not invoked on error, so call it
+			 * manually to send the result. */
+ rpc_create_virtio_dev_cb(req, rc, NULL, 0);
+ }
+ } else {
+ SPDK_ERRLOG("Invalid dev_type '%s'\n", req->dev_type);
+ spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid dev_type '%s'", req->dev_type);
+ goto cleanup;
+ }
+
+ return;
+
+cleanup:
+ free_rpc_bdev_virtio_attach_controller_ctx(req);
+}
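+/*
+ * Example request for this method; the values below are illustrative only:
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "bdev_virtio_attach_controller",
+ *  "params": {"name": "VirtioScsi0", "trtype": "user",
+ *             "traddr": "/tmp/vhost.0", "dev_type": "scsi",
+ *             "vq_count": 2, "vq_size": 512}}
+ */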
+SPDK_RPC_REGISTER("bdev_virtio_attach_controller",
+ rpc_bdev_virtio_attach_controller, SPDK_RPC_RUNTIME);
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_virtio_attach_controller, construct_virtio_dev)
diff --git a/src/spdk/module/bdev/virtio/bdev_virtio_scsi.c b/src/spdk/module/bdev/virtio/bdev_virtio_scsi.c
new file mode 100644
index 000000000..520b8a17d
--- /dev/null
+++ b/src/spdk/module/bdev/virtio/bdev_virtio_scsi.c
@@ -0,0 +1,2036 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/scsi_spec.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/json.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+#include "spdk_internal/virtio.h"
+#include "spdk_internal/vhost_user.h"
+
+#include <linux/virtio_scsi.h>
+
+#include "bdev_virtio.h"
+
+#define BDEV_VIRTIO_MAX_TARGET 64
+#define BDEV_VIRTIO_SCAN_PAYLOAD_SIZE 256
+#define MGMT_POLL_PERIOD_US (1000 * 5)
+#define CTRLQ_RING_SIZE 16
+#define SCAN_REQUEST_RETRIES 5
+
+/* Number of non-request queues - eventq and controlq */
+#define SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED 2
+
+#define VIRTIO_SCSI_EVENTQ_BUFFER_COUNT 16
+
+#define VIRTIO_SCSI_CONTROLQ 0
+#define VIRTIO_SCSI_EVENTQ 1
+#define VIRTIO_SCSI_REQUESTQ 2
+
+static int bdev_virtio_initialize(void);
+static void bdev_virtio_finish(void);
+
+struct virtio_scsi_dev {
+ /* Generic virtio device data. */
+ struct virtio_dev vdev;
+
+ /** Detected SCSI LUNs */
+ TAILQ_HEAD(, virtio_scsi_disk) luns;
+
+ /** Context for the SCSI target scan. */
+ struct virtio_scsi_scan_base *scan_ctx;
+
+ /** Controlq poller. */
+ struct spdk_poller *mgmt_poller;
+
+ /** Controlq messages to be sent. */
+ struct spdk_ring *ctrlq_ring;
+
+ /** Buffers for the eventq. */
+ struct virtio_scsi_eventq_io *eventq_ios;
+
+ /** Device marked for removal. */
+ bool removed;
+
+ /** Callback to be called after vdev removal. */
+ bdev_virtio_remove_cb remove_cb;
+
+ /** Context for the `remove_cb`. */
+ void *remove_ctx;
+
+ TAILQ_ENTRY(virtio_scsi_dev) tailq;
+};
+
+struct virtio_scsi_io_ctx {
+ struct iovec iov_req;
+ struct iovec iov_resp;
+ union {
+ struct virtio_scsi_cmd_req req;
+ struct virtio_scsi_ctrl_tmf_req tmf_req;
+ };
+ union {
+ struct virtio_scsi_cmd_resp resp;
+ struct virtio_scsi_ctrl_tmf_resp tmf_resp;
+ };
+};
+
+struct virtio_scsi_eventq_io {
+ struct iovec iov;
+ struct virtio_scsi_event ev;
+};
+
+struct virtio_scsi_scan_info {
+ uint64_t num_blocks;
+ uint32_t block_size;
+ uint8_t target;
+ bool unmap_supported;
+ TAILQ_ENTRY(virtio_scsi_scan_info) tailq;
+};
+
+struct virtio_scsi_scan_base {
+ struct virtio_scsi_dev *svdev;
+
+ /** I/O channel used for the scan I/O. */
+ struct bdev_virtio_io_channel *channel;
+
+ bdev_virtio_create_cb cb_fn;
+ void *cb_arg;
+
+ /** Scan all targets on the device. */
+ bool full_scan;
+
+ /** Start a full rescan after receiving next scan I/O response. */
+ bool restart;
+
+ /** Additional targets to be (re)scanned. */
+ TAILQ_HEAD(, virtio_scsi_scan_info) scan_queue;
+
+ /** Remaining attempts for sending the current request. */
+ unsigned retries;
+
+ /** If set, the last scan I/O needs to be resent */
+ bool needs_resend;
+
+ struct virtio_scsi_io_ctx io_ctx;
+ struct iovec iov;
+ uint8_t payload[BDEV_VIRTIO_SCAN_PAYLOAD_SIZE];
+
+ /** Scan results for the current target. */
+ struct virtio_scsi_scan_info info;
+};
+
+struct virtio_scsi_disk {
+ struct spdk_bdev bdev;
+ struct virtio_scsi_dev *svdev;
+ struct virtio_scsi_scan_info info;
+
+ /** Descriptor opened just to be notified of external bdev hotremove. */
+ struct spdk_bdev_desc *notify_desc;
+
+ /** Disk marked for removal. */
+ bool removed;
+ TAILQ_ENTRY(virtio_scsi_disk) link;
+};
+
+struct bdev_virtio_io_channel {
+ struct virtio_scsi_dev *svdev;
+
+ /** Virtqueue exclusively assigned to this channel. */
+ struct virtqueue *vq;
+
+ /** Virtio response poller. */
+ struct spdk_poller *poller;
+};
+
+static TAILQ_HEAD(, virtio_scsi_dev) g_virtio_scsi_devs =
+ TAILQ_HEAD_INITIALIZER(g_virtio_scsi_devs);
+
+static pthread_mutex_t g_virtio_scsi_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/** Module finish in progress */
+static bool g_bdev_virtio_finish = false;
+
+/* Features desired/implemented by this driver. */
+#define VIRTIO_SCSI_DEV_SUPPORTED_FEATURES \
+ (1ULL << VIRTIO_SCSI_F_INOUT | \
+ 1ULL << VIRTIO_SCSI_F_HOTPLUG | \
+ 1ULL << VIRTIO_RING_F_EVENT_IDX | \
+ 1ULL << VHOST_USER_F_PROTOCOL_FEATURES)
+
+static void virtio_scsi_dev_unregister_cb(void *io_device);
+static void virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev,
+ bdev_virtio_remove_cb cb_fn, void *cb_arg);
+static int bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf);
+static void bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf);
+static void process_scan_resp(struct virtio_scsi_scan_base *base);
+static int bdev_virtio_mgmt_poll(void *arg);
+
+static int
+virtio_scsi_dev_send_eventq_io(struct virtqueue *vq, struct virtio_scsi_eventq_io *io)
+{
+ int rc;
+
+ rc = virtqueue_req_start(vq, io, 1);
+ if (rc != 0) {
+ return -1;
+ }
+
+ virtqueue_req_add_iovs(vq, &io->iov, 1, SPDK_VIRTIO_DESC_WR);
+ virtqueue_req_flush(vq);
+
+ return 0;
+}
+
+static int
+virtio_scsi_dev_init(struct virtio_scsi_dev *svdev, uint16_t max_queues)
+{
+ struct virtio_dev *vdev = &svdev->vdev;
+ struct spdk_ring *ctrlq_ring;
+ struct virtio_scsi_eventq_io *eventq_io;
+ struct virtqueue *eventq;
+ uint16_t i, num_events;
+ int rc;
+
+ rc = virtio_dev_reset(vdev, VIRTIO_SCSI_DEV_SUPPORTED_FEATURES);
+ if (rc != 0) {
+ return rc;
+ }
+
+ rc = virtio_dev_start(vdev, max_queues, SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED);
+ if (rc != 0) {
+ return rc;
+ }
+
+ ctrlq_ring = spdk_ring_create(SPDK_RING_TYPE_MP_SC, CTRLQ_RING_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (ctrlq_ring == NULL) {
+ SPDK_ERRLOG("Failed to allocate send ring for the controlq.\n");
+ return -1;
+ }
+
+ rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_CONTROLQ);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to acquire the controlq.\n");
+ spdk_ring_free(ctrlq_ring);
+ return -1;
+ }
+
+ rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_EVENTQ);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to acquire the eventq.\n");
+ virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ);
+ spdk_ring_free(ctrlq_ring);
+ return -1;
+ }
+
+ eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ];
+ num_events = spdk_min(eventq->vq_nentries, VIRTIO_SCSI_EVENTQ_BUFFER_COUNT);
+ svdev->eventq_ios = spdk_zmalloc(sizeof(*svdev->eventq_ios) * num_events,
+ 0, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (svdev->eventq_ios == NULL) {
+ SPDK_ERRLOG("cannot allocate memory for %"PRIu16" eventq buffers\n",
+ num_events);
+ virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ);
+ virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ);
+ spdk_ring_free(ctrlq_ring);
+ return -1;
+ }
+
+ for (i = 0; i < num_events; i++) {
+ eventq_io = &svdev->eventq_ios[i];
+ eventq_io->iov.iov_base = &eventq_io->ev;
+ eventq_io->iov.iov_len = sizeof(eventq_io->ev);
+ virtio_scsi_dev_send_eventq_io(eventq, eventq_io);
+ }
+
+ svdev->ctrlq_ring = ctrlq_ring;
+
+ svdev->mgmt_poller = SPDK_POLLER_REGISTER(bdev_virtio_mgmt_poll, svdev,
+ MGMT_POLL_PERIOD_US);
+
+ TAILQ_INIT(&svdev->luns);
+ svdev->scan_ctx = NULL;
+ svdev->removed = false;
+ svdev->remove_cb = NULL;
+ svdev->remove_ctx = NULL;
+
+ spdk_io_device_register(svdev, bdev_virtio_scsi_ch_create_cb,
+ bdev_virtio_scsi_ch_destroy_cb,
+ sizeof(struct bdev_virtio_io_channel),
+ svdev->vdev.name);
+
+ pthread_mutex_lock(&g_virtio_scsi_mutex);
+ TAILQ_INSERT_TAIL(&g_virtio_scsi_devs, svdev, tailq);
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+ return 0;
+}
+
+static struct virtio_scsi_dev *
+virtio_pci_scsi_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx)
+{
+ static int pci_dev_counter = 0;
+ struct virtio_scsi_dev *svdev;
+ struct virtio_dev *vdev;
+ char *default_name = NULL;
+ uint32_t num_queues;
+ int rc;
+
+ svdev = calloc(1, sizeof(*svdev));
+ if (svdev == NULL) {
+ SPDK_ERRLOG("virtio device calloc failed\n");
+ return NULL;
+ }
+
+ vdev = &svdev->vdev;
+ if (name == NULL) {
+ default_name = spdk_sprintf_alloc("VirtioScsi%"PRIu32, pci_dev_counter++);
+ if (default_name == NULL) {
+			free(svdev);
+ return NULL;
+ }
+ name = default_name;
+ }
+
+ rc = virtio_pci_dev_init(vdev, name, pci_ctx);
+ free(default_name);
+
+ if (rc != 0) {
+ free(svdev);
+ return NULL;
+ }
+
+ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_scsi_config, num_queues),
+ &num_queues, sizeof(num_queues));
+ if (rc) {
+ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc));
+ virtio_dev_destruct(vdev);
+ free(svdev);
+ return NULL;
+ }
+
+ rc = virtio_scsi_dev_init(svdev, num_queues);
+ if (rc != 0) {
+ virtio_dev_destruct(vdev);
+ free(svdev);
+ return NULL;
+ }
+
+ return svdev;
+}
+
+static struct virtio_scsi_dev *
+virtio_user_scsi_dev_create(const char *name, const char *path,
+ uint16_t num_queues, uint32_t queue_size)
+{
+ struct virtio_scsi_dev *svdev;
+ struct virtio_dev *vdev;
+ int rc;
+
+ svdev = calloc(1, sizeof(*svdev));
+ if (svdev == NULL) {
+ SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path);
+ return NULL;
+ }
+
+ vdev = &svdev->vdev;
+ rc = virtio_user_dev_init(vdev, name, path, queue_size);
+ if (rc != 0) {
+		SPDK_ERRLOG("Failed to create virtio device %s: %s\n", name, path);
+ free(svdev);
+ return NULL;
+ }
+
+ rc = virtio_scsi_dev_init(svdev, num_queues);
+ if (rc != 0) {
+ virtio_dev_destruct(vdev);
+ free(svdev);
+ return NULL;
+ }
+
+ return svdev;
+}
+
+static struct virtio_scsi_disk *
+virtio_scsi_dev_get_disk_by_id(struct virtio_scsi_dev *svdev, uint8_t target_id)
+{
+ struct virtio_scsi_disk *disk;
+
+ TAILQ_FOREACH(disk, &svdev->luns, link) {
+ if (disk->info.target == target_id) {
+ return disk;
+ }
+ }
+
+ return NULL;
+}
+
+static int virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev,
+ bdev_virtio_create_cb cb_fn, void *cb_arg);
+static int send_scan_io(struct virtio_scsi_scan_base *base);
+static void _virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target);
+static int _virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc);
+static void _virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum);
+static int virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target);
+
+static int
+bdev_virtio_get_ctx_size(void)
+{
+ return sizeof(struct virtio_scsi_io_ctx);
+}
+
+static int
+bdev_virtio_scsi_config_json(struct spdk_json_write_ctx *w)
+{
+ struct virtio_scsi_dev *svdev;
+
+ pthread_mutex_lock(&g_virtio_scsi_mutex);
+ TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "bdev_virtio_attach_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", svdev->vdev.name);
+ spdk_json_write_named_string(w, "dev_type", "scsi");
+
+		/* Write transport-specific parameters. */
+ svdev->vdev.backend_ops->write_json_config(&svdev->vdev, w);
+
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+
+ return 0;
+}
+
+static struct spdk_bdev_module virtio_scsi_if = {
+ .name = "virtio_scsi",
+ .module_init = bdev_virtio_initialize,
+ .module_fini = bdev_virtio_finish,
+ .get_ctx_size = bdev_virtio_get_ctx_size,
+ .config_json = bdev_virtio_scsi_config_json,
+ .async_init = true,
+ .async_fini = true,
+};
+
+SPDK_BDEV_MODULE_REGISTER(virtio_scsi, &virtio_scsi_if)
+
+static struct virtio_scsi_io_ctx *
+bdev_virtio_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_scsi_cmd_req *req;
+ struct virtio_scsi_cmd_resp *resp;
+ struct virtio_scsi_disk *disk = (struct virtio_scsi_disk *)bdev_io->bdev;
+ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx;
+
+ req = &io_ctx->req;
+ resp = &io_ctx->resp;
+
+ io_ctx->iov_req.iov_base = req;
+ io_ctx->iov_req.iov_len = sizeof(*req);
+
+ io_ctx->iov_resp.iov_base = resp;
+ io_ctx->iov_resp.iov_len = sizeof(*resp);
+
+ memset(req, 0, sizeof(*req));
+ req->lun[0] = 1;
+ req->lun[1] = disk->info.target;
+
+ return io_ctx;
+}
+
+static struct virtio_scsi_io_ctx *
+bdev_virtio_init_tmf_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_scsi_ctrl_tmf_req *tmf_req;
+ struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
+ struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev);
+ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx;
+
+ tmf_req = &io_ctx->tmf_req;
+ tmf_resp = &io_ctx->tmf_resp;
+
+ io_ctx->iov_req.iov_base = tmf_req;
+ io_ctx->iov_req.iov_len = sizeof(*tmf_req);
+ io_ctx->iov_resp.iov_base = tmf_resp;
+ io_ctx->iov_resp.iov_len = sizeof(*tmf_resp);
+
+ memset(tmf_req, 0, sizeof(*tmf_req));
+ tmf_req->lun[0] = 1;
+ tmf_req->lun[1] = disk->info.target;
+
+ return io_ctx;
+}
+
+static void
+bdev_virtio_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct bdev_virtio_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch);
+ struct virtqueue *vq = virtio_channel->vq;
+ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx;
+ int rc;
+
+ rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2);
+ if (rc == -ENOMEM) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ return;
+ } else if (rc != 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
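+	/* The virtio spec requires device-readable descriptors to precede the
+	 * device-writable ones, so the response buffer is chained before the
+	 * data buffers for reads and after them for writes. */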
+ virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO);
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+ virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR);
+ virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ SPDK_VIRTIO_DESC_WR);
+ } else {
+ virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ SPDK_VIRTIO_DESC_RO);
+ virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR);
+ }
+
+ virtqueue_req_flush(vq);
+}
+
+static void
+bdev_virtio_rw(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev);
+ struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io);
+ struct virtio_scsi_cmd_req *req = &io_ctx->req;
+ bool is_write = bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE;
+
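+	/* READ(10)/WRITE(10) carry a 32-bit LBA and a 16-bit block count, so
+	 * targets larger than 2^32 blocks need the 16-byte CDB variants. */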
+ if (disk->info.num_blocks > (1ULL << 32)) {
+ req->cdb[0] = is_write ? SPDK_SBC_WRITE_16 : SPDK_SBC_READ_16;
+ to_be64(&req->cdb[2], bdev_io->u.bdev.offset_blocks);
+ to_be32(&req->cdb[10], bdev_io->u.bdev.num_blocks);
+ } else {
+ req->cdb[0] = is_write ? SPDK_SBC_WRITE_10 : SPDK_SBC_READ_10;
+ to_be32(&req->cdb[2], bdev_io->u.bdev.offset_blocks);
+ to_be16(&req->cdb[7], bdev_io->u.bdev.num_blocks);
+ }
+
+ bdev_virtio_send_io(ch, bdev_io);
+}
+
+static void
+bdev_virtio_reset(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct bdev_virtio_io_channel *virtio_ch = spdk_io_channel_get_ctx(ch);
+ struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_tmf_vreq(ch, bdev_io);
+ struct virtio_scsi_ctrl_tmf_req *tmf_req = &io_ctx->tmf_req;
+ struct virtio_scsi_dev *svdev = virtio_ch->svdev;
+ size_t enqueued_count;
+
+ tmf_req->type = VIRTIO_SCSI_T_TMF;
+ tmf_req->subtype = VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET;
+
+ enqueued_count = spdk_ring_enqueue(svdev->ctrlq_ring, (void **)&bdev_io, 1, NULL);
+ if (spdk_likely(enqueued_count == 1)) {
+ return;
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ }
+}
+
+static void
+bdev_virtio_unmap(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io);
+ struct virtio_scsi_cmd_req *req = &io_ctx->req;
+ struct spdk_scsi_unmap_bdesc *desc, *first_desc;
+ uint8_t *buf;
+ uint64_t offset_blocks, num_blocks;
+ uint16_t cmd_len;
+
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ buf = bdev_io->u.bdev.iovs[0].iov_base;
+
+ offset_blocks = bdev_io->u.bdev.offset_blocks;
+ num_blocks = bdev_io->u.bdev.num_blocks;
+
+ /* (n-1) * 16-byte descriptors */
+ first_desc = desc = (struct spdk_scsi_unmap_bdesc *)&buf[8];
+ while (num_blocks > UINT32_MAX) {
+ to_be64(&desc->lba, offset_blocks);
+ to_be32(&desc->block_count, UINT32_MAX);
+ memset(&desc->reserved, 0, sizeof(desc->reserved));
+ offset_blocks += UINT32_MAX;
+ num_blocks -= UINT32_MAX;
+ desc++;
+ }
+
+ /* The last descriptor with block_count <= UINT32_MAX */
+ to_be64(&desc->lba, offset_blocks);
+ to_be32(&desc->block_count, num_blocks);
+ memset(&desc->reserved, 0, sizeof(desc->reserved));
+
+ /* 8-byte header + n * 16-byte block descriptor */
+ cmd_len = 8 + (desc - first_desc + 1) * sizeof(struct spdk_scsi_unmap_bdesc);
+
+ req->cdb[0] = SPDK_SBC_UNMAP;
+ to_be16(&req->cdb[7], cmd_len);
+
+ /* 8-byte header */
+ to_be16(&buf[0], cmd_len - 2); /* total length (excluding the length field) */
+ to_be16(&buf[2], cmd_len - 8); /* length of block descriptors */
+ memset(&buf[4], 0, 4); /* reserved */
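+	/* Worked example with hypothetical sizes: unmapping UINT32_MAX + 11
+	 * blocks takes two descriptors, so cmd_len = 8 + 2 * 16 = 40,
+	 * buf[0..1] = 38 (length excluding the length field itself) and
+	 * buf[2..3] = 32 (bytes of block descriptors). */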
+
+ bdev_virtio_send_io(ch, bdev_io);
+}
+
+static void
+bdev_virtio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
+ bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ bdev_virtio_rw(ch, bdev_io);
+}
+
+static int _bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev);
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ spdk_bdev_io_get_buf(bdev_io, bdev_virtio_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ bdev_virtio_rw(ch, bdev_io);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ bdev_virtio_reset(ch, bdev_io);
+ return 0;
+ case SPDK_BDEV_IO_TYPE_UNMAP: {
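+		/* One 16-byte block descriptor per UINT32_MAX blocks, rounded
+		 * up, plus the 8-byte UNMAP parameter list header. */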
+ uint64_t buf_len = 8 /* header size */ +
+ (bdev_io->u.bdev.num_blocks + UINT32_MAX - 1) /
+ UINT32_MAX * sizeof(struct spdk_scsi_unmap_bdesc);
+
+ if (!disk->info.unmap_supported) {
+ return -1;
+ }
+
+ if (buf_len > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
+ SPDK_ERRLOG("Trying to UNMAP too many blocks: %"PRIu64"\n",
+ bdev_io->u.bdev.num_blocks);
+ return -1;
+ }
+ spdk_bdev_io_get_buf(bdev_io, bdev_virtio_unmap, buf_len);
+ return 0;
+ }
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ default:
+ return -1;
+ }
+ return 0;
+}
+
+static void bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ if (_bdev_virtio_submit_request(ch, bdev_io) < 0) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static bool
+bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ struct virtio_scsi_disk *disk = ctx;
+
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ case SPDK_BDEV_IO_TYPE_RESET:
+ return true;
+
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ return disk->info.unmap_supported;
+
+ default:
+ return false;
+ }
+}
+
+static struct spdk_io_channel *
+bdev_virtio_get_io_channel(void *ctx)
+{
+ struct virtio_scsi_disk *disk = ctx;
+
+ return spdk_get_io_channel(disk->svdev);
+}
+
+static int
+bdev_virtio_disk_destruct(void *ctx)
+{
+ struct virtio_scsi_disk *disk = ctx;
+ struct virtio_scsi_dev *svdev = disk->svdev;
+
+ TAILQ_REMOVE(&svdev->luns, disk, link);
+ free(disk->bdev.name);
+ free(disk);
+
+ if (svdev->removed && TAILQ_EMPTY(&svdev->luns)) {
+ spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb);
+ }
+
+ return 0;
+}
+
+static int
+bdev_virtio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct virtio_scsi_disk *disk = ctx;
+
+ virtio_dev_dump_json_info(&disk->svdev->vdev, w);
+ return 0;
+}
+
+static void
+bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+	/* SCSI targets and LUNs are discovered during the scan process, so
+	 * there is nothing to save here.
+	 */
+}
+
+static const struct spdk_bdev_fn_table virtio_fn_table = {
+ .destruct = bdev_virtio_disk_destruct,
+ .submit_request = bdev_virtio_submit_request,
+ .io_type_supported = bdev_virtio_io_type_supported,
+ .get_io_channel = bdev_virtio_get_io_channel,
+ .dump_info_json = bdev_virtio_dump_info_json,
+ .write_config_json = bdev_virtio_write_config_json,
+};
+
+static void
+get_scsi_status(struct virtio_scsi_cmd_resp *resp, int *sk, int *asc, int *ascq)
+{
+ /* see spdk_scsi_task_build_sense_data() for sense data details */
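+	/* Fixed-format sense data: byte 2 (low nibble) holds the sense key,
+	 * byte 12 the ASC and byte 13 the ASCQ. */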
+ *sk = 0;
+ *asc = 0;
+ *ascq = 0;
+
+ if (resp->sense_len < 3) {
+ return;
+ }
+
+ *sk = resp->sense[2] & 0xf;
+
+ if (resp->sense_len < 13) {
+ return;
+ }
+
+ *asc = resp->sense[12];
+
+ if (resp->sense_len < 14) {
+ return;
+ }
+
+ *ascq = resp->sense[13];
+}
+
+static void
+bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx;
+ int sk, asc, ascq;
+
+ get_scsi_status(&io_ctx->resp, &sk, &asc, &ascq);
+ spdk_bdev_io_complete_scsi_status(bdev_io, io_ctx->resp.status, sk, asc, ascq);
+}
+
+static int
+bdev_virtio_poll(void *arg)
+{
+ struct bdev_virtio_io_channel *ch = arg;
+ struct virtio_scsi_dev *svdev = ch->svdev;
+ struct virtio_scsi_scan_base *scan_ctx = svdev->scan_ctx;
+ void *io[32];
+ uint32_t io_len[32];
+ uint16_t i, cnt;
+ int rc;
+
+ cnt = virtio_recv_pkts(ch->vq, (void **)io, io_len, SPDK_COUNTOF(io));
+ for (i = 0; i < cnt; ++i) {
+ if (spdk_unlikely(scan_ctx && io[i] == &scan_ctx->io_ctx)) {
+ if (svdev->removed) {
+ _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR);
+ return SPDK_POLLER_BUSY;
+ }
+
+ if (scan_ctx->restart) {
+ scan_ctx->restart = false;
+ scan_ctx->full_scan = true;
+ _virtio_scsi_dev_scan_tgt(scan_ctx, 0);
+ continue;
+ }
+
+ process_scan_resp(scan_ctx);
+ continue;
+ }
+
+ bdev_virtio_io_cpl(io[i]);
+ }
+
+ if (spdk_unlikely(scan_ctx && scan_ctx->needs_resend)) {
+ if (svdev->removed) {
+ _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR);
+ return SPDK_POLLER_BUSY;
+ } else if (cnt == 0) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ rc = send_scan_io(scan_ctx);
+ if (rc != 0) {
+ assert(scan_ctx->retries > 0);
+ scan_ctx->retries--;
+ if (scan_ctx->retries == 0) {
+ SPDK_ERRLOG("Target scan failed unrecoverably with rc = %d.\n", rc);
+ _virtio_scsi_dev_scan_finish(scan_ctx, rc);
+ }
+ }
+ }
+
+ return cnt;
+}
+
+static void
+bdev_virtio_tmf_cpl_cb(void *ctx)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx;
+
+ if (io_ctx->tmf_resp.response == VIRTIO_SCSI_S_OK) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+bdev_virtio_tmf_cpl(struct spdk_bdev_io *bdev_io)
+{
+ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_virtio_tmf_cpl_cb, bdev_io);
+}
+
+static void
+bdev_virtio_eventq_io_cpl(struct virtio_scsi_dev *svdev, struct virtio_scsi_eventq_io *io)
+{
+ struct virtio_scsi_event *ev = &io->ev;
+ struct virtio_scsi_disk *disk;
+
+ if (ev->lun[0] != 1) {
+ SPDK_WARNLOG("Received an event with invalid data layout.\n");
+ goto out;
+ }
+
+ if (ev->event & VIRTIO_SCSI_T_EVENTS_MISSED) {
+ ev->event &= ~VIRTIO_SCSI_T_EVENTS_MISSED;
+ virtio_scsi_dev_scan(svdev, NULL, NULL);
+ }
+
+ switch (ev->event) {
+ case VIRTIO_SCSI_T_NO_EVENT:
+ break;
+ case VIRTIO_SCSI_T_TRANSPORT_RESET:
+ switch (ev->reason) {
+ case VIRTIO_SCSI_EVT_RESET_RESCAN:
+ virtio_scsi_dev_scan_tgt(svdev, ev->lun[1]);
+ break;
+ case VIRTIO_SCSI_EVT_RESET_REMOVED:
+ disk = virtio_scsi_dev_get_disk_by_id(svdev, ev->lun[1]);
+ if (disk != NULL) {
+ spdk_bdev_unregister(&disk->bdev, NULL, NULL);
+ }
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+out:
+ virtio_scsi_dev_send_eventq_io(svdev->vdev.vqs[VIRTIO_SCSI_EVENTQ], io);
+}
+
+static void
+bdev_virtio_tmf_abort_nomem_cb(void *ctx)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+}
+
+static void
+bdev_virtio_tmf_abort_ioerr_cb(void *ctx)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+}
+
+static void
+bdev_virtio_tmf_abort(struct spdk_bdev_io *bdev_io, int status)
+{
+ spdk_msg_fn fn;
+
+ if (status == -ENOMEM) {
+ fn = bdev_virtio_tmf_abort_nomem_cb;
+ } else {
+ fn = bdev_virtio_tmf_abort_ioerr_cb;
+ }
+
+ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), fn, bdev_io);
+}
+
+static int
+bdev_virtio_send_tmf_io(struct virtqueue *ctrlq, struct spdk_bdev_io *bdev_io)
+{
+ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx;
+ int rc;
+
+ rc = virtqueue_req_start(ctrlq, bdev_io, 2);
+ if (rc != 0) {
+ return rc;
+ }
+
+ virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO);
+ virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR);
+
+ virtqueue_req_flush(ctrlq);
+ return 0;
+}
+
+static int
+bdev_virtio_mgmt_poll(void *arg)
+{
+ struct virtio_scsi_dev *svdev = arg;
+ struct virtio_dev *vdev = &svdev->vdev;
+ struct virtqueue *eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ];
+ struct virtqueue *ctrlq = vdev->vqs[VIRTIO_SCSI_CONTROLQ];
+ struct spdk_ring *send_ring = svdev->ctrlq_ring;
+ void *io[16];
+ uint32_t io_len[16];
+ uint16_t i, cnt;
+ int rc;
+ int total = 0;
+
+ cnt = spdk_ring_dequeue(send_ring, io, SPDK_COUNTOF(io));
+ total += cnt;
+ for (i = 0; i < cnt; ++i) {
+ rc = bdev_virtio_send_tmf_io(ctrlq, io[i]);
+ if (rc != 0) {
+ bdev_virtio_tmf_abort(io[i], rc);
+ }
+ }
+
+ cnt = virtio_recv_pkts(ctrlq, io, io_len, SPDK_COUNTOF(io));
+ total += cnt;
+ for (i = 0; i < cnt; ++i) {
+ bdev_virtio_tmf_cpl(io[i]);
+ }
+
+ cnt = virtio_recv_pkts(eventq, io, io_len, SPDK_COUNTOF(io));
+ total += cnt;
+ for (i = 0; i < cnt; ++i) {
+ bdev_virtio_eventq_io_cpl(svdev, io[i]);
+ }
+
+ return total;
+}
+
+static int
+bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf)
+{
+ struct virtio_scsi_dev *svdev = io_device;
+ struct virtio_dev *vdev = &svdev->vdev;
+ struct bdev_virtio_io_channel *ch = ctx_buf;
+ struct virtqueue *vq;
+ int32_t queue_idx;
+
+ queue_idx = virtio_dev_find_and_acquire_queue(vdev, VIRTIO_SCSI_REQUESTQ);
+ if (queue_idx < 0) {
+ SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n");
+ return -1;
+ }
+
+ vq = vdev->vqs[queue_idx];
+
+ ch->svdev = svdev;
+ ch->vq = vq;
+
+ ch->poller = SPDK_POLLER_REGISTER(bdev_virtio_poll, ch, 0);
+
+ return 0;
+}
+
+static void
+bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct bdev_virtio_io_channel *ch = ctx_buf;
+ struct virtio_scsi_dev *svdev = ch->svdev;
+ struct virtio_dev *vdev = &svdev->vdev;
+ struct virtqueue *vq = ch->vq;
+
+ spdk_poller_unregister(&ch->poller);
+ virtio_dev_release_queue(vdev, vq->vq_queue_index);
+}
+
+static void
+_virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum)
+{
+ struct virtio_scsi_dev *svdev = base->svdev;
+ size_t bdevs_cnt;
+ struct spdk_bdev *bdevs[BDEV_VIRTIO_MAX_TARGET];
+ struct virtio_scsi_disk *disk;
+ struct virtio_scsi_scan_info *tgt, *next_tgt;
+
+ spdk_put_io_channel(spdk_io_channel_from_ctx(base->channel));
+ base->svdev->scan_ctx = NULL;
+
+ TAILQ_FOREACH_SAFE(tgt, &base->scan_queue, tailq, next_tgt) {
+ TAILQ_REMOVE(&base->scan_queue, tgt, tailq);
+ free(tgt);
+ }
+
+ if (base->cb_fn == NULL) {
+ spdk_free(base);
+ return;
+ }
+
+ bdevs_cnt = 0;
+ if (errnum == 0) {
+ TAILQ_FOREACH(disk, &svdev->luns, link) {
+ bdevs[bdevs_cnt] = &disk->bdev;
+ bdevs_cnt++;
+ }
+ }
+
+ base->cb_fn(base->cb_arg, errnum, bdevs, bdevs_cnt);
+ spdk_free(base);
+}
+
+static int
+send_scan_io(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_io_ctx *io_ctx = &base->io_ctx;
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+ struct virtqueue *vq = base->channel->vq;
+ int payload_iov_cnt = base->iov.iov_len > 0 ? 1 : 0;
+ int rc;
+
+ req->lun[0] = 1;
+ req->lun[1] = base->info.target;
+
+ rc = virtqueue_req_start(vq, io_ctx, 2 + payload_iov_cnt);
+ if (rc != 0) {
+ base->needs_resend = true;
+ return -1;
+ }
+
+ virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO);
+ virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR);
+ virtqueue_req_add_iovs(vq, &base->iov, payload_iov_cnt, SPDK_VIRTIO_DESC_WR);
+
+ virtqueue_req_flush(vq);
+ return 0;
+}
+
+static int
+send_inquiry(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+ struct spdk_scsi_cdb_inquiry *cdb;
+
+ memset(req, 0, sizeof(*req));
+
+ base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE;
+ cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb;
+ cdb->opcode = SPDK_SPC_INQUIRY;
+ to_be16(cdb->alloc_len, BDEV_VIRTIO_SCAN_PAYLOAD_SIZE);
+
+ return send_scan_io(base);
+}
+
+static int
+send_inquiry_vpd(struct virtio_scsi_scan_base *base, uint8_t page_code)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+ struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb;
+
+ memset(req, 0, sizeof(*req));
+
+ base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE;
+ inquiry_cdb->opcode = SPDK_SPC_INQUIRY;
+ inquiry_cdb->evpd = 1;
+ inquiry_cdb->page_code = page_code;
+ to_be16(inquiry_cdb->alloc_len, base->iov.iov_len);
+
+ return send_scan_io(base);
+}
+
+static int
+send_read_cap_10(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+
+ memset(req, 0, sizeof(*req));
+
+ base->iov.iov_len = 8;
+ req->cdb[0] = SPDK_SBC_READ_CAPACITY_10;
+
+ return send_scan_io(base);
+}
+
+static int
+send_read_cap_16(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+
+ memset(req, 0, sizeof(*req));
+
+ base->iov.iov_len = 32;
+ req->cdb[0] = SPDK_SPC_SERVICE_ACTION_IN_16;
+ req->cdb[1] = SPDK_SBC_SAI_READ_CAPACITY_16;
+ to_be32(&req->cdb[10], base->iov.iov_len);
+
+ return send_scan_io(base);
+}
+
+static int
+send_test_unit_ready(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+
+ memset(req, 0, sizeof(*req));
+ req->cdb[0] = SPDK_SPC_TEST_UNIT_READY;
+ base->iov.iov_len = 0;
+
+ return send_scan_io(base);
+}
+
+static int
+send_start_stop_unit(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+
+ memset(req, 0, sizeof(*req));
+ req->cdb[0] = SPDK_SBC_START_STOP_UNIT;
+ req->cdb[4] = SPDK_SBC_START_STOP_UNIT_START_BIT;
+ base->iov.iov_len = 0;
+
+ return send_scan_io(base);
+}
+
+static int
+process_scan_start_stop_unit(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp;
+
+ if (resp->status == SPDK_SCSI_STATUS_GOOD) {
+ return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES);
+ }
+
+ return -1;
+}
+
+static int
+process_scan_test_unit_ready(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp;
+ int sk, asc, ascq;
+
+ get_scsi_status(resp, &sk, &asc, &ascq);
+
+	/* Check the response: fetch the VPD pages if the unit is spun up,
+	 * otherwise send START STOP UNIT. */
+ if (resp->status == SPDK_SCSI_STATUS_GOOD) {
+ return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES);
+ } else if (resp->response == VIRTIO_SCSI_S_OK &&
+ resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION &&
+ sk == SPDK_SCSI_SENSE_UNIT_ATTENTION &&
+ asc == SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY) {
+ return send_start_stop_unit(base);
+ } else {
+ return -1;
+ }
+}
+
+static int
+process_scan_inquiry_standard(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp;
+ struct spdk_scsi_cdb_inquiry_data *inquiry_data =
+ (struct spdk_scsi_cdb_inquiry_data *)base->payload;
+
+ if (resp->status != SPDK_SCSI_STATUS_GOOD) {
+ return -1;
+ }
+
+	/* Check to make sure it's a supported device. */
+ if (inquiry_data->peripheral_device_type != SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK ||
+ inquiry_data->peripheral_qualifier != SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED) {
+ SPDK_WARNLOG("Unsupported peripheral device type 0x%02x (qualifier 0x%02x)\n",
+ inquiry_data->peripheral_device_type,
+ inquiry_data->peripheral_qualifier);
+ return -1;
+ }
+
+ return send_test_unit_ready(base);
+}
+
+static int
+process_scan_inquiry_vpd_supported_vpd_pages(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp;
+ bool block_provisioning_page_supported = false;
+
+ if (resp->status == SPDK_SCSI_STATUS_GOOD) {
+ const uint8_t *vpd_data = base->payload;
+ const uint8_t *supported_vpd_pages = vpd_data + 4;
+ uint16_t page_length;
+ uint16_t num_supported_pages;
+ uint16_t i;
+
+ page_length = from_be16(vpd_data + 2);
+ num_supported_pages = spdk_min(page_length, base->iov.iov_len - 4);
+
+ for (i = 0; i < num_supported_pages; i++) {
+ if (supported_vpd_pages[i] == SPDK_SPC_VPD_BLOCK_THIN_PROVISION) {
+ block_provisioning_page_supported = true;
+ break;
+ }
+ }
+ }
+
+ if (block_provisioning_page_supported) {
+ return send_inquiry_vpd(base, SPDK_SPC_VPD_BLOCK_THIN_PROVISION);
+ } else {
+ return send_read_cap_10(base);
+ }
+}
+
+static int
+process_scan_inquiry_vpd_block_thin_provision(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp;
+
+ base->info.unmap_supported = false;
+
+ if (resp->status == SPDK_SCSI_STATUS_GOOD) {
+ uint8_t *vpd_data = base->payload;
+
+ base->info.unmap_supported = !!(vpd_data[5] & SPDK_SCSI_UNMAP_LBPU);
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VIRTIO, "Target %u: unmap supported = %d\n",
+ base->info.target, (int)base->info.unmap_supported);
+
+ return send_read_cap_10(base);
+}
+
+static int
+process_scan_inquiry(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+ struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb;
+
+ if ((inquiry_cdb->evpd & 1) == 0) {
+ return process_scan_inquiry_standard(base);
+ }
+
+ switch (inquiry_cdb->page_code) {
+ case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES:
+ return process_scan_inquiry_vpd_supported_vpd_pages(base);
+ case SPDK_SPC_VPD_BLOCK_THIN_PROVISION:
+ return process_scan_inquiry_vpd_block_thin_provision(base);
+ default:
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO, "Unexpected VPD page 0x%02x\n", inquiry_cdb->page_code);
+ return -1;
+ }
+}
+
+static void
+bdev_virtio_disc_notify_remove(void *remove_ctx)
+{
+ struct virtio_scsi_disk *disk = remove_ctx;
+
+ disk->removed = true;
+ spdk_bdev_close(disk->notify_desc);
+}
+
+/* To be called only from the thread performing the target scan. */
+static int
+virtio_scsi_dev_add_tgt(struct virtio_scsi_dev *svdev, struct virtio_scsi_scan_info *info)
+{
+ struct virtio_scsi_disk *disk;
+ struct spdk_bdev *bdev;
+ int rc;
+
+ TAILQ_FOREACH(disk, &svdev->luns, link) {
+ if (disk->info.target == info->target) {
+			/* The target is already attached; parameter changes are not supported. */
+ return 0;
+ }
+ }
+
+ if (info->block_size == 0 || info->num_blocks == 0) {
+ SPDK_ERRLOG("%s: invalid target %u: bs=%"PRIu32" blocks=%"PRIu64"\n",
+ svdev->vdev.name, info->target, info->block_size, info->num_blocks);
+ return -EINVAL;
+ }
+
+ disk = calloc(1, sizeof(*disk));
+ if (disk == NULL) {
+ SPDK_ERRLOG("could not allocate disk\n");
+ return -ENOMEM;
+ }
+
+ disk->svdev = svdev;
+ memcpy(&disk->info, info, sizeof(*info));
+
+ bdev = &disk->bdev;
+ bdev->name = spdk_sprintf_alloc("%st%"PRIu8, svdev->vdev.name, info->target);
+ if (bdev->name == NULL) {
+ SPDK_ERRLOG("Couldn't alloc memory for the bdev name.\n");
+ free(disk);
+ return -ENOMEM;
+ }
+
+ bdev->product_name = "Virtio SCSI Disk";
+ bdev->write_cache = 0;
+ bdev->blocklen = disk->info.block_size;
+ bdev->blockcnt = disk->info.num_blocks;
+
+ bdev->ctxt = disk;
+ bdev->fn_table = &virtio_fn_table;
+ bdev->module = &virtio_scsi_if;
+
+ rc = spdk_bdev_register(&disk->bdev);
+ if (rc) {
+ SPDK_ERRLOG("Failed to register bdev name=%s\n", disk->bdev.name);
+ free(bdev->name);
+ free(disk);
+ return rc;
+ }
+
+ rc = spdk_bdev_open(bdev, false, bdev_virtio_disc_notify_remove, disk, &disk->notify_desc);
+ if (rc) {
+ assert(false);
+ }
+
+ TAILQ_INSERT_TAIL(&svdev->luns, disk, link);
+ return 0;
+}
+
+static int
+process_read_cap_10(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp;
+ uint64_t max_block;
+ uint32_t block_size;
+ uint8_t target_id = req->lun[1];
+ int rc;
+
+ if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) {
+ SPDK_ERRLOG("READ CAPACITY (10) failed for target %"PRIu8".\n", target_id);
+ return -1;
+ }
+
+ block_size = from_be32(base->payload + 4);
+ max_block = from_be32(base->payload);
+
+ if (max_block == 0xffffffff) {
+ return send_read_cap_16(base);
+ }
+
+ base->info.num_blocks = (uint64_t)max_block + 1;
+ base->info.block_size = block_size;
+
+ rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info);
+ if (rc != 0) {
+ return rc;
+ }
+
+ return _virtio_scsi_dev_scan_next(base, 0);
+}
+
+static int
+process_read_cap_16(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp;
+ uint8_t target_id = req->lun[1];
+ int rc;
+
+ if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) {
+ SPDK_ERRLOG("READ CAPACITY (16) failed for target %"PRIu8".\n", target_id);
+ return -1;
+ }
+
+ base->info.num_blocks = from_be64(base->payload) + 1;
+ base->info.block_size = from_be32(base->payload + 8);
+ rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info);
+ if (rc != 0) {
+ return rc;
+ }
+
+ return _virtio_scsi_dev_scan_next(base, 0);
+}
+
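+/*
+ * Dispatch a scan I/O response. The per-target sequence is:
+ * INQUIRY -> TEST UNIT READY [-> START STOP UNIT] -> INQUIRY VPD
+ * (supported pages) [-> INQUIRY VPD (thin provisioning)] ->
+ * READ CAPACITY(10) [-> READ CAPACITY(16)] -> target registration.
+ */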
+static void
+process_scan_resp(struct virtio_scsi_scan_base *base)
+{
+ struct virtio_scsi_cmd_req *req = &base->io_ctx.req;
+ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp;
+ int rc, sk, asc, ascq;
+ uint8_t target_id;
+
+ if (base->io_ctx.iov_req.iov_len < sizeof(struct virtio_scsi_cmd_req) ||
+ base->io_ctx.iov_resp.iov_len < sizeof(struct virtio_scsi_cmd_resp)) {
+ SPDK_ERRLOG("Received target scan message with invalid length.\n");
+ _virtio_scsi_dev_scan_next(base, -EIO);
+ return;
+ }
+
+ get_scsi_status(resp, &sk, &asc, &ascq);
+ target_id = req->lun[1];
+
+ if (resp->response == VIRTIO_SCSI_S_BAD_TARGET ||
+ resp->response == VIRTIO_SCSI_S_INCORRECT_LUN) {
+ _virtio_scsi_dev_scan_next(base, -ENODEV);
+ return;
+ }
+
+ if (resp->response != VIRTIO_SCSI_S_OK ||
+ (resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION &&
+ sk != SPDK_SCSI_SENSE_ILLEGAL_REQUEST)) {
+ assert(base->retries > 0);
+ base->retries--;
+ if (base->retries == 0) {
+ SPDK_NOTICELOG("Target %"PRIu8" is present, but unavailable.\n", target_id);
+ SPDK_LOGDUMP(SPDK_LOG_VIRTIO, "CDB", req->cdb, sizeof(req->cdb));
+ SPDK_LOGDUMP(SPDK_LOG_VIRTIO, "SENSE DATA", resp->sense, sizeof(resp->sense));
+ _virtio_scsi_dev_scan_next(base, -EBUSY);
+ return;
+ }
+
+ /* resend the same request */
+ rc = send_scan_io(base);
+ if (rc != 0) {
+			/* send_scan_io() set needs_resend; the response poller will retry. */
+ }
+ return;
+ }
+
+ base->retries = SCAN_REQUEST_RETRIES;
+
+ switch (req->cdb[0]) {
+ case SPDK_SPC_INQUIRY:
+ rc = process_scan_inquiry(base);
+ break;
+ case SPDK_SPC_TEST_UNIT_READY:
+ rc = process_scan_test_unit_ready(base);
+ break;
+ case SPDK_SBC_START_STOP_UNIT:
+ rc = process_scan_start_stop_unit(base);
+ break;
+ case SPDK_SBC_READ_CAPACITY_10:
+ rc = process_read_cap_10(base);
+ break;
+ case SPDK_SPC_SERVICE_ACTION_IN_16:
+ rc = process_read_cap_16(base);
+ break;
+ default:
+ SPDK_ERRLOG("Received invalid target scan message: cdb[0] = %"PRIu8".\n", req->cdb[0]);
+ rc = -1;
+ break;
+ }
+
+ if (rc != 0) {
+ if (base->needs_resend) {
+ return; /* Let response poller do the resend */
+ }
+
+ _virtio_scsi_dev_scan_next(base, rc);
+ }
+}
+
+static int
+_virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc)
+{
+ struct virtio_scsi_scan_info *next;
+ struct virtio_scsi_disk *disk;
+ uint8_t target_id;
+
+ if (base->full_scan) {
+ if (rc != 0) {
+ disk = virtio_scsi_dev_get_disk_by_id(base->svdev,
+ base->info.target);
+ if (disk != NULL) {
+ spdk_bdev_unregister(&disk->bdev, NULL, NULL);
+ }
+ }
+
+ target_id = base->info.target + 1;
+ if (target_id < BDEV_VIRTIO_MAX_TARGET) {
+ _virtio_scsi_dev_scan_tgt(base, target_id);
+ return 0;
+ }
+
+ base->full_scan = false;
+ }
+
+ next = TAILQ_FIRST(&base->scan_queue);
+ if (next == NULL) {
+ _virtio_scsi_dev_scan_finish(base, 0);
+ return 0;
+ }
+
+ TAILQ_REMOVE(&base->scan_queue, next, tailq);
+ target_id = next->target;
+ free(next);
+
+ _virtio_scsi_dev_scan_tgt(base, target_id);
+ return 0;
+}
+
+static int
+virtio_pci_scsi_dev_enumerate_cb(struct virtio_pci_ctx *pci_ctx, void *ctx)
+{
+ struct virtio_scsi_dev *svdev;
+
+ svdev = virtio_pci_scsi_dev_create(NULL, pci_ctx);
+ return svdev == NULL ? -1 : 0;
+}
+
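+/*
+ * Legacy INI config parsing for the SCSI module. SCSI is the default
+ * type, so a section without an explicit Type entry, e.g. (illustrative
+ * values only):
+ *
+ *   [VirtioUser1]
+ *     Path /tmp/vhost.1
+ *     Queues 2
+ *
+ * also ends up here. Queues is clamped to SPDK_VIRTIO_MAX_VIRTQUEUES.
+ */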
+static int
+bdev_virtio_process_config(void)
+{
+ struct spdk_conf_section *sp;
+ struct virtio_scsi_dev *svdev;
+ char *default_name = NULL;
+ char *path, *type, *name;
+ unsigned vdev_num;
+ int num_queues;
+ bool enable_pci;
+ int rc = 0;
+
+ for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
+ if (!spdk_conf_section_match_prefix(sp, "VirtioUser")) {
+ continue;
+ }
+
+ if (sscanf(spdk_conf_section_get_name(sp), "VirtioUser%u", &vdev_num) != 1) {
+ SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+ spdk_conf_section_get_name(sp));
+ rc = -1;
+ goto out;
+ }
+
+ path = spdk_conf_section_get_val(sp, "Path");
+ if (path == NULL) {
+ SPDK_ERRLOG("VirtioUser%u: missing Path\n", vdev_num);
+ rc = -1;
+ goto out;
+ }
+
+ type = spdk_conf_section_get_val(sp, "Type");
+ if (type != NULL && strcmp(type, "SCSI") != 0) {
+ continue;
+ }
+
+ num_queues = spdk_conf_section_get_intval(sp, "Queues");
+ if (num_queues < 1) {
+ num_queues = 1;
+ } else if (num_queues > SPDK_VIRTIO_MAX_VIRTQUEUES) {
+ num_queues = SPDK_VIRTIO_MAX_VIRTQUEUES;
+ }
+
+ name = spdk_conf_section_get_val(sp, "Name");
+ if (name == NULL) {
+ default_name = spdk_sprintf_alloc("VirtioScsi%u", vdev_num);
+ name = default_name;
+ }
+
+ svdev = virtio_user_scsi_dev_create(name, path, num_queues, 512);
+ free(default_name);
+ default_name = NULL;
+
+ if (svdev == NULL) {
+ rc = -1;
+ goto out;
+ }
+ }
+
+ sp = spdk_conf_find_section(NULL, "VirtioPci");
+ if (sp == NULL) {
+ return 0;
+ }
+
+ enable_pci = spdk_conf_section_get_boolval(sp, "Enable", false);
+ if (enable_pci) {
+ rc = virtio_pci_dev_enumerate(virtio_pci_scsi_dev_enumerate_cb, NULL,
+ PCI_DEVICE_ID_VIRTIO_SCSI_MODERN);
+ }
+
+out:
+ return rc;
+}
+
+static int
+_virtio_scsi_dev_scan_init(struct virtio_scsi_dev *svdev)
+{
+ struct virtio_scsi_scan_base *base;
+ struct spdk_io_channel *io_ch;
+ struct virtio_scsi_io_ctx *io_ctx;
+ struct virtio_scsi_cmd_req *req;
+ struct virtio_scsi_cmd_resp *resp;
+
+ io_ch = spdk_get_io_channel(svdev);
+ if (io_ch == NULL) {
+ return -EBUSY;
+ }
+
+ base = spdk_zmalloc(sizeof(*base), 64, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (base == NULL) {
+		SPDK_ERRLOG("couldn't allocate memory for SCSI target scan.\n");
+		spdk_put_io_channel(io_ch);
+		return -ENOMEM;
+ }
+
+ base->svdev = svdev;
+
+ base->channel = spdk_io_channel_get_ctx(io_ch);
+ TAILQ_INIT(&base->scan_queue);
+ svdev->scan_ctx = base;
+
+ base->iov.iov_base = base->payload;
+ io_ctx = &base->io_ctx;
+ req = &io_ctx->req;
+ resp = &io_ctx->resp;
+ io_ctx->iov_req.iov_base = req;
+ io_ctx->iov_req.iov_len = sizeof(*req);
+ io_ctx->iov_resp.iov_base = resp;
+ io_ctx->iov_resp.iov_len = sizeof(*resp);
+
+ base->retries = SCAN_REQUEST_RETRIES;
+ return 0;
+}
+
+static void
+_virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target)
+{
+ int rc;
+
+ memset(&base->info, 0, sizeof(base->info));
+ base->info.target = target;
+
+ rc = send_inquiry(base);
+ if (rc) {
+ /* Let response poller do the resend */
+ }
+}
+
+static int
+virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev, bdev_virtio_create_cb cb_fn,
+ void *cb_arg)
+{
+ struct virtio_scsi_scan_base *base;
+ struct virtio_scsi_scan_info *tgt, *next_tgt;
+ int rc;
+
+ if (svdev->scan_ctx) {
+ if (svdev->scan_ctx->full_scan) {
+ return -EEXIST;
+ }
+
+ /* We're about to start a full rescan, so there's no need
+ * to scan particular targets afterwards.
+ */
+ TAILQ_FOREACH_SAFE(tgt, &svdev->scan_ctx->scan_queue, tailq, next_tgt) {
+ TAILQ_REMOVE(&svdev->scan_ctx->scan_queue, tgt, tailq);
+ free(tgt);
+ }
+
+ svdev->scan_ctx->cb_fn = cb_fn;
+ svdev->scan_ctx->cb_arg = cb_arg;
+ svdev->scan_ctx->restart = true;
+ return 0;
+ }
+
+ rc = _virtio_scsi_dev_scan_init(svdev);
+ if (rc != 0) {
+ return rc;
+ }
+
+ base = svdev->scan_ctx;
+ base->cb_fn = cb_fn;
+ base->cb_arg = cb_arg;
+ base->full_scan = true;
+
+ _virtio_scsi_dev_scan_tgt(base, 0);
+ return 0;
+}
+
+static int
+virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target)
+{
+ struct virtio_scsi_scan_base *base;
+ struct virtio_scsi_scan_info *info;
+ int rc;
+
+ base = svdev->scan_ctx;
+ if (base) {
+ info = calloc(1, sizeof(*info));
+ if (info == NULL) {
+ SPDK_ERRLOG("calloc failed\n");
+ return -ENOMEM;
+ }
+
+ info->target = target;
+ TAILQ_INSERT_TAIL(&base->scan_queue, info, tailq);
+ return 0;
+ }
+
+ rc = _virtio_scsi_dev_scan_init(svdev);
+ if (rc != 0) {
+ return rc;
+ }
+
+ base = svdev->scan_ctx;
+ base->full_scan = true;
+ _virtio_scsi_dev_scan_tgt(base, target);
+ return 0;
+}
+
+static void
+bdev_virtio_initial_scan_complete(void *ctx, int result,
+ struct spdk_bdev **bdevs, size_t bdevs_cnt)
+{
+ struct virtio_scsi_dev *svdev;
+
+ pthread_mutex_lock(&g_virtio_scsi_mutex);
+ TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) {
+ if (svdev->scan_ctx) {
+ /* another device is still being scanned */
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+ return;
+ }
+ }
+
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+ spdk_bdev_module_init_done(&virtio_scsi_if);
+}
+
+static int
+bdev_virtio_initialize(void)
+{
+ struct virtio_scsi_dev *svdev, *next_svdev;
+ int rc;
+
+ rc = bdev_virtio_process_config();
+ pthread_mutex_lock(&g_virtio_scsi_mutex);
+
+ if (rc != 0) {
+ goto err_unlock;
+ }
+
+ if (TAILQ_EMPTY(&g_virtio_scsi_devs)) {
+ goto out_unlock;
+ }
+
+ /* Initialize all created devices and scan available targets */
+ TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) {
+ rc = virtio_scsi_dev_scan(svdev, bdev_virtio_initial_scan_complete, NULL);
+ if (rc != 0) {
+ goto err_unlock;
+ }
+ }
+
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+ return 0;
+
+err_unlock:
+ /* Remove any created devices */
+ TAILQ_FOREACH_SAFE(svdev, &g_virtio_scsi_devs, tailq, next_svdev) {
+ virtio_scsi_dev_remove(svdev, NULL, NULL);
+ }
+
+out_unlock:
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+ spdk_bdev_module_init_done(&virtio_scsi_if);
+ return rc;
+}
+
+static void
+_virtio_scsi_dev_unregister_cb(void *io_device)
+{
+ struct virtio_scsi_dev *svdev = io_device;
+ struct virtio_dev *vdev = &svdev->vdev;
+ bool finish_module;
+ bdev_virtio_remove_cb remove_cb;
+ void *remove_ctx;
+
+ assert(spdk_ring_count(svdev->ctrlq_ring) == 0);
+ spdk_ring_free(svdev->ctrlq_ring);
+ spdk_poller_unregister(&svdev->mgmt_poller);
+
+ virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ);
+ virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ);
+
+ virtio_dev_stop(vdev);
+ virtio_dev_destruct(vdev);
+
+ pthread_mutex_lock(&g_virtio_scsi_mutex);
+ TAILQ_REMOVE(&g_virtio_scsi_devs, svdev, tailq);
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+
+ remove_cb = svdev->remove_cb;
+ remove_ctx = svdev->remove_ctx;
+ spdk_free(svdev->eventq_ios);
+ free(svdev);
+
+ if (remove_cb) {
+ remove_cb(remove_ctx, 0);
+ }
+
+ finish_module = TAILQ_EMPTY(&g_virtio_scsi_devs);
+
+ if (g_bdev_virtio_finish && finish_module) {
+ spdk_bdev_module_finish_done();
+ }
+}
+
+static void
+virtio_scsi_dev_unregister_cb(void *io_device)
+{
+ struct virtio_scsi_dev *svdev = io_device;
+ struct spdk_thread *thread;
+
+ thread = virtio_dev_queue_get_thread(&svdev->vdev, VIRTIO_SCSI_CONTROLQ);
+ spdk_thread_send_msg(thread, _virtio_scsi_dev_unregister_cb, io_device);
+}
+
+static void
+virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev,
+ bdev_virtio_remove_cb cb_fn, void *cb_arg)
+{
+ struct virtio_scsi_disk *disk, *disk_tmp;
+ bool do_remove = true;
+
+ if (svdev->removed) {
+ if (cb_fn) {
+ cb_fn(cb_arg, -EBUSY);
+ }
+ return;
+ }
+
+ svdev->remove_cb = cb_fn;
+ svdev->remove_ctx = cb_arg;
+ svdev->removed = true;
+
+ if (svdev->scan_ctx) {
+		/* The removal will continue once the pending scan I/O completes. */
+ return;
+ }
+
+ TAILQ_FOREACH_SAFE(disk, &svdev->luns, link, disk_tmp) {
+ if (!disk->removed) {
+ spdk_bdev_unregister(&disk->bdev, NULL, NULL);
+ }
+ do_remove = false;
+ }
+
+ if (do_remove) {
+ spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb);
+ }
+}
+
+static void
+bdev_virtio_finish(void)
+{
+ struct virtio_scsi_dev *svdev, *next;
+
+ g_bdev_virtio_finish = true;
+
+ pthread_mutex_lock(&g_virtio_scsi_mutex);
+ if (TAILQ_EMPTY(&g_virtio_scsi_devs)) {
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+ spdk_bdev_module_finish_done();
+ return;
+ }
+
+ /* Defer module finish until all controllers are removed. */
+ TAILQ_FOREACH_SAFE(svdev, &g_virtio_scsi_devs, tailq, next) {
+ virtio_scsi_dev_remove(svdev, NULL, NULL);
+ }
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+}
+
+int
+bdev_virtio_user_scsi_dev_create(const char *base_name, const char *path,
+ unsigned num_queues, unsigned queue_size,
+ bdev_virtio_create_cb cb_fn, void *cb_arg)
+{
+ struct virtio_scsi_dev *svdev;
+ int rc;
+
+ svdev = virtio_user_scsi_dev_create(base_name, path, num_queues, queue_size);
+ if (svdev == NULL) {
+ return -1;
+ }
+
+ rc = virtio_scsi_dev_scan(svdev, cb_fn, cb_arg);
+ if (rc) {
+ virtio_scsi_dev_remove(svdev, NULL, NULL);
+ }
+
+ return rc;
+}
+
+struct bdev_virtio_pci_dev_create_ctx {
+ const char *name;
+ bdev_virtio_create_cb cb_fn;
+ void *cb_arg;
+};
+
+static int
+bdev_virtio_pci_scsi_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx)
+{
+ struct virtio_scsi_dev *svdev;
+ struct bdev_virtio_pci_dev_create_ctx *create_ctx = ctx;
+ int rc;
+
+ svdev = virtio_pci_scsi_dev_create(create_ctx->name, pci_ctx);
+ if (svdev == NULL) {
+ return -1;
+ }
+
+ rc = virtio_scsi_dev_scan(svdev, create_ctx->cb_fn, create_ctx->cb_arg);
+ if (rc) {
+ virtio_scsi_dev_remove(svdev, NULL, NULL);
+ }
+
+ return rc;
+}
+
+int
+bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr,
+ bdev_virtio_create_cb cb_fn, void *cb_arg)
+{
+ struct bdev_virtio_pci_dev_create_ctx create_ctx;
+
+ create_ctx.name = name;
+ create_ctx.cb_fn = cb_fn;
+ create_ctx.cb_arg = cb_arg;
+
+ return virtio_pci_dev_attach(bdev_virtio_pci_scsi_dev_create_cb, &create_ctx,
+ PCI_DEVICE_ID_VIRTIO_SCSI_MODERN, pci_addr);
+}
+
+int
+bdev_virtio_scsi_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg)
+{
+ struct virtio_scsi_dev *svdev;
+
+ pthread_mutex_lock(&g_virtio_scsi_mutex);
+ TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) {
+ if (strcmp(svdev->vdev.name, name) == 0) {
+ break;
+ }
+ }
+
+ if (svdev == NULL) {
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+ SPDK_ERRLOG("Cannot find Virtio-SCSI device named '%s'\n", name);
+ return -ENODEV;
+ }
+
+ virtio_scsi_dev_remove(svdev, cb_fn, cb_arg);
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+
+ return 0;
+}
+
+void
+bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *w)
+{
+ struct virtio_scsi_dev *svdev;
+
+ spdk_json_write_array_begin(w);
+
+ pthread_mutex_lock(&g_virtio_scsi_mutex);
+ TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "name", svdev->vdev.name);
+
+ virtio_dev_dump_json_info(&svdev->vdev, w);
+
+ spdk_json_write_object_end(w);
+ }
+ pthread_mutex_unlock(&g_virtio_scsi_mutex);
+
+ spdk_json_write_array_end(w);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("virtio", SPDK_LOG_VIRTIO)
diff --git a/src/spdk/module/bdev/zone_block/Makefile b/src/spdk/module/bdev/zone_block/Makefile
new file mode 100644
index 000000000..3dec8a37d
--- /dev/null
+++ b/src/spdk/module/bdev/zone_block/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = vbdev_zone_block.c vbdev_zone_block_rpc.c
+LIBNAME = bdev_zone_block
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/bdev/zone_block/vbdev_zone_block.c b/src/spdk/module/bdev/zone_block/vbdev_zone_block.c
new file mode 100644
index 000000000..fb8b92fd2
--- /dev/null
+++ b/src/spdk/module/bdev/zone_block/vbdev_zone_block.c
@@ -0,0 +1,916 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "vbdev_zone_block.h"
+
+#include "spdk/config.h"
+#include "spdk/nvme.h"
+#include "spdk/bdev_zone.h"
+
+#include "spdk_internal/log.h"
+
+static int zone_block_init(void);
+static int zone_block_get_ctx_size(void);
+static void zone_block_finish(void);
+static int zone_block_config_json(struct spdk_json_write_ctx *w);
+static void zone_block_examine(struct spdk_bdev *bdev);
+
+static struct spdk_bdev_module bdev_zoned_if = {
+ .name = "bdev_zoned_block",
+ .module_init = zone_block_init,
+ .module_fini = zone_block_finish,
+ .config_text = NULL,
+ .config_json = zone_block_config_json,
+ .examine_config = zone_block_examine,
+ .get_ctx_size = zone_block_get_ctx_size,
+};
+
+SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)
+
+/* List of block vbdev names and their base bdevs, as given via the
+ * configuration. Used so we can parse the conf once at init and consult this
+ * list in examine().
+ */
+struct bdev_zone_block_config {
+ char *vbdev_name;
+ char *bdev_name;
+ uint64_t zone_capacity;
+ uint64_t optimal_open_zones;
+ TAILQ_ENTRY(bdev_zone_block_config) link;
+};
+static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);
+
+struct block_zone {
+ struct spdk_bdev_zone_info zone_info;
+ pthread_spinlock_t lock;
+};
+
+/* List of block vbdevs and associated info for each. */
+struct bdev_zone_block {
+ struct spdk_bdev bdev; /* the block zoned bdev */
+ struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
+ struct block_zone *zones; /* array of zones */
+ uint64_t num_zones; /* number of zones */
+ uint64_t zone_capacity; /* zone capacity */
+ uint64_t zone_shift; /* log2 of zone_size */
+ TAILQ_ENTRY(bdev_zone_block) link;
+ struct spdk_thread *thread; /* thread where base device is opened */
+};
+static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);
+
+struct zone_block_io_channel {
+ struct spdk_io_channel *base_ch; /* IO channel of base device */
+};
+
+struct zone_block_io {
+ /* vbdev to which IO was issued */
+ struct bdev_zone_block *bdev_zone_block;
+};
+
+static int
+zone_block_init(void)
+{
+ return 0;
+}
+
+static void
+zone_block_remove_config(struct bdev_zone_block_config *name)
+{
+ TAILQ_REMOVE(&g_bdev_configs, name, link);
+ free(name->bdev_name);
+ free(name->vbdev_name);
+ free(name);
+}
+
+static void
+zone_block_finish(void)
+{
+ struct bdev_zone_block_config *name;
+
+ while ((name = TAILQ_FIRST(&g_bdev_configs))) {
+ zone_block_remove_config(name);
+ }
+}
+
+static int
+zone_block_get_ctx_size(void)
+{
+ return sizeof(struct zone_block_io);
+}
+
+static int
+zone_block_config_json(struct spdk_json_write_ctx *w)
+{
+ struct bdev_zone_block *bdev_node;
+ struct spdk_bdev *base_bdev = NULL;
+
+ TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
+ base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
+ spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
+ spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ }
+
+ return 0;
+}
+
+/* Callback for unregistering the IO device. */
+static void
+_device_unregister_cb(void *io_device)
+{
+ struct bdev_zone_block *bdev_node = io_device;
+ uint64_t i;
+
+ free(bdev_node->bdev.name);
+ for (i = 0; i < bdev_node->num_zones; i++) {
+ pthread_spin_destroy(&bdev_node->zones[i].lock);
+ }
+ free(bdev_node->zones);
+ free(bdev_node);
+}
+
+static void
+_zone_block_destruct(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+static int
+zone_block_destruct(void *ctx)
+{
+ struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
+
+ TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
+
+ /* Unclaim the underlying bdev. */
+ spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));
+
+ /* Close the underlying bdev on its same opened thread. */
+ if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
+ } else {
+ spdk_bdev_close(bdev_node->base_desc);
+ }
+
+ /* Unregister the io_device. */
+ spdk_io_device_unregister(bdev_node, _device_unregister_cb);
+
+ return 0;
+}
+
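+/* An LBA maps to its zone by a simple shift, which works because the emulated
+ * zone size is rounded up to a power of two at creation time (see
+ * zone_block_register()). Illustrative example: with zone_size = 0x1000
+ * (zone_shift = 12), lba 0x2abc falls into zone index 2. */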
+static struct block_zone *
+zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
+{
+ size_t index = lba >> bdev_node->zone_shift;
+
+ if (index >= bdev_node->num_zones) {
+ return NULL;
+ }
+
+ return &bdev_node->zones[index];
+}
+
+static struct block_zone *
+zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
+{
+ struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);
+
+ if (zone && zone->zone_info.zone_id == start_lba) {
+ return zone;
+ } else {
+ return NULL;
+ }
+}
+
+static int
+zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
+{
+ struct block_zone *zone;
+ struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
+ uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
+ size_t i;
+
+	/* The user can request info for more zones than exist; check both the
+	 * internal and the user-supplied boundaries.
+ */
+ for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
+ zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
+ if (!zone) {
+ return -EINVAL;
+ }
+ memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
+ }
+
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+}
+
+static int
+zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
+{
+ pthread_spin_lock(&zone->lock);
+
+ switch (zone->zone_info.state) {
+ case SPDK_BDEV_ZONE_STATE_EMPTY:
+ case SPDK_BDEV_ZONE_STATE_OPEN:
+ case SPDK_BDEV_ZONE_STATE_CLOSED:
+ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
+ pthread_spin_unlock(&zone->lock);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+ default:
+ pthread_spin_unlock(&zone->lock);
+ return -EINVAL;
+ }
+}
+
+static void
+_zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+
+ /* Complete the original IO and then free the one that we created here
+	 * as a result of issuing an IO via submit_request.
+ */
+ spdk_bdev_io_complete(orig_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static int
+zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
+ struct block_zone *zone, struct spdk_bdev_io *bdev_io)
+{
+ pthread_spin_lock(&zone->lock);
+
+ switch (zone->zone_info.state) {
+ case SPDK_BDEV_ZONE_STATE_EMPTY:
+ pthread_spin_unlock(&zone->lock);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+ case SPDK_BDEV_ZONE_STATE_OPEN:
+ case SPDK_BDEV_ZONE_STATE_FULL:
+ case SPDK_BDEV_ZONE_STATE_CLOSED:
+ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
+ zone->zone_info.write_pointer = zone->zone_info.zone_id;
+ pthread_spin_unlock(&zone->lock);
+ return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
+ zone->zone_info.zone_id, zone->zone_info.capacity,
+ _zone_block_complete_unmap, bdev_io);
+ default:
+ pthread_spin_unlock(&zone->lock);
+ return -EINVAL;
+ }
+}
+
+static int
+zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
+{
+ pthread_spin_lock(&zone->lock);
+
+ switch (zone->zone_info.state) {
+ case SPDK_BDEV_ZONE_STATE_OPEN:
+ case SPDK_BDEV_ZONE_STATE_CLOSED:
+ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
+ pthread_spin_unlock(&zone->lock);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+ default:
+ pthread_spin_unlock(&zone->lock);
+ return -EINVAL;
+ }
+}
+
+static int
+zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
+{
+ pthread_spin_lock(&zone->lock);
+
+ zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
+ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
+
+ pthread_spin_unlock(&zone->lock);
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ return 0;
+}
+
+static int
+zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
+ struct spdk_bdev_io *bdev_io)
+{
+ struct block_zone *zone;
+
+ zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
+ if (!zone) {
+ return -EINVAL;
+ }
+
+ switch (bdev_io->u.zone_mgmt.zone_action) {
+ case SPDK_BDEV_ZONE_RESET:
+ return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
+ case SPDK_BDEV_ZONE_OPEN:
+ return zone_block_open_zone(zone, bdev_io);
+ case SPDK_BDEV_ZONE_CLOSE:
+ return zone_block_close_zone(zone, bdev_io);
+ case SPDK_BDEV_ZONE_FINISH:
+ return zone_block_finish_zone(zone, bdev_io);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void
+_zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+
+ if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
+ orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
+ }
+
+ /* Complete the original IO and then free the one that we created here
+	 * as a result of issuing an IO via submit_request.
+ */
+ spdk_bdev_io_complete(orig_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static int
+zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
+ struct spdk_bdev_io *bdev_io)
+{
+ struct block_zone *zone;
+ uint64_t len = bdev_io->u.bdev.num_blocks;
+ uint64_t lba = bdev_io->u.bdev.offset_blocks;
+ uint64_t num_blocks_left, wp;
+ int rc = 0;
+ bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;
+
+ if (is_append) {
+ zone = zone_block_get_zone_by_slba(bdev_node, lba);
+ } else {
+ zone = zone_block_get_zone_containing_lba(bdev_node, lba);
+ }
+ if (!zone) {
+ SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%lx)\n", lba);
+ return -EINVAL;
+ }
+
+ pthread_spin_lock(&zone->lock);
+
+ switch (zone->zone_info.state) {
+ case SPDK_BDEV_ZONE_STATE_OPEN:
+ case SPDK_BDEV_ZONE_STATE_EMPTY:
+ case SPDK_BDEV_ZONE_STATE_CLOSED:
+ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
+ break;
+ default:
+ SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
+ rc = -EINVAL;
+ goto write_fail;
+ }
+
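+	/* For zone append the caller addresses the zone by its start LBA and the
+	 * data lands at the current write pointer; the resulting offset is
+	 * reported back via offset_blocks in _zone_block_complete_write(). */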
+ wp = zone->zone_info.write_pointer;
+ if (is_append) {
+ lba = wp;
+ } else {
+ if (lba != wp) {
+ SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%lx, wp 0x%lx)\n", lba, wp);
+ rc = -EINVAL;
+ goto write_fail;
+ }
+ }
+
+ num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
+ if (len > num_blocks_left) {
+		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%lx, wp 0x%lx)\n", lba, len, wp);
+ rc = -EINVAL;
+ goto write_fail;
+ }
+
+ zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
+ assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
+ if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
+ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
+ }
+ pthread_spin_unlock(&zone->lock);
+
+ if (bdev_io->u.bdev.md_buf == NULL) {
+ rc = spdk_bdev_writev_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, lba,
+ bdev_io->u.bdev.num_blocks, _zone_block_complete_write,
+ bdev_io);
+ } else {
+ rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
+ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf,
+ lba, bdev_io->u.bdev.num_blocks,
+ _zone_block_complete_write, bdev_io);
+ }
+
+ return rc;
+
+write_fail:
+ pthread_spin_unlock(&zone->lock);
+ return rc;
+}
+
+static void
+_zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *orig_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+
+ /* Complete the original IO and then free the one that we created here
+	 * as a result of issuing an IO via submit_request.
+ */
+ spdk_bdev_io_complete(orig_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static int
+zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
+ struct spdk_bdev_io *bdev_io)
+{
+ struct block_zone *zone;
+ uint64_t len = bdev_io->u.bdev.num_blocks;
+ uint64_t lba = bdev_io->u.bdev.offset_blocks;
+ int rc;
+
+ zone = zone_block_get_zone_containing_lba(bdev_node, lba);
+ if (!zone) {
+ SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%lx)\n", lba);
+ return -EINVAL;
+ }
+
+ if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
+ SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%lx, len 0x%lx)\n", lba, len);
+ return -EINVAL;
+ }
+
+ if (bdev_io->u.bdev.md_buf == NULL) {
+ rc = spdk_bdev_readv_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, lba,
+ len, _zone_block_complete_read,
+ bdev_io);
+ } else {
+ rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
+ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf,
+ lba, len,
+ _zone_block_complete_read, bdev_io);
+ }
+
+ return rc;
+}
+
+static void
+zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
+ struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
+ int rc = 0;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
+ rc = zone_block_get_zone_info(bdev_node, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
+ rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
+ rc = zone_block_write(bdev_node, dev_ch, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_READ:
+ rc = zone_block_read(bdev_node, dev_ch, bdev_io);
+ break;
+ default:
+ SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type);
+ rc = -ENOTSUP;
+ break;
+ }
+
+ if (rc != 0) {
+ if (rc == -ENOMEM) {
+			SPDK_WARNLOG("ENOMEM, completing io with NOMEM status for the bdev layer to retry.\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
+ } else {
+ SPDK_ERRLOG("ERROR on bdev_io submission!\n");
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+ }
+}
+
+static bool
+zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
+{
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static struct spdk_io_channel *
+zone_block_get_io_channel(void *ctx)
+{
+ struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
+
+ return spdk_get_io_channel(bdev_node);
+}
+
+static int
+zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
+{
+ struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
+ struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
+
+ spdk_json_write_name(w, "zoned_block");
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
+ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
+ spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
+ spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
+ spdk_json_write_object_end(w);
+
+ return 0;
+}
+
+/* When we register our vbdev this is how we specify our entry points. */
+static const struct spdk_bdev_fn_table zone_block_fn_table = {
+ .destruct = zone_block_destruct,
+ .submit_request = zone_block_submit_request,
+ .io_type_supported = zone_block_io_type_supported,
+ .get_io_channel = zone_block_get_io_channel,
+ .dump_info_json = zone_block_dump_info_json,
+};
+
+static void
+zone_block_base_bdev_hotremove_cb(void *ctx)
+{
+ struct bdev_zone_block *bdev_node, *tmp;
+ struct spdk_bdev *bdev_find = ctx;
+
+ TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
+ if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
+ spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
+ }
+ }
+}
+
+static int
+_zone_block_ch_create_cb(void *io_device, void *ctx_buf)
+{
+ struct zone_block_io_channel *bdev_ch = ctx_buf;
+ struct bdev_zone_block *bdev_node = io_device;
+
+ bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
+ if (!bdev_ch->base_ch) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void
+_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct zone_block_io_channel *bdev_ch = ctx_buf;
+
+ spdk_put_io_channel(bdev_ch->base_ch);
+}
+
+static int
+zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
+ uint64_t optimal_open_zones)
+{
+ struct bdev_zone_block_config *name;
+
+ TAILQ_FOREACH(name, &g_bdev_configs, link) {
+ if (strcmp(vbdev_name, name->vbdev_name) == 0) {
+ SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
+ return -EEXIST;
+ }
+ if (strcmp(bdev_name, name->bdev_name) == 0) {
+ SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
+ return -EEXIST;
+ }
+ }
+
+ name = calloc(1, sizeof(*name));
+ if (!name) {
+ SPDK_ERRLOG("could not allocate bdev_names\n");
+ return -ENOMEM;
+ }
+
+ name->bdev_name = strdup(bdev_name);
+ if (!name->bdev_name) {
+ SPDK_ERRLOG("could not allocate name->bdev_name\n");
+ free(name);
+ return -ENOMEM;
+ }
+
+ name->vbdev_name = strdup(vbdev_name);
+ if (!name->vbdev_name) {
+ SPDK_ERRLOG("could not allocate name->vbdev_name\n");
+ free(name->bdev_name);
+ free(name);
+ return -ENOMEM;
+ }
+
+ name->zone_capacity = zone_capacity;
+ name->optimal_open_zones = optimal_open_zones;
+
+ TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);
+
+ return 0;
+}
+
+static int
+zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
+{
+ size_t i;
+ struct block_zone *zone;
+ int rc = 0;
+
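+	/* Zones start out FULL with the write pointer at the end of the zone:
+	 * the base bdev's existing contents are unknown, so each zone must be
+	 * reset before it can be written (an inference from this code, not from
+	 * upstream documentation). */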
+ for (i = 0; i < bdev_node->num_zones; i++) {
+ zone = &bdev_node->zones[i];
+ zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
+ zone->zone_info.capacity = bdev_node->zone_capacity;
+ zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
+ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
+ if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
+ SPDK_ERRLOG("pthread_spin_init() failed\n");
+ rc = -ENOMEM;
+ break;
+ }
+ }
+
+ if (rc) {
+ for (; i > 0; i--) {
+ pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
+ }
+ }
+
+ return rc;
+}
+
+static int
+zone_block_register(struct spdk_bdev *base_bdev)
+{
+ struct bdev_zone_block_config *name, *tmp;
+ struct bdev_zone_block *bdev_node;
+ uint64_t zone_size;
+ int rc = 0;
+
+	/* Check our list of names from the config against this bdev and, if
+	 * there's a match, create the bdev_node & bdev accordingly.
+ */
+ TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
+ if (strcmp(name->bdev_name, base_bdev->name) != 0) {
+ continue;
+ }
+
+ if (spdk_bdev_is_zoned(base_bdev)) {
+ SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev->name);
+ rc = -EEXIST;
+ goto free_config;
+ }
+
+ bdev_node = calloc(1, sizeof(struct bdev_zone_block));
+ if (!bdev_node) {
+ rc = -ENOMEM;
+ SPDK_ERRLOG("could not allocate bdev_node\n");
+ goto free_config;
+ }
+
+		/* The name of the zoned vbdev we present on top of the base bdev. */
+ bdev_node->bdev.name = strdup(name->vbdev_name);
+ if (!bdev_node->bdev.name) {
+ rc = -ENOMEM;
+ SPDK_ERRLOG("could not allocate bdev_node name\n");
+ goto strdup_failed;
+ }
+
+ zone_size = spdk_align64pow2(name->zone_capacity);
+ if (zone_size == 0) {
+ rc = -EINVAL;
+ SPDK_ERRLOG("invalid zone size\n");
+ goto roundup_failed;
+ }
+
+ bdev_node->zone_shift = spdk_u64log2(zone_size);
+ bdev_node->num_zones = base_bdev->blockcnt / zone_size;
+
+ /* Align num_zones to optimal_open_zones */
+ bdev_node->num_zones -= bdev_node->num_zones % name->optimal_open_zones;
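+		/* Illustrative example (assumed numbers): a base bdev big enough
+		 * for 100 zones with optimal_open_zones = 8 is trimmed to 96 zones. */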
+ bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
+ if (!bdev_node->zones) {
+ rc = -ENOMEM;
+ SPDK_ERRLOG("could not allocate zones\n");
+ goto calloc_failed;
+ }
+
+ bdev_node->bdev.product_name = "zone_block";
+
+ /* Copy some properties from the underlying base bdev. */
+ bdev_node->bdev.write_cache = base_bdev->write_cache;
+ bdev_node->bdev.required_alignment = base_bdev->required_alignment;
+ bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;
+
+ bdev_node->bdev.blocklen = base_bdev->blocklen;
+ bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;
+
+ if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_ZONE_BLOCK,
+ "Lost %lu blocks due to zone capacity and base bdev size misalignment\n",
+ base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
+ }
+
+ bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;
+
+ bdev_node->bdev.md_interleave = base_bdev->md_interleave;
+ bdev_node->bdev.md_len = base_bdev->md_len;
+ bdev_node->bdev.dif_type = base_bdev->dif_type;
+ bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
+ bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;
+
+ bdev_node->bdev.zoned = true;
+ bdev_node->bdev.ctxt = bdev_node;
+ bdev_node->bdev.fn_table = &zone_block_fn_table;
+ bdev_node->bdev.module = &bdev_zoned_if;
+
+ /* bdev specific info */
+ bdev_node->bdev.zone_size = zone_size;
+
+ bdev_node->zone_capacity = name->zone_capacity;
+ bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
+ bdev_node->bdev.max_open_zones = 0;
+ rc = zone_block_init_zone_info(bdev_node);
+ if (rc) {
+ SPDK_ERRLOG("could not init zone info\n");
+ goto zone_info_failed;
+ }
+
+ TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);
+
+ spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
+ sizeof(struct zone_block_io_channel),
+ name->vbdev_name);
+
+ rc = spdk_bdev_open(base_bdev, true, zone_block_base_bdev_hotremove_cb,
+ base_bdev, &bdev_node->base_desc);
+ if (rc) {
+ SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(base_bdev));
+ goto open_failed;
+ }
+
+ /* Save the thread where the base device is opened */
+ bdev_node->thread = spdk_get_thread();
+
+ rc = spdk_bdev_module_claim_bdev(base_bdev, bdev_node->base_desc, bdev_node->bdev.module);
+ if (rc) {
+ SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(base_bdev));
+ goto claim_failed;
+ }
+
+ rc = spdk_bdev_register(&bdev_node->bdev);
+ if (rc) {
+ SPDK_ERRLOG("could not register zoned bdev\n");
+ goto register_failed;
+ }
+ }
+
+ return rc;
+
+register_failed:
+ spdk_bdev_module_release_bdev(&bdev_node->bdev);
+claim_failed:
+ spdk_bdev_close(bdev_node->base_desc);
+open_failed:
+ TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
+ spdk_io_device_unregister(bdev_node, NULL);
+zone_info_failed:
+ free(bdev_node->zones);
+calloc_failed:
+roundup_failed:
+ free(bdev_node->bdev.name);
+strdup_failed:
+ free(bdev_node);
+free_config:
+ zone_block_remove_config(name);
+ return rc;
+}
+
+int
+vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
+ uint64_t optimal_open_zones)
+{
+ struct spdk_bdev *bdev = NULL;
+ int rc = 0;
+
+ if (zone_capacity == 0) {
+ SPDK_ERRLOG("Zone capacity can't be 0\n");
+ return -EINVAL;
+ }
+
+ if (optimal_open_zones == 0) {
+ SPDK_ERRLOG("Optimal open zones can't be 0\n");
+ return -EINVAL;
+ }
+
+	/* Insert the bdev into our global name list even if it doesn't exist yet;
+	 * it may show up soon...
+ */
+ rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
+ if (rc) {
+ return rc;
+ }
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (!bdev) {
+		/* This is not an error; even though the bdev is not present at this
+		 * time, it may still show up later.
+ */
+ return 0;
+ }
+
+ return zone_block_register(bdev);
+}
+
+void
+vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
+{
+ struct bdev_zone_block_config *name_node;
+ struct spdk_bdev *bdev = NULL;
+
+ bdev = spdk_bdev_get_by_name(name);
+ if (!bdev || bdev->module != &bdev_zoned_if) {
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
+ if (strcmp(name_node->vbdev_name, bdev->name) == 0) {
+ zone_block_remove_config(name_node);
+ break;
+ }
+ }
+
+ spdk_bdev_unregister(bdev, cb_fn, cb_arg);
+}
+
+static void
+zone_block_examine(struct spdk_bdev *bdev)
+{
+ zone_block_register(bdev);
+
+ spdk_bdev_module_examine_done(&bdev_zoned_if);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vbdev_zone_block", SPDK_LOG_VBDEV_ZONE_BLOCK)
diff --git a/src/spdk/module/bdev/zone_block/vbdev_zone_block.h b/src/spdk/module/bdev/zone_block/vbdev_zone_block.h
new file mode 100644
index 000000000..b4904c4f4
--- /dev/null
+++ b/src/spdk/module/bdev/zone_block/vbdev_zone_block.h
@@ -0,0 +1,47 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_ZONE_BLOCK_H
+#define SPDK_VBDEV_ZONE_BLOCK_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+
+int vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name,
+ uint64_t zone_capacity, uint64_t optimal_open_zones);
+
+void vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg);
+
+#endif /* SPDK_VBDEV_ZONE_BLOCK_H */
diff --git a/src/spdk/module/bdev/zone_block/vbdev_zone_block_rpc.c b/src/spdk/module/bdev/zone_block/vbdev_zone_block_rpc.c
new file mode 100644
index 000000000..b7f485190
--- /dev/null
+++ b/src/spdk/module/bdev/zone_block/vbdev_zone_block_rpc.c
@@ -0,0 +1,146 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "vbdev_zone_block.h"
+
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+
+#include "spdk_internal/log.h"
+
+struct rpc_construct_zone_block {
+ char *name;
+ char *base_bdev;
+ uint64_t zone_capacity;
+ uint64_t optimal_open_zones;
+};
+
+static void
+free_rpc_construct_zone_block(struct rpc_construct_zone_block *req)
+{
+ free(req->name);
+ free(req->base_bdev);
+}
+
+static const struct spdk_json_object_decoder rpc_construct_zone_block_decoders[] = {
+ {"name", offsetof(struct rpc_construct_zone_block, name), spdk_json_decode_string},
+ {"base_bdev", offsetof(struct rpc_construct_zone_block, base_bdev), spdk_json_decode_string},
+ {"zone_capacity", offsetof(struct rpc_construct_zone_block, zone_capacity), spdk_json_decode_uint64},
+ {"optimal_open_zones", offsetof(struct rpc_construct_zone_block, optimal_open_zones), spdk_json_decode_uint64},
+};
+
+static void
+rpc_zone_block_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_construct_zone_block req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_zone_block_decoders,
+ SPDK_COUNTOF(rpc_construct_zone_block_decoders),
+ &req)) {
+		SPDK_ERRLOG("Failed to decode block create parameters\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto cleanup;
+ }
+
+ rc = vbdev_zone_block_create(req.base_bdev, req.name, req.zone_capacity,
+ req.optimal_open_zones);
+ if (rc) {
+		SPDK_ERRLOG("Failed to create block zoned vbdev: %s\n", spdk_strerror(-rc));
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Failed to create block zoned vbdev: %s",
+ spdk_strerror(-rc));
+ goto cleanup;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, req.name);
+ spdk_jsonrpc_end_result(request, w);
+
+cleanup:
+ free_rpc_construct_zone_block(&req);
+}
+SPDK_RPC_REGISTER("bdev_zone_block_create", rpc_zone_block_create, SPDK_RPC_RUNTIME)
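+
+/* Example JSON-RPC request for the method registered above (parameter values
+ * are illustrative):
+ *   {"jsonrpc": "2.0", "id": 1, "method": "bdev_zone_block_create",
+ *    "params": {"name": "zone0", "base_bdev": "Malloc0",
+ *               "zone_capacity": 4096, "optimal_open_zones": 8}}
+ */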
+
+struct rpc_delete_zone_block {
+ char *name;
+};
+
+static void
+free_rpc_delete_zone_block(struct rpc_delete_zone_block *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_zone_block_decoders[] = {
+ {"name", offsetof(struct rpc_delete_zone_block, name), spdk_json_decode_string},
+};
+
+static void
+_rpc_delete_zone_block_cb(void *cb_ctx, int rc)
+{
+ struct spdk_jsonrpc_request *request = cb_ctx;
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, rc == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_zone_block_delete(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_zone_block attrs = {};
+
+ if (spdk_json_decode_object(params, rpc_delete_zone_block_decoders,
+ SPDK_COUNTOF(rpc_delete_zone_block_decoders),
+ &attrs)) {
+		SPDK_ERRLOG("Failed to decode block delete parameters\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto cleanup;
+ }
+
+ vbdev_zone_block_delete(attrs.name, _rpc_delete_zone_block_cb, request);
+
+cleanup:
+ free_rpc_delete_zone_block(&attrs);
+}
+SPDK_RPC_REGISTER("bdev_zone_block_delete", rpc_zone_block_delete, SPDK_RPC_RUNTIME)
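+
+/* Example JSON-RPC request for the method registered above (the name is
+ * illustrative):
+ *   {"jsonrpc": "2.0", "id": 1, "method": "bdev_zone_block_delete",
+ *    "params": {"name": "zone0"}}
+ */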
diff --git a/src/spdk/module/blob/Makefile b/src/spdk/module/blob/Makefile
new file mode 100644
index 000000000..744a0a4a4
--- /dev/null
+++ b/src/spdk/module/blob/Makefile
@@ -0,0 +1,44 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y = bdev
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/blob/bdev/Makefile b/src/spdk/module/blob/bdev/Makefile
new file mode 100644
index 000000000..d40f6afb8
--- /dev/null
+++ b/src/spdk/module/blob/bdev/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+C_SRCS = blob_bdev.c
+LIBNAME = blob_bdev
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blob_bdev.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/blob/bdev/blob_bdev.c b/src/spdk/module/blob/bdev/blob_bdev.c
new file mode 100644
index 000000000..67949fcfe
--- /dev/null
+++ b/src/spdk/module/blob/bdev/blob_bdev.c
@@ -0,0 +1,390 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/blob_bdev.h"
+#include "spdk/blob.h"
+#include "spdk/thread.h"
+#include "spdk/log.h"
+#include "spdk/endian.h"
+#include "spdk/bdev_module.h"
+
+struct blob_bdev {
+ struct spdk_bs_dev bs_dev;
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *desc;
+ bool claimed;
+};
+
+struct blob_resubmit {
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+ enum spdk_bdev_io_type io_type;
+ struct spdk_bs_dev *dev;
+ struct spdk_io_channel *channel;
+ void *payload;
+ int iovcnt;
+ uint64_t lba;
+ uint32_t lba_count;
+ struct spdk_bs_dev_cb_args *cb_args;
+};
+static void bdev_blob_resubmit(void *);
+
+static inline struct spdk_bdev_desc *
+__get_desc(struct spdk_bs_dev *dev)
+{
+ return ((struct blob_bdev *)dev)->desc;
+}
+
+static inline struct spdk_bdev *
+__get_bdev(struct spdk_bs_dev *dev)
+{
+ return ((struct blob_bdev *)dev)->bdev;
+}
+
+static void
+bdev_blob_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *arg)
+{
+ struct spdk_bs_dev_cb_args *cb_args = arg;
+ int bserrno;
+
+ if (success) {
+ bserrno = 0;
+ } else {
+ bserrno = -EIO;
+ }
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno);
+ spdk_bdev_free_io(bdev_io);
+}
+
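+/* When a submission returns -ENOMEM, park the request on the bdev layer's
+ * io_wait queue; bdev_blob_resubmit() replays it once a bdev_io becomes
+ * available again. iovcnt distinguishes vectored payloads from single-buffer
+ * ones on resubmission. */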
+static void
+bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
+ int iovcnt,
+ uint64_t lba, uint32_t lba_count, enum spdk_bdev_io_type io_type,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ int rc;
+ struct spdk_bdev *bdev = __get_bdev(dev);
+ struct blob_resubmit *ctx;
+
+ ctx = calloc(1, sizeof(struct blob_resubmit));
+
+ if (ctx == NULL) {
+ SPDK_ERRLOG("Not enough memory to queue io\n");
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->io_type = io_type;
+ ctx->dev = dev;
+ ctx->channel = channel;
+ ctx->payload = payload;
+ ctx->iovcnt = iovcnt;
+ ctx->lba = lba;
+ ctx->lba_count = lba_count;
+ ctx->cb_args = cb_args;
+ ctx->bdev_io_wait.bdev = bdev;
+ ctx->bdev_io_wait.cb_fn = bdev_blob_resubmit;
+ ctx->bdev_io_wait.cb_arg = ctx;
+
+ rc = spdk_bdev_queue_io_wait(bdev, channel, &ctx->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed, rc=%d\n", rc);
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
+ free(ctx);
+ assert(false);
+ }
+}
+
+static void
+bdev_blob_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
+ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ int rc;
+
+ rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba,
+ lba_count, bdev_blob_io_complete, cb_args);
+ if (rc == -ENOMEM) {
+ bdev_blob_queue_io(dev, channel, payload, 0, lba,
+ lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args);
+ } else if (rc != 0) {
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
+ }
+}
+
+static void
+bdev_blob_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
+ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ int rc;
+
+ rc = spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba,
+ lba_count, bdev_blob_io_complete, cb_args);
+ if (rc == -ENOMEM) {
+ bdev_blob_queue_io(dev, channel, payload, 0, lba,
+ lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args);
+ } else if (rc != 0) {
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
+ }
+}
+
+static void
+bdev_blob_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ int rc;
+
+ rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba,
+ lba_count, bdev_blob_io_complete, cb_args);
+ if (rc == -ENOMEM) {
+ bdev_blob_queue_io(dev, channel, iov, iovcnt, lba,
+ lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args);
+ } else if (rc != 0) {
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
+ }
+}
+
+static void
+bdev_blob_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ int rc;
+
+ rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba,
+ lba_count, bdev_blob_io_complete, cb_args);
+ if (rc == -ENOMEM) {
+ bdev_blob_queue_io(dev, channel, iov, iovcnt, lba,
+ lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args);
+ } else if (rc != 0) {
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
+ }
+}
+
+static void
+bdev_blob_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba,
+ uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ int rc;
+
+ rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba,
+ lba_count, bdev_blob_io_complete, cb_args);
+ if (rc == -ENOMEM) {
+ bdev_blob_queue_io(dev, channel, NULL, 0, lba,
+ lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args);
+ } else if (rc != 0) {
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
+ }
+}
+
+static void
+bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba,
+ uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ struct blob_bdev *blob_bdev = (struct blob_bdev *)dev;
+ int rc;
+
+ if (spdk_bdev_io_type_supported(blob_bdev->bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+ rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count,
+ bdev_blob_io_complete, cb_args);
+ if (rc == -ENOMEM) {
+ bdev_blob_queue_io(dev, channel, NULL, 0, lba,
+ lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args);
+ } else if (rc != 0) {
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc);
+ }
+ } else {
+ /*
+ * If the device doesn't support unmap, immediately complete
+ * the request. Blobstore does not rely on unmap zeroing
+ * data.
+ */
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0);
+ }
+}
+
+static void
+bdev_blob_resubmit(void *arg)
+{
+ struct blob_resubmit *ctx = (struct blob_resubmit *) arg;
+
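+	/* iovcnt > 0 means the queued payload is an iovec array (vectored I/O). */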
+ switch (ctx->io_type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ if (ctx->iovcnt > 0) {
+ bdev_blob_readv(ctx->dev, ctx->channel, (struct iovec *)ctx->payload, ctx->iovcnt,
+ ctx->lba, ctx->lba_count, ctx->cb_args);
+ } else {
+ bdev_blob_read(ctx->dev, ctx->channel, ctx->payload,
+ ctx->lba, ctx->lba_count, ctx->cb_args);
+ }
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ if (ctx->iovcnt > 0) {
+ bdev_blob_writev(ctx->dev, ctx->channel, (struct iovec *)ctx->payload, ctx->iovcnt,
+ ctx->lba, ctx->lba_count, ctx->cb_args);
+ } else {
+ bdev_blob_write(ctx->dev, ctx->channel, ctx->payload,
+ ctx->lba, ctx->lba_count, ctx->cb_args);
+ }
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ bdev_blob_unmap(ctx->dev, ctx->channel,
+ ctx->lba, ctx->lba_count, ctx->cb_args);
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ bdev_blob_write_zeroes(ctx->dev, ctx->channel,
+ ctx->lba, ctx->lba_count, ctx->cb_args);
+ break;
+ default:
+ SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type);
+ assert(false);
+ break;
+ }
+ free(ctx);
+}
+
+int
+spdk_bs_bdev_claim(struct spdk_bs_dev *bs_dev, struct spdk_bdev_module *module)
+{
+ struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev;
+ int rc;
+
+ rc = spdk_bdev_module_claim_bdev(blob_bdev->bdev, NULL, module);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not claim bs dev\n");
+ return rc;
+ }
+
+ blob_bdev->claimed = true;
+
+ return rc;
+}
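+
+/*
+ * Note: claiming marks the bdev as owned by the given module so that no
+ * other module can build on it; bdev_blob_destroy() releases the claim
+ * when the bs_dev is torn down.
+ */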
+
+static struct spdk_io_channel *
+bdev_blob_create_channel(struct spdk_bs_dev *dev)
+{
+ struct blob_bdev *blob_bdev = (struct blob_bdev *)dev;
+
+ return spdk_bdev_get_io_channel(blob_bdev->desc);
+}
+
+static void
+bdev_blob_destroy_channel(struct spdk_bs_dev *dev, struct spdk_io_channel *channel)
+{
+ spdk_put_io_channel(channel);
+}
+
+static void
+bdev_blob_destroy(struct spdk_bs_dev *bs_dev)
+{
+ struct spdk_bdev_desc *desc = __get_desc(bs_dev);
+ struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev;
+
+ if (blob_bdev->claimed) {
+ spdk_bdev_module_release_bdev(blob_bdev->bdev);
+ }
+
+ spdk_bdev_close(desc);
+ free(bs_dev);
+}
+
+struct spdk_bs_dev *
+spdk_bdev_create_bs_dev(struct spdk_bdev *bdev, spdk_bdev_remove_cb_t remove_cb, void *remove_ctx)
+{
+ struct blob_bdev *b;
+ struct spdk_bdev_desc *desc;
+ int rc;
+
+ b = calloc(1, sizeof(*b));
+
+ if (b == NULL) {
+ SPDK_ERRLOG("could not allocate blob_bdev\n");
+ return NULL;
+ }
+
+ rc = spdk_bdev_open(bdev, true, remove_cb, remove_ctx, &desc);
+ if (rc != 0) {
+ free(b);
+ return NULL;
+ }
+
+ b->bdev = bdev;
+ b->desc = desc;
+ b->bs_dev.blockcnt = spdk_bdev_get_num_blocks(bdev);
+ b->bs_dev.blocklen = spdk_bdev_get_block_size(bdev);
+ b->bs_dev.create_channel = bdev_blob_create_channel;
+ b->bs_dev.destroy_channel = bdev_blob_destroy_channel;
+ b->bs_dev.destroy = bdev_blob_destroy;
+ b->bs_dev.read = bdev_blob_read;
+ b->bs_dev.write = bdev_blob_write;
+ b->bs_dev.readv = bdev_blob_readv;
+ b->bs_dev.writev = bdev_blob_writev;
+ b->bs_dev.write_zeroes = bdev_blob_write_zeroes;
+ b->bs_dev.unmap = bdev_blob_unmap;
+
+ return &b->bs_dev;
+}
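+
+/*
+ * Illustrative usage sketch (bdev name and callback names are hypothetical):
+ * open a bdev by name and hand the resulting bs_dev to the blobstore.
+ *
+ *	struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
+ *	struct spdk_bs_dev *bs_dev = spdk_bdev_create_bs_dev(bdev, NULL, NULL);
+ *	spdk_bs_init(bs_dev, NULL, init_done_cb, init_ctx);
+ */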
+
+struct spdk_bs_dev *
+spdk_bdev_create_bs_dev_from_desc(struct spdk_bdev_desc *desc)
+{
+ struct blob_bdev *b;
+ struct spdk_bdev *bdev;
+
+ b = calloc(1, sizeof(*b));
+
+ if (b == NULL) {
+ SPDK_ERRLOG("could not allocate blob_bdev\n");
+ return NULL;
+ }
+
+ bdev = spdk_bdev_desc_get_bdev(desc);
+ assert(bdev != NULL);
+
+ b->bdev = bdev;
+ b->desc = desc;
+ b->bs_dev.blockcnt = spdk_bdev_get_num_blocks(bdev);
+ b->bs_dev.blocklen = spdk_bdev_get_block_size(bdev);
+ b->bs_dev.create_channel = bdev_blob_create_channel;
+ b->bs_dev.destroy_channel = bdev_blob_destroy_channel;
+ b->bs_dev.destroy = bdev_blob_destroy;
+ b->bs_dev.read = bdev_blob_read;
+ b->bs_dev.write = bdev_blob_write;
+ b->bs_dev.readv = bdev_blob_readv;
+ b->bs_dev.writev = bdev_blob_writev;
+ b->bs_dev.write_zeroes = bdev_blob_write_zeroes;
+ b->bs_dev.unmap = bdev_blob_unmap;
+
+ return &b->bs_dev;
+}
diff --git a/src/spdk/module/blob/bdev/spdk_blob_bdev.map b/src/spdk/module/blob/bdev/spdk_blob_bdev.map
new file mode 100644
index 000000000..0149b3ab6
--- /dev/null
+++ b/src/spdk/module/blob/bdev/spdk_blob_bdev.map
@@ -0,0 +1,10 @@
+{
+ global:
+
+ # public functions
+ spdk_bdev_create_bs_dev;
+ spdk_bdev_create_bs_dev_from_desc;
+ spdk_bs_bdev_claim;
+
+ local: *;
+};
diff --git a/src/spdk/module/blobfs/Makefile b/src/spdk/module/blobfs/Makefile
new file mode 100644
index 000000000..744a0a4a4
--- /dev/null
+++ b/src/spdk/module/blobfs/Makefile
@@ -0,0 +1,44 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y = bdev
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/blobfs/bdev/Makefile b/src/spdk/module/blobfs/bdev/Makefile
new file mode 100644
index 000000000..97d350d30
--- /dev/null
+++ b/src/spdk/module/blobfs/bdev/Makefile
@@ -0,0 +1,51 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = blobfs_bdev.c blobfs_bdev_rpc.c
+
+# libfuse3 is required by blobfs_fuse.c
+ifeq ($(CONFIG_FUSE),y)
+C_SRCS += blobfs_fuse.c
+endif
+
+LIBNAME = blobfs_bdev
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blobfs_bdev.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/blobfs/bdev/blobfs_bdev.c b/src/spdk/module/blobfs/bdev/blobfs_bdev.c
new file mode 100644
index 000000000..501bbea18
--- /dev/null
+++ b/src/spdk/module/blobfs/bdev/blobfs_bdev.c
@@ -0,0 +1,361 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/blobfs.h"
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+#include "spdk/event.h"
+#include "spdk/blob_bdev.h"
+#include "spdk/blobfs_bdev.h"
+#include "spdk/log.h"
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+#include "blobfs_fuse.h"
+
+/* Dummy bdev module used to claim bdevs. */
+static struct spdk_bdev_module blobfs_bdev_module = {
+ .name = "blobfs",
+};
+
+static void
+blobfs_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
+ void *event_ctx)
+{
+	SPDK_WARNLOG("Async event (%d) triggered on bdev %s\n", type, spdk_bdev_get_name(bdev));
+}
+
+struct blobfs_bdev_operation_ctx {
+ const char *bdev_name;
+ struct spdk_filesystem *fs;
+
+	/* If cb_fn has already been called in some function other than
+	 * _blobfs_bdev_unload_cb, it should be set to NULL afterwards so that
+	 * _blobfs_bdev_unload_cb does not call it a second time.
+	 */
+ spdk_blobfs_bdev_op_complete cb_fn;
+ void *cb_arg;
+
+ /* Variables for mount operation */
+ const char *mountpoint;
+ struct spdk_thread *fs_loading_thread;
+
+	/* Used in bdev_event_cb to take the appropriate action on blobfs_fuse
+	 * when an asynchronous event occurs on the backend bdev.
+	 */
+ struct spdk_blobfs_fuse *bfuse;
+};
+
+static void
+_blobfs_bdev_unload_cb(void *_ctx, int fserrno)
+{
+ struct blobfs_bdev_operation_ctx *ctx = _ctx;
+
+ if (fserrno) {
+ SPDK_ERRLOG("Failed to unload blobfs on bdev %s: errno %d\n", ctx->bdev_name, fserrno);
+ }
+
+ if (ctx->cb_fn) {
+ ctx->cb_fn(ctx->cb_arg, fserrno);
+ }
+
+ free(ctx);
+}
+
+static void
+blobfs_bdev_unload(void *_ctx)
+{
+ struct blobfs_bdev_operation_ctx *ctx = _ctx;
+
+ spdk_fs_unload(ctx->fs, _blobfs_bdev_unload_cb, ctx);
+}
+
+static void
+blobfs_bdev_load_cb_to_unload(void *_ctx, struct spdk_filesystem *fs, int fserrno)
+{
+ struct blobfs_bdev_operation_ctx *ctx = _ctx;
+
+ if (fserrno) {
+ ctx->cb_fn(ctx->cb_arg, fserrno);
+ free(ctx);
+ return;
+ }
+
+ ctx->fs = fs;
+ spdk_thread_send_msg(spdk_get_thread(), blobfs_bdev_unload, ctx);
+}
+
+void
+spdk_blobfs_bdev_detect(const char *bdev_name,
+ spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg)
+{
+ struct blobfs_bdev_operation_ctx *ctx;
+ struct spdk_bs_dev *bs_dev;
+ struct spdk_bdev_desc *desc;
+ int rc;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ SPDK_ERRLOG("Failed to allocate ctx.\n");
+ cb_fn(cb_arg, -ENOMEM);
+
+ return;
+ }
+
+ ctx->bdev_name = bdev_name;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ rc = spdk_bdev_open_ext(bdev_name, true, blobfs_bdev_event_cb, NULL, &desc);
+ if (rc != 0) {
+ SPDK_INFOLOG(SPDK_LOG_BLOBFS_BDEV, "Failed to open bdev(%s): %s\n", ctx->bdev_name,
+ spdk_strerror(rc));
+
+ goto invalid;
+ }
+
+ bs_dev = spdk_bdev_create_bs_dev_from_desc(desc);
+ if (bs_dev == NULL) {
+		SPDK_INFOLOG(SPDK_LOG_BLOBFS_BDEV, "Failed to create a blobstore block device from bdev desc\n");
+ rc = -ENOMEM;
+ spdk_bdev_close(desc);
+
+ goto invalid;
+ }
+
+ spdk_fs_load(bs_dev, NULL, blobfs_bdev_load_cb_to_unload, ctx);
+
+ return;
+
+invalid:
+ free(ctx);
+
+ cb_fn(cb_arg, rc);
+}
+
+void
+spdk_blobfs_bdev_create(const char *bdev_name, uint32_t cluster_sz,
+ spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg)
+{
+ struct blobfs_bdev_operation_ctx *ctx;
+ struct spdk_blobfs_opts blobfs_opt;
+ struct spdk_bs_dev *bs_dev;
+ struct spdk_bdev_desc *desc;
+ int rc;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ SPDK_ERRLOG("Failed to allocate ctx.\n");
+ cb_fn(cb_arg, -ENOMEM);
+
+ return;
+ }
+
+ ctx->bdev_name = bdev_name;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ /* Creation requires WRITE operation */
+ rc = spdk_bdev_open_ext(bdev_name, true, blobfs_bdev_event_cb, NULL, &desc);
+ if (rc != 0) {
+ SPDK_INFOLOG(SPDK_LOG_BLOBFS_BDEV, "Failed to open bdev(%s): %s\n", ctx->bdev_name,
+ spdk_strerror(rc));
+
+ goto invalid;
+ }
+
+ bs_dev = spdk_bdev_create_bs_dev_from_desc(desc);
+ if (bs_dev == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_BLOBFS_BDEV, "Failed to create a blobstore block device from bdev desc\n");
+ rc = -ENOMEM;
+ spdk_bdev_close(desc);
+
+ goto invalid;
+ }
+
+ rc = spdk_bs_bdev_claim(bs_dev, &blobfs_bdev_module);
+ if (rc) {
+		SPDK_INFOLOG(SPDK_LOG_BLOBFS_BDEV, "Blobfs base bdev already claimed by another module\n");
+ bs_dev->destroy(bs_dev);
+
+ goto invalid;
+ }
+
+ spdk_fs_opts_init(&blobfs_opt);
+ if (cluster_sz) {
+ blobfs_opt.cluster_sz = cluster_sz;
+ }
+
+ spdk_fs_init(bs_dev, &blobfs_opt, NULL, blobfs_bdev_load_cb_to_unload, ctx);
+
+ return;
+
+invalid:
+ free(ctx);
+
+ cb_fn(cb_arg, rc);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("blobfs_bdev", SPDK_LOG_BLOBFS_BDEV)
+
+#ifdef SPDK_CONFIG_FUSE
+
+static void
+blobfs_bdev_unmount(void *arg)
+{
+ struct blobfs_bdev_operation_ctx *ctx = arg;
+
+	/* Unload blobfs on the same spdk thread that ran spdk_fs_load */
+ spdk_thread_send_msg(ctx->fs_loading_thread, blobfs_bdev_unload, ctx);
+}
+
+static void
+_blobfs_bdev_mount_fuse_start(void *_ctx)
+{
+ struct blobfs_bdev_operation_ctx *ctx = _ctx;
+ spdk_blobfs_bdev_op_complete cb_fn = ctx->cb_fn;
+ int rc;
+
+	/* ctx->cb_fn will be called within this function, so clear it first to
+	 * avoid it being called again in unload_cb.
+	 */
+ ctx->cb_fn = NULL;
+
+ rc = blobfs_fuse_start(ctx->bdev_name, ctx->mountpoint, ctx->fs,
+ blobfs_bdev_unmount, ctx, &ctx->bfuse);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to mount blobfs on bdev %s to %s\n", ctx->bdev_name, ctx->mountpoint);
+
+		/* Report the failure back to the caller */
+ cb_fn(ctx->cb_arg, rc);
+
+ blobfs_bdev_unmount(ctx);
+
+ return;
+ }
+
+ cb_fn(ctx->cb_arg, 0);
+}
+
+static void
+_blobfs_bdev_mount_load_cb(void *_ctx, struct spdk_filesystem *fs, int fserrno)
+{
+ struct blobfs_bdev_operation_ctx *ctx = _ctx;
+
+ if (fserrno) {
+ SPDK_ERRLOG("Failed to load blobfs on bdev %s: errno %d\n", ctx->bdev_name, fserrno);
+
+ ctx->cb_fn(ctx->cb_arg, fserrno);
+ free(ctx);
+ return;
+ }
+
+ ctx->fs = fs;
+ ctx->fs_loading_thread = spdk_get_thread();
+
+ spdk_thread_send_msg(spdk_get_thread(), _blobfs_bdev_mount_fuse_start, ctx);
+}
+
+static void
+blobfs_bdev_fuse_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
+ void *event_ctx)
+{
+ struct blobfs_bdev_operation_ctx *ctx = event_ctx;
+
+	SPDK_WARNLOG("Async event (%d) triggered on bdev %s\n", type, spdk_bdev_get_name(bdev));
+
+ if (type == SPDK_BDEV_EVENT_REMOVE) {
+ blobfs_fuse_stop(ctx->bfuse);
+ }
+}
+
+void
+spdk_blobfs_bdev_mount(const char *bdev_name, const char *mountpoint,
+ spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg)
+{
+ struct blobfs_bdev_operation_ctx *ctx;
+ struct spdk_bs_dev *bs_dev;
+ struct spdk_bdev_desc *desc;
+ int rc;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ SPDK_ERRLOG("Failed to allocate ctx.\n");
+ cb_fn(cb_arg, -ENOMEM);
+
+ return;
+ }
+
+ ctx->bdev_name = bdev_name;
+ ctx->mountpoint = mountpoint;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ rc = spdk_bdev_open_ext(bdev_name, true, blobfs_bdev_fuse_event_cb, ctx, &desc);
+ if (rc != 0) {
+ SPDK_INFOLOG(SPDK_LOG_BLOBFS_BDEV, "Failed to open bdev(%s): %s\n", ctx->bdev_name,
+ spdk_strerror(rc));
+
+ goto invalid;
+ }
+
+ bs_dev = spdk_bdev_create_bs_dev_from_desc(desc);
+ if (bs_dev == NULL) {
+		SPDK_INFOLOG(SPDK_LOG_BLOBFS_BDEV, "Failed to create a blobstore block device from bdev desc\n");
+ rc = -ENOMEM;
+ spdk_bdev_close(desc);
+
+ goto invalid;
+ }
+
+ rc = spdk_bs_bdev_claim(bs_dev, &blobfs_bdev_module);
+ if (rc != 0) {
+		SPDK_INFOLOG(SPDK_LOG_BLOBFS_BDEV, "Blobfs base bdev already claimed by another module\n");
+ bs_dev->destroy(bs_dev);
+
+ goto invalid;
+ }
+
+ spdk_fs_load(bs_dev, blobfs_fuse_send_request, _blobfs_bdev_mount_load_cb, ctx);
+
+ return;
+
+invalid:
+ free(ctx);
+
+ cb_fn(cb_arg, rc);
+}
+
+#endif
diff --git a/src/spdk/module/blobfs/bdev/blobfs_bdev_rpc.c b/src/spdk/module/blobfs/bdev/blobfs_bdev_rpc.c
new file mode 100644
index 000000000..62d9fa98e
--- /dev/null
+++ b/src/spdk/module/blobfs/bdev/blobfs_bdev_rpc.c
@@ -0,0 +1,344 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/blobfs.h"
+#include "spdk/bdev.h"
+#include "spdk/event.h"
+#include "spdk/blob_bdev.h"
+#include "spdk/blobfs_bdev.h"
+#include "spdk/log.h"
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+#define MIN_CLUSTER_SZ (1024 * 1024)
+
+struct rpc_blobfs_set_cache_size {
+ uint64_t size_in_mb;
+};
+
+static const struct spdk_json_object_decoder rpc_blobfs_set_cache_size_decoders[] = {
+ {"size_in_mb", offsetof(struct rpc_blobfs_set_cache_size, size_in_mb), spdk_json_decode_uint64},
+};
+
+static void
+rpc_blobfs_set_cache_size(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_blobfs_set_cache_size req;
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_blobfs_set_cache_size_decoders,
+ SPDK_COUNTOF(rpc_blobfs_set_cache_size_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "spdk_json_decode_object failed");
+
+ return;
+ }
+
+ if (req.size_in_mb == 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 "size_in_mb must be non-zero");
+
+ return;
+ }
+
+ rc = spdk_fs_set_cache_size(req.size_in_mb);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, rc == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+SPDK_RPC_REGISTER("blobfs_set_cache_size", rpc_blobfs_set_cache_size,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
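+
+/*
+ * Example request (illustrative):
+ *	{"jsonrpc": "2.0", "method": "blobfs_set_cache_size",
+ *	 "id": 1, "params": {"size_in_mb": 512}}
+ */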
+
+struct rpc_blobfs_detect {
+ char *bdev_name;
+
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+free_rpc_blobfs_detect(struct rpc_blobfs_detect *req)
+{
+ free(req->bdev_name);
+ free(req);
+}
+
+static const struct spdk_json_object_decoder rpc_blobfs_detect_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_blobfs_detect, bdev_name), spdk_json_decode_string},
+};
+
+static void
+_rpc_blobfs_detect_done(void *cb_arg, int fserrno)
+{
+ struct rpc_blobfs_detect *req = cb_arg;
+ struct spdk_json_write_ctx *w;
+ bool existed = true;
+
+ if (fserrno == -EILSEQ) {
+		/* No blobfs exists on the bdev */
+ existed = false;
+ } else if (fserrno != 0) {
+ spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 spdk_strerror(-fserrno));
+		/* Free req on this path too; the success path below is skipped. */
+		free_rpc_blobfs_detect(req);
+
+		return;
+ }
+
+ w = spdk_jsonrpc_begin_result(req->request);
+ spdk_json_write_bool(w, existed);
+ spdk_jsonrpc_end_result(req->request, w);
+
+ free_rpc_blobfs_detect(req);
+}
+
+static void
+rpc_blobfs_detect(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_blobfs_detect *req;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ SPDK_ERRLOG("could not allocate rpc_blobfs_detect request.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_blobfs_detect_decoders,
+ SPDK_COUNTOF(rpc_blobfs_detect_decoders),
+ req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "spdk_json_decode_object failed");
+
+ free_rpc_blobfs_detect(req);
+
+ return;
+ }
+
+ req->request = request;
+ spdk_blobfs_bdev_detect(req->bdev_name, _rpc_blobfs_detect_done, req);
+}
+
+SPDK_RPC_REGISTER("blobfs_detect", rpc_blobfs_detect, SPDK_RPC_RUNTIME)
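+
+/*
+ * Example request (illustrative; bdev name is hypothetical). The result is
+ * true when a blobfs is found on the named bdev:
+ *	{"jsonrpc": "2.0", "method": "blobfs_detect",
+ *	 "id": 1, "params": {"bdev_name": "Malloc0"}}
+ */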
+
+struct rpc_blobfs_create {
+ char *bdev_name;
+ uint64_t cluster_sz;
+
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+free_rpc_blobfs_create(struct rpc_blobfs_create *req)
+{
+ free(req->bdev_name);
+ free(req);
+}
+
+static int
+rpc_decode_cluster_sz(const struct spdk_json_val *val, void *out)
+{
+ uint64_t *cluster_sz = out;
+ char *sz_str = NULL;
+ bool has_prefix;
+ int rc;
+
+ rc = spdk_json_decode_string(val, &sz_str);
+ if (rc) {
+ SPDK_NOTICELOG("Invalid parameter value: cluster_sz\n");
+ return -EINVAL;
+ }
+
+ rc = spdk_parse_capacity(sz_str, cluster_sz, &has_prefix);
+ free(sz_str);
+
+ if (rc || *cluster_sz % PAGE_SIZE != 0 || *cluster_sz < MIN_CLUSTER_SZ) {
+ SPDK_NOTICELOG("Invalid parameter value: cluster_sz\n");
+ return -EINVAL;
+ }
+
+	SPDK_DEBUGLOG(SPDK_LOG_BLOBFS_BDEV_RPC, "cluster_sz of blobfs: %" PRIu64 "\n", *cluster_sz);
+ return 0;
+}
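+
+/* e.g. "cluster_sz": "4M" decodes to 4194304; accepted values must be a
+ * multiple of PAGE_SIZE and at least MIN_CLUSTER_SZ (1 MiB).
+ */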
+
+static const struct spdk_json_object_decoder rpc_blobfs_create_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_blobfs_create, bdev_name), spdk_json_decode_string},
+ {"cluster_sz", offsetof(struct rpc_blobfs_create, cluster_sz), rpc_decode_cluster_sz, true},
+};
+
+static void
+_rpc_blobfs_create_done(void *cb_arg, int fserrno)
+{
+ struct rpc_blobfs_create *req = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ if (fserrno != 0) {
+ spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 spdk_strerror(-fserrno));
+		/* Free req on this path too; the success path below is skipped. */
+		free_rpc_blobfs_create(req);
+
+		return;
+ }
+
+ w = spdk_jsonrpc_begin_result(req->request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(req->request, w);
+
+ free_rpc_blobfs_create(req);
+}
+
+static void
+rpc_blobfs_create(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_blobfs_create *req;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ SPDK_ERRLOG("could not allocate rpc_blobfs_create request.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_blobfs_create_decoders,
+ SPDK_COUNTOF(rpc_blobfs_create_decoders),
+ req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "spdk_json_decode_object failed");
+
+ free_rpc_blobfs_create(req);
+
+ return;
+ }
+
+ req->request = request;
+ spdk_blobfs_bdev_create(req->bdev_name, req->cluster_sz, _rpc_blobfs_create_done, req);
+}
+
+SPDK_RPC_REGISTER("blobfs_create", rpc_blobfs_create, SPDK_RPC_RUNTIME)
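+
+/*
+ * Example request (illustrative; bdev name is hypothetical). "cluster_sz"
+ * is optional:
+ *	{"jsonrpc": "2.0", "method": "blobfs_create",
+ *	 "id": 1, "params": {"bdev_name": "Malloc0", "cluster_sz": "4M"}}
+ */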
+
+SPDK_LOG_REGISTER_COMPONENT("blobfs_bdev_rpc", SPDK_LOG_BLOBFS_BDEV_RPC)
+
+#ifdef SPDK_CONFIG_FUSE
+
+struct rpc_blobfs_mount {
+ char *bdev_name;
+ char *mountpoint;
+
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+free_rpc_blobfs_mount(struct rpc_blobfs_mount *req)
+{
+ free(req->bdev_name);
+ free(req->mountpoint);
+ free(req);
+}
+
+static const struct spdk_json_object_decoder rpc_blobfs_mount_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_blobfs_mount, bdev_name), spdk_json_decode_string},
+ {"mountpoint", offsetof(struct rpc_blobfs_mount, mountpoint), spdk_json_decode_string},
+};
+
+static void
+_rpc_blobfs_mount_done(void *cb_arg, int fserrno)
+{
+ struct rpc_blobfs_mount *req = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ if (fserrno == -EILSEQ) {
+		/* No blobfs exists on the bdev */
+		spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 "No blobfs detected on given bdev");
+		/* Free req on the error paths too; the success path below is skipped. */
+		free_rpc_blobfs_mount(req);
+
+		return;
+	} else if (fserrno != 0) {
+		spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+						 spdk_strerror(-fserrno));
+		free_rpc_blobfs_mount(req);
+
+		return;
+ }
+
+ w = spdk_jsonrpc_begin_result(req->request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(req->request, w);
+
+ free_rpc_blobfs_mount(req);
+}
+
+static void
+rpc_blobfs_mount(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_blobfs_mount *req;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ SPDK_ERRLOG("could not allocate rpc_blobfs_mount request.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_blobfs_mount_decoders,
+ SPDK_COUNTOF(rpc_blobfs_mount_decoders),
+ req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "spdk_json_decode_object failed");
+
+ free_rpc_blobfs_mount(req);
+
+ return;
+ }
+
+ req->request = request;
+ spdk_blobfs_bdev_mount(req->bdev_name, req->mountpoint, _rpc_blobfs_mount_done, req);
+}
+
+SPDK_RPC_REGISTER("blobfs_mount", rpc_blobfs_mount, SPDK_RPC_RUNTIME)
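+
+/*
+ * Example request (illustrative; bdev name and path are hypothetical). The
+ * mountpoint directory must already exist on the host:
+ *	{"jsonrpc": "2.0", "method": "blobfs_mount",
+ *	 "id": 1, "params": {"bdev_name": "Malloc0", "mountpoint": "/mnt/fuse"}}
+ */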
+
+#endif
diff --git a/src/spdk/module/blobfs/bdev/blobfs_fuse.c b/src/spdk/module/blobfs/bdev/blobfs_fuse.c
new file mode 100644
index 000000000..df6d61e04
--- /dev/null
+++ b/src/spdk/module/blobfs/bdev/blobfs_fuse.c
@@ -0,0 +1,358 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/log.h"
+#include "spdk/env.h"
+#include "spdk/event.h"
+#include "spdk/thread.h"
+#include "spdk/string.h"
+#include "spdk/blobfs.h"
+
+#include "blobfs_fuse.h"
+
+#define FUSE_USE_VERSION 30
+#include "fuse3/fuse.h"
+#include "fuse3/fuse_lowlevel.h"
+
+struct spdk_blobfs_fuse {
+ char *bdev_name;
+ char *mountpoint;
+ struct spdk_fs_thread_ctx *channel;
+ struct spdk_filesystem *fs;
+
+ struct fuse *fuse_handle;
+ pthread_t fuse_tid;
+
+ blobfs_fuse_unmount_cb cb_fn;
+ void *cb_arg;
+};
+
+/* Each thread serves one blobfs */
+static __thread struct spdk_blobfs_fuse *thd_bfuse;
+
+static void
+blobfs_fuse_free(struct spdk_blobfs_fuse *bfuse)
+{
+ if (bfuse == NULL) {
+ return;
+ }
+
+ free(bfuse->bdev_name);
+ free(bfuse->mountpoint);
+ free(bfuse);
+}
+
+static void
+__call_fn(void *arg1, void *arg2)
+{
+ fs_request_fn fn;
+
+ fn = (fs_request_fn)arg1;
+ fn(arg2);
+}
+
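+/*
+ * Forward a blobfs request to reactor 0 via the event framework. FUSE
+ * callbacks run on a separate pthread, so filesystem operations are
+ * marshalled onto an SPDK core instead of being issued directly.
+ */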
+void
+blobfs_fuse_send_request(fs_request_fn fn, void *arg)
+{
+ struct spdk_event *event;
+
+ event = spdk_event_allocate(0, __call_fn, (void *)fn, arg);
+ spdk_event_call(event);
+}
+
+static int
+fuse_getattr(const char *path, struct stat *stbuf, struct fuse_file_info *fi)
+{
+ struct spdk_file_stat stat;
+ int rc;
+
+ if (!strcmp(path, "/")) {
+ stbuf->st_mode = S_IFDIR | 0755;
+ stbuf->st_nlink = 2;
+ return 0;
+ }
+
+ rc = spdk_fs_file_stat(thd_bfuse->fs, thd_bfuse->channel, path, &stat);
+ if (rc == 0) {
+ stbuf->st_mode = S_IFREG | 0644;
+ stbuf->st_nlink = 1;
+ stbuf->st_size = stat.size;
+ }
+
+ return rc;
+}
+
+static int
+fuse_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
+ off_t offset, struct fuse_file_info *fi,
+ enum fuse_readdir_flags flags)
+{
+ struct spdk_file *file;
+ const char *filename;
+ spdk_fs_iter iter;
+
+ filler(buf, ".", NULL, 0, 0);
+ filler(buf, "..", NULL, 0, 0);
+
+ iter = spdk_fs_iter_first(thd_bfuse->fs);
+ while (iter != NULL) {
+ file = spdk_fs_iter_get_file(iter);
+ iter = spdk_fs_iter_next(iter);
+ filename = spdk_file_get_name(file);
+ filler(buf, &filename[1], NULL, 0, 0);
+ }
+
+ return 0;
+}
+
+static int
+fuse_mknod(const char *path, mode_t mode, dev_t rdev)
+{
+ return spdk_fs_create_file(thd_bfuse->fs, thd_bfuse->channel, path);
+}
+
+static int
+fuse_unlink(const char *path)
+{
+ return spdk_fs_delete_file(thd_bfuse->fs, thd_bfuse->channel, path);
+}
+
+static int
+fuse_truncate(const char *path, off_t size, struct fuse_file_info *fi)
+{
+ struct spdk_file *file;
+ int rc;
+
+ rc = spdk_fs_open_file(thd_bfuse->fs, thd_bfuse->channel, path, 0, &file);
+ if (rc != 0) {
+ return -rc;
+ }
+
+ rc = spdk_file_truncate(file, thd_bfuse->channel, size);
+	if (rc != 0) {
+		/* Close the file before bailing out so it is not leaked. */
+		spdk_file_close(file, thd_bfuse->channel);
+		return -rc;
+	}
+
+ spdk_file_close(file, thd_bfuse->channel);
+
+ return 0;
+}
+
+static int
+fuse_utimens(const char *path, const struct timespec tv[2], struct fuse_file_info *fi)
+{
+ return 0;
+}
+
+static int
+fuse_open(const char *path, struct fuse_file_info *info)
+{
+ struct spdk_file *file;
+ int rc;
+
+ rc = spdk_fs_open_file(thd_bfuse->fs, thd_bfuse->channel, path, 0, &file);
+ if (rc != 0) {
+ return -rc;
+ }
+
+ info->fh = (uintptr_t)file;
+ return 0;
+}
+
+static int
+fuse_release(const char *path, struct fuse_file_info *info)
+{
+ struct spdk_file *file = (struct spdk_file *)info->fh;
+
+ return spdk_file_close(file, thd_bfuse->channel);
+}
+
+static int
+fuse_read(const char *path, char *buf, size_t len, off_t offset, struct fuse_file_info *info)
+{
+ struct spdk_file *file = (struct spdk_file *)info->fh;
+
+ return spdk_file_read(file, thd_bfuse->channel, buf, offset, len);
+}
+
+static int
+fuse_write(const char *path, const char *buf, size_t len, off_t offset,
+ struct fuse_file_info *info)
+{
+ struct spdk_file *file = (struct spdk_file *)info->fh;
+ int rc;
+
+ rc = spdk_file_write(file, thd_bfuse->channel, (void *)buf, offset, len);
+ if (rc == 0) {
+ return len;
+ } else {
+ return rc;
+ }
+}
+
+static int
+fuse_flush(const char *path, struct fuse_file_info *info)
+{
+ return 0;
+}
+
+static int
+fuse_fsync(const char *path, int datasync, struct fuse_file_info *info)
+{
+ return 0;
+}
+
+static int
+fuse_rename(const char *old_path, const char *new_path, unsigned int flags)
+{
+ return spdk_fs_rename_file(thd_bfuse->fs, thd_bfuse->channel, old_path, new_path);
+}
+
+static struct fuse_operations spdk_fuse_oper = {
+ .getattr = fuse_getattr,
+ .readdir = fuse_readdir,
+ .mknod = fuse_mknod,
+ .unlink = fuse_unlink,
+ .truncate = fuse_truncate,
+ .utimens = fuse_utimens,
+ .open = fuse_open,
+ .release = fuse_release,
+ .read = fuse_read,
+ .write = fuse_write,
+ .flush = fuse_flush,
+ .fsync = fuse_fsync,
+ .rename = fuse_rename,
+};
+
+static void *
+fuse_loop_new_thread(void *arg)
+{
+ struct spdk_blobfs_fuse *bfuse = arg;
+
+ spdk_unaffinitize_thread();
+
+ thd_bfuse = bfuse;
+	SPDK_NOTICELOG("Starting FUSE loop for blobfs on bdev %s mounted at %s\n", bfuse->bdev_name,
+ bfuse->mountpoint);
+
+ bfuse->channel = spdk_fs_alloc_thread_ctx(bfuse->fs);
+
+ fuse_loop(bfuse->fuse_handle);
+ fuse_unmount(bfuse->fuse_handle);
+ fuse_destroy(bfuse->fuse_handle);
+ SPDK_NOTICELOG("Blobfs on bdev %s unmounted from %s\n", bfuse->bdev_name, bfuse->mountpoint);
+
+ spdk_fs_free_thread_ctx(bfuse->channel);
+
+ bfuse->cb_fn(bfuse->cb_arg);
+
+ blobfs_fuse_free(bfuse);
+
+ pthread_exit(NULL);
+}
+
+int
+blobfs_fuse_start(const char *bdev_name, const char *mountpoint, struct spdk_filesystem *fs,
+ blobfs_fuse_unmount_cb cb_fn, void *cb_arg, struct spdk_blobfs_fuse **_bfuse)
+{
+	/* Set argv[0] to bdev_name so that bdev_name is shown as the mount source */
+ char *argv[1] = {(char *)bdev_name};
+ struct fuse_args args = FUSE_ARGS_INIT(1, argv);
+ struct fuse_cmdline_opts opts = {};
+ struct fuse *fuse_handle;
+ struct spdk_blobfs_fuse *bfuse;
+ pthread_t tid;
+ int rc;
+
+ bfuse = (struct spdk_blobfs_fuse *)calloc(1, sizeof(*bfuse));
+ if (bfuse == NULL) {
+ return -ENOMEM;
+ }
+
+ rc = fuse_parse_cmdline(&args, &opts);
+ assert(rc == 0);
+
+ bfuse->bdev_name = strdup(bdev_name);
+ bfuse->mountpoint = strdup(mountpoint);
+ bfuse->fs = fs;
+ bfuse->cb_fn = cb_fn;
+ bfuse->cb_arg = cb_arg;
+
+ fuse_handle = fuse_new(&args, &spdk_fuse_oper, sizeof(spdk_fuse_oper), NULL);
+ fuse_opt_free_args(&args);
+ if (fuse_handle == NULL) {
+ SPDK_ERRLOG("could not create fuse handle!\n");
+ rc = -1;
+ goto err;
+ }
+ bfuse->fuse_handle = fuse_handle;
+
+ rc = fuse_mount(bfuse->fuse_handle, bfuse->mountpoint);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not mount fuse handle\n");
+ rc = -1;
+ goto err;
+ }
+
+ rc = pthread_create(&tid, NULL, fuse_loop_new_thread, bfuse);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not create thread: %s\n", spdk_strerror(rc));
+ rc = -rc;
+ goto err;
+ }
+ bfuse->fuse_tid = tid;
+
+ rc = pthread_detach(tid);
+ if (rc != 0) {
+		SPDK_ERRLOG("could not detach fuse loop thread: %s\n", spdk_strerror(rc));
+ rc = -rc;
+ goto err;
+ }
+
+ *_bfuse = bfuse;
+ return 0;
+
+err:
+ blobfs_fuse_free(bfuse);
+
+ return rc;
+}
+
+void
+blobfs_fuse_stop(struct spdk_blobfs_fuse *bfuse)
+{
+ fuse_session_exit(fuse_get_session(bfuse->fuse_handle));
+ pthread_kill(bfuse->fuse_tid, SIGINT);
+}
diff --git a/src/spdk/module/blobfs/bdev/blobfs_fuse.h b/src/spdk/module/blobfs/bdev/blobfs_fuse.h
new file mode 100644
index 000000000..b14c261da
--- /dev/null
+++ b/src/spdk/module/blobfs/bdev/blobfs_fuse.h
@@ -0,0 +1,52 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BLOBFS_FUSE_H
+#define SPDK_BLOBFS_FUSE_H
+
+#include "spdk/stdinc.h"
+#include "spdk/blobfs.h"
+
+struct spdk_blobfs_fuse;
+
+void blobfs_fuse_send_request(fs_request_fn fn, void *arg);
+
+typedef void (*blobfs_fuse_unmount_cb)(void *arg);
+
+int blobfs_fuse_start(const char *bdev_name, const char *mountpoint,
+ struct spdk_filesystem *fs, blobfs_fuse_unmount_cb cb_fn,
+ void *cb_arg, struct spdk_blobfs_fuse **bfuse);
+
+void blobfs_fuse_stop(struct spdk_blobfs_fuse *bfuse);
+
+#endif /* SPDK_BLOBFS_FUSE_H */
diff --git a/src/spdk/module/blobfs/bdev/spdk_blobfs_bdev.map b/src/spdk/module/blobfs/bdev/spdk_blobfs_bdev.map
new file mode 100644
index 000000000..e3d461c6f
--- /dev/null
+++ b/src/spdk/module/blobfs/bdev/spdk_blobfs_bdev.map
@@ -0,0 +1,8 @@
+{
+ global:
+ spdk_blobfs_bdev_detect;
+ spdk_blobfs_bdev_create;
+ spdk_blobfs_bdev_mount;
+
+ local: *;
+};
diff --git a/src/spdk/module/env_dpdk/Makefile b/src/spdk/module/env_dpdk/Makefile
new file mode 100644
index 000000000..6585c676a
--- /dev/null
+++ b/src/spdk/module/env_dpdk/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = env_dpdk_rpc.c
+LIBNAME = env_dpdk_rpc
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/env_dpdk/env_dpdk_rpc.c b/src/spdk/module/env_dpdk/env_dpdk_rpc.c
new file mode 100644
index 000000000..e66f13609
--- /dev/null
+++ b/src/spdk/module/env_dpdk/env_dpdk_rpc.c
@@ -0,0 +1,68 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/rpc.h"
+#include "spdk/env_dpdk.h"
+#include "spdk_internal/log.h"
+
+static void
+rpc_env_dpdk_get_mem_stats(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ FILE *file = NULL;
+ struct spdk_json_write_ctx *w;
+ char default_filename[] = "/tmp/spdk_mem_dump.txt";
+
+ if (params != NULL) {
+		SPDK_ERRLOG("env_dpdk_get_mem_stats doesn't accept any parameters\n");
+		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						 "env_dpdk_get_mem_stats doesn't accept any parameters.");
+		/* Bail out here; otherwise a second response would be sent below. */
+		return;
+	}
+
+ file = fopen(default_filename, "w");
+ if (!file) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to open file for writing.\n");
+ return;
+ }
+
+ spdk_env_dpdk_dump_mem_stats(file);
+ fclose(file);
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "filename", default_filename);
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("env_dpdk_get_mem_stats", rpc_env_dpdk_get_mem_stats, SPDK_RPC_RUNTIME)
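+
+/*
+ * Example request (illustrative); the reply reports the dump file name
+ * (/tmp/spdk_mem_dump.txt):
+ *	{"jsonrpc": "2.0", "method": "env_dpdk_get_mem_stats", "id": 1}
+ */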
diff --git a/src/spdk/module/event/Makefile b/src/spdk/module/event/Makefile
new file mode 100644
index 000000000..f4caea64e
--- /dev/null
+++ b/src/spdk/module/event/Makefile
@@ -0,0 +1,44 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y = subsystems rpc
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/event/rpc/Makefile b/src/spdk/module/event/rpc/Makefile
new file mode 100644
index 000000000..1e620aa14
--- /dev/null
+++ b/src/spdk/module/event/rpc/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = app_rpc.c subsystem_rpc.c
+LIBNAME = app_rpc
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/rpc/app_rpc.c b/src/spdk/module/event/rpc/app_rpc.c
new file mode 100644
index 000000000..f223c1734
--- /dev/null
+++ b/src/spdk/module/event/rpc/app_rpc.c
@@ -0,0 +1,543 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/event.h"
+#include "spdk/rpc.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+
+#include "spdk_internal/log.h"
+#include "spdk_internal/event.h"
+#include "spdk_internal/thread.h"
+
+struct rpc_spdk_kill_instance {
+ char *sig_name;
+};
+
+static void
+free_rpc_spdk_kill_instance(struct rpc_spdk_kill_instance *req)
+{
+ free(req->sig_name);
+}
+
+static const struct spdk_json_object_decoder rpc_spdk_kill_instance_decoders[] = {
+ {"sig_name", offsetof(struct rpc_spdk_kill_instance, sig_name), spdk_json_decode_string},
+};
+
+static void
+rpc_spdk_kill_instance(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ static const struct {
+ const char *signal_string;
+ int32_t signal;
+ } signals[] = {
+ {"SIGINT", SIGINT},
+ {"SIGTERM", SIGTERM},
+ {"SIGQUIT", SIGQUIT},
+ {"SIGHUP", SIGHUP},
+ {"SIGKILL", SIGKILL},
+ {"SIGUSR1", SIGUSR1},
+ };
+ size_t i, sig_count;
+ int signal;
+ struct rpc_spdk_kill_instance req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_spdk_kill_instance_decoders,
+ SPDK_COUNTOF(rpc_spdk_kill_instance_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_APP_RPC, "spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ sig_count = SPDK_COUNTOF(signals);
+ signal = spdk_strtol(req.sig_name, 10);
+ for (i = 0 ; i < sig_count; i++) {
+ if (strcmp(req.sig_name, signals[i].signal_string) == 0 ||
+ signal == signals[i].signal) {
+ break;
+ }
+ }
+
+ if (i == sig_count) {
+ goto invalid;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_APP_RPC, "sending signal %d\n", signals[i].signal);
+ free_rpc_spdk_kill_instance(&req);
+ kill(getpid(), signals[i].signal);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_rpc_spdk_kill_instance(&req);
+}
+SPDK_RPC_REGISTER("spdk_kill_instance", rpc_spdk_kill_instance, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(spdk_kill_instance, kill_instance)
+
+
+struct rpc_framework_monitor_context_switch {
+ bool enabled;
+};
+
+static const struct spdk_json_object_decoder rpc_framework_monitor_context_switch_decoders[] = {
+ {"enabled", offsetof(struct rpc_framework_monitor_context_switch, enabled), spdk_json_decode_bool},
+};
+
+static void
+rpc_framework_monitor_context_switch(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_framework_monitor_context_switch req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_framework_monitor_context_switch_decoders,
+ SPDK_COUNTOF(rpc_framework_monitor_context_switch_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_APP_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ return;
+ }
+
+ spdk_framework_enable_context_switch_monitor(req.enabled);
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_bool(w, "enabled", spdk_framework_context_switch_monitor_enabled());
+
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+SPDK_RPC_REGISTER("framework_monitor_context_switch", rpc_framework_monitor_context_switch,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_monitor_context_switch, context_switch_monitor)
+
+struct rpc_get_stats_ctx {
+ struct spdk_jsonrpc_request *request;
+ struct spdk_json_write_ctx *w;
+ uint64_t now;
+};
+
+static void
+rpc_thread_get_stats_done(void *arg)
+{
+ struct rpc_get_stats_ctx *ctx = arg;
+
+ spdk_json_write_array_end(ctx->w);
+ spdk_json_write_object_end(ctx->w);
+ spdk_jsonrpc_end_result(ctx->request, ctx->w);
+
+ free(ctx);
+}
+
+static void
+rpc_thread_get_stats_for_each(struct spdk_jsonrpc_request *request, spdk_msg_fn fn)
+{
+ struct rpc_get_stats_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation error");
+ return;
+ }
+ ctx->request = request;
+
+ ctx->w = spdk_jsonrpc_begin_result(ctx->request);
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_uint64(ctx->w, "tick_rate", spdk_get_ticks_hz());
+ spdk_json_write_named_array_begin(ctx->w, "threads");
+
+ spdk_for_each_thread(fn, ctx, rpc_thread_get_stats_done);
+}
+
+static void
+_rpc_thread_get_stats(void *arg)
+{
+ struct rpc_get_stats_ctx *ctx = arg;
+ struct spdk_thread *thread = spdk_get_thread();
+ struct spdk_poller *poller;
+ struct spdk_thread_stats stats;
+ uint64_t active_pollers_count = 0;
+ uint64_t timed_pollers_count = 0;
+ uint64_t paused_pollers_count = 0;
+
+ TAILQ_FOREACH(poller, &thread->active_pollers, tailq) {
+ active_pollers_count++;
+ }
+ TAILQ_FOREACH(poller, &thread->timed_pollers, tailq) {
+ timed_pollers_count++;
+ }
+ TAILQ_FOREACH(poller, &thread->paused_pollers, tailq) {
+ paused_pollers_count++;
+ }
+
+ if (0 == spdk_thread_get_stats(&stats)) {
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_string(ctx->w, "name", spdk_thread_get_name(thread));
+ spdk_json_write_named_uint64(ctx->w, "id", spdk_thread_get_id(thread));
+ spdk_json_write_named_string(ctx->w, "cpumask",
+ spdk_cpuset_fmt(spdk_thread_get_cpumask(thread)));
+ spdk_json_write_named_uint64(ctx->w, "busy", stats.busy_tsc);
+ spdk_json_write_named_uint64(ctx->w, "idle", stats.idle_tsc);
+ spdk_json_write_named_uint64(ctx->w, "active_pollers_count", active_pollers_count);
+ spdk_json_write_named_uint64(ctx->w, "timed_pollers_count", timed_pollers_count);
+ spdk_json_write_named_uint64(ctx->w, "paused_pollers_count", paused_pollers_count);
+ spdk_json_write_object_end(ctx->w);
+ }
+}
+
+static void
+rpc_thread_get_stats(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ if (params) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "'thread_get_stats' requires no arguments");
+ return;
+ }
+
+ rpc_thread_get_stats_for_each(request, _rpc_thread_get_stats);
+}
+
+SPDK_RPC_REGISTER("thread_get_stats", rpc_thread_get_stats, SPDK_RPC_RUNTIME)
+
+static void
+rpc_get_poller(struct spdk_poller *poller, struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", poller->name);
+ spdk_json_write_named_string(w, "state", spdk_poller_state_str(poller->state));
+ spdk_json_write_named_uint64(w, "run_count", poller->run_count);
+ spdk_json_write_named_uint64(w, "busy_count", poller->busy_count);
+ if (poller->period_ticks) {
+ spdk_json_write_named_uint64(w, "period_ticks", poller->period_ticks);
+ }
+ spdk_json_write_object_end(w);
+}
+
+static void
+_rpc_thread_get_pollers(void *arg)
+{
+ struct rpc_get_stats_ctx *ctx = arg;
+ struct spdk_thread *thread = spdk_get_thread();
+ struct spdk_poller *poller;
+
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_string(ctx->w, "name", spdk_thread_get_name(thread));
+ spdk_json_write_named_uint64(ctx->w, "id", spdk_thread_get_id(thread));
+
+ spdk_json_write_named_array_begin(ctx->w, "active_pollers");
+ TAILQ_FOREACH(poller, &thread->active_pollers, tailq) {
+ rpc_get_poller(poller, ctx->w);
+ }
+ spdk_json_write_array_end(ctx->w);
+
+ spdk_json_write_named_array_begin(ctx->w, "timed_pollers");
+ TAILQ_FOREACH(poller, &thread->timed_pollers, tailq) {
+ rpc_get_poller(poller, ctx->w);
+ }
+ spdk_json_write_array_end(ctx->w);
+
+ spdk_json_write_named_array_begin(ctx->w, "paused_pollers");
+ TAILQ_FOREACH(poller, &thread->paused_pollers, tailq) {
+ rpc_get_poller(poller, ctx->w);
+ }
+ spdk_json_write_array_end(ctx->w);
+
+ spdk_json_write_object_end(ctx->w);
+}
+
+static void
+rpc_thread_get_pollers(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ if (params) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "'thread_get_pollers' requires no arguments");
+ return;
+ }
+
+ rpc_thread_get_stats_for_each(request, _rpc_thread_get_pollers);
+}
+
+SPDK_RPC_REGISTER("thread_get_pollers", rpc_thread_get_pollers, SPDK_RPC_RUNTIME)
+
+static void
+rpc_get_io_channel(struct spdk_io_channel *ch, struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", spdk_io_device_get_name(ch->dev));
+ spdk_json_write_named_uint32(w, "ref", ch->ref);
+ spdk_json_write_object_end(w);
+}
+
+static void
+_rpc_thread_get_io_channels(void *arg)
+{
+ struct rpc_get_stats_ctx *ctx = arg;
+ struct spdk_thread *thread = spdk_get_thread();
+ struct spdk_io_channel *ch;
+
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_string(ctx->w, "name", spdk_thread_get_name(thread));
+
+ spdk_json_write_named_array_begin(ctx->w, "io_channels");
+ TAILQ_FOREACH(ch, &thread->io_channels, tailq) {
+ rpc_get_io_channel(ch, ctx->w);
+ }
+ spdk_json_write_array_end(ctx->w);
+
+ spdk_json_write_object_end(ctx->w);
+}
+
+static void
+rpc_thread_get_io_channels(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ if (params) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "'thread_get_io_channels' requires no arguments");
+ return;
+ }
+
+ rpc_thread_get_stats_for_each(request, _rpc_thread_get_io_channels);
+}
+
+SPDK_RPC_REGISTER("thread_get_io_channels", rpc_thread_get_io_channels, SPDK_RPC_RUNTIME);
+
+static void
+rpc_framework_get_reactors_done(void *arg1, void *arg2)
+{
+ struct rpc_get_stats_ctx *ctx = arg1;
+
+ spdk_json_write_array_end(ctx->w);
+ spdk_json_write_object_end(ctx->w);
+ spdk_jsonrpc_end_result(ctx->request, ctx->w);
+
+ free(ctx);
+}
+
+#define GET_DELTA(end, start) ((end) >= (start) ? (end) - (start) : 0)
+
+static void
+_rpc_framework_get_reactors(void *arg1, void *arg2)
+{
+ struct rpc_get_stats_ctx *ctx = arg1;
+ uint32_t current_core;
+ struct spdk_reactor *reactor;
+ struct spdk_lw_thread *lw_thread;
+ struct spdk_thread *thread;
+
+ current_core = spdk_env_get_current_core();
+ reactor = spdk_reactor_get(current_core);
+
+ assert(reactor != NULL);
+
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_uint32(ctx->w, "lcore", current_core);
+ spdk_json_write_named_uint64(ctx->w, "busy", reactor->busy_tsc);
+ spdk_json_write_named_uint64(ctx->w, "idle", reactor->idle_tsc);
+
+ spdk_json_write_named_array_begin(ctx->w, "lw_threads");
+ TAILQ_FOREACH(lw_thread, &reactor->threads, link) {
+ thread = spdk_thread_get_from_ctx(lw_thread);
+
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_string(ctx->w, "name", spdk_thread_get_name(thread));
+ spdk_json_write_named_uint64(ctx->w, "id", spdk_thread_get_id(thread));
+ spdk_json_write_named_string(ctx->w, "cpumask",
+ spdk_cpuset_fmt(spdk_thread_get_cpumask(thread)));
+ spdk_json_write_named_uint64(ctx->w, "elapsed",
+ GET_DELTA(ctx->now, lw_thread->tsc_start));
+ spdk_json_write_object_end(ctx->w);
+ }
+ spdk_json_write_array_end(ctx->w);
+
+ spdk_json_write_object_end(ctx->w);
+}
+
+static void
+rpc_framework_get_reactors(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_get_stats_ctx *ctx;
+
+ if (params) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "'framework_get_reactors' requires no arguments");
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation error");
+ return;
+ }
+
+ ctx->now = spdk_get_ticks();
+ ctx->request = request;
+ ctx->w = spdk_jsonrpc_begin_result(ctx->request);
+
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_uint64(ctx->w, "tick_rate", spdk_get_ticks_hz());
+ spdk_json_write_named_array_begin(ctx->w, "reactors");
+
+ spdk_for_each_reactor(_rpc_framework_get_reactors, ctx, NULL,
+ rpc_framework_get_reactors_done);
+}
+
+SPDK_RPC_REGISTER("framework_get_reactors", rpc_framework_get_reactors, SPDK_RPC_RUNTIME)
+
+struct rpc_thread_set_cpumask_ctx {
+ struct spdk_jsonrpc_request *request;
+ struct spdk_cpuset cpumask;
+ int status;
+ struct spdk_thread *orig_thread;
+};
+
+static void
+rpc_thread_set_cpumask_done(void *_ctx)
+{
+ struct rpc_thread_set_cpumask_ctx *ctx = _ctx;
+ struct spdk_json_write_ctx *w;
+
+ if (ctx->status == 0) {
+ w = spdk_jsonrpc_begin_result(ctx->request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(ctx->request, w);
+ } else {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(-ctx->status));
+ }
+
+ free(ctx);
+}
+
+static void
+_rpc_thread_set_cpumask(void *_ctx)
+{
+ struct rpc_thread_set_cpumask_ctx *ctx = _ctx;
+
+ ctx->status = spdk_thread_set_cpumask(&ctx->cpumask);
+
+ spdk_thread_send_msg(ctx->orig_thread, rpc_thread_set_cpumask_done, ctx);
+}
+
+struct rpc_thread_set_cpumask {
+ uint64_t id;
+ char *cpumask;
+};
+
+static const struct spdk_json_object_decoder rpc_thread_set_cpumask_decoders[] = {
+ {"id", offsetof(struct rpc_thread_set_cpumask, id), spdk_json_decode_uint64},
+ {"cpumask", offsetof(struct rpc_thread_set_cpumask, cpumask), spdk_json_decode_string},
+};
+
+static void
+rpc_thread_set_cpumask(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_thread_set_cpumask req = {};
+ struct rpc_thread_set_cpumask_ctx *ctx;
+ struct spdk_thread *thread;
+ int rc;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ SPDK_ERRLOG("Memory allocation failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation failed");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_thread_set_cpumask_decoders,
+ SPDK_COUNTOF(rpc_thread_set_cpumask_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "spdk_json_decode_object failed");
+ goto err;
+ }
+
+ thread = spdk_thread_get_by_id(req.id);
+ if (thread == NULL) {
+ SPDK_ERRLOG("Thread %" PRIu64 " does not exist\n", req.id);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Thread %" PRIu64 " does not exist", req.id);
+ goto err;
+ }
+
+ rc = spdk_app_parse_core_mask(req.cpumask, &ctx->cpumask);
+ if (rc != 0) {
+ SPDK_ERRLOG("Invalid cpumask %s\n", req.cpumask);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Invalid cpumask %s", req.cpumask);
+ goto err;
+ }
+
+ if (spdk_cpuset_count(&ctx->cpumask) == 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "No CPU is selected from reactor mask %s",
+ spdk_cpuset_fmt(spdk_app_get_core_mask()));
+ goto err;
+ }
+
+ ctx->request = request;
+ ctx->orig_thread = spdk_get_thread();
+
+ spdk_thread_send_msg(thread, _rpc_thread_set_cpumask, ctx);
+
+ free(req.cpumask);
+ return;
+
+err:
+ free(req.cpumask);
+ free(ctx);
+}
+SPDK_RPC_REGISTER("thread_set_cpumask", rpc_thread_set_cpumask, SPDK_RPC_RUNTIME)
+SPDK_LOG_REGISTER_COMPONENT("APP_RPC", SPDK_LOG_APP_RPC)
diff --git a/src/spdk/module/event/rpc/subsystem_rpc.c b/src/spdk/module/event/rpc/subsystem_rpc.c
new file mode 100644
index 000000000..293493afa
--- /dev/null
+++ b/src/spdk/module/event/rpc/subsystem_rpc.c
@@ -0,0 +1,118 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk_internal/event.h"
+#include "spdk/rpc.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/env.h"
+
+static void
+rpc_framework_get_subsystems(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_subsystem *subsystem;
+ struct spdk_subsystem_depend *deps;
+
+ if (params) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "'framework_get_subsystems' requires no arguments");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ subsystem = spdk_subsystem_get_first();
+ while (subsystem != NULL) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "subsystem", subsystem->name);
+ spdk_json_write_named_array_begin(w, "depends_on");
+ deps = spdk_subsystem_get_first_depend();
+ while (deps != NULL) {
+ if (strcmp(subsystem->name, deps->name) == 0) {
+ spdk_json_write_string(w, deps->depends_on);
+ }
+ deps = spdk_subsystem_get_next_depend(deps);
+ }
+ spdk_json_write_array_end(w);
+ spdk_json_write_object_end(w);
+ subsystem = spdk_subsystem_get_next(subsystem);
+ }
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+SPDK_RPC_REGISTER("framework_get_subsystems", rpc_framework_get_subsystems, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_get_subsystems, get_subsystems)
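+
+/* Illustrative response entry (the set of subsystems depends on the build;
+ * the bdev dependencies shown here match the SPDK_SUBSYSTEM_DEPEND
+ * declarations elsewhere in this tree):
+ *   {"subsystem": "bdev", "depends_on": ["accel", "vmd", "sock"]}
+ */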
+
+struct rpc_framework_get_config_ctx {
+ char *name;
+};
+
+static const struct spdk_json_object_decoder rpc_framework_get_config_ctx[] = {
+ {"name", offsetof(struct rpc_framework_get_config_ctx, name), spdk_json_decode_string},
+};
+
+static void
+rpc_framework_get_config(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_framework_get_config_ctx req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_subsystem *subsystem;
+
+ if (spdk_json_decode_object(params, rpc_framework_get_config_ctx,
+ SPDK_COUNTOF(rpc_framework_get_config_ctx), &req)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid arguments");
+ return;
+ }
+
+ subsystem = spdk_subsystem_find(req.name);
+ if (!subsystem) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Subsystem '%s' not found", req.name);
+ free(req.name);
+ return;
+ }
+
+ free(req.name);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_subsystem_config_json(w, subsystem);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+SPDK_RPC_REGISTER("framework_get_config", rpc_framework_get_config, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_get_config, get_subsystem_config)
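+
+/* Illustrative request ("name" must match a registered subsystem):
+ *   {"jsonrpc": "2.0", "id": 1, "method": "framework_get_config",
+ *    "params": {"name": "bdev"}}
+ * The reply body is whatever the subsystem's write_config_json callback
+ * emits.
+ */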
diff --git a/src/spdk/module/event/subsystems/Makefile b/src/spdk/module/event/subsystems/Makefile
new file mode 100644
index 000000000..a78985ec3
--- /dev/null
+++ b/src/spdk/module/event/subsystems/Makefile
@@ -0,0 +1,61 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y += bdev accel iscsi net nvmf scsi vmd sock
+
+ifeq ($(OS),Linux)
+DIRS-y += nbd
+endif
+
+DIRS-$(CONFIG_VHOST) += vhost
+
+# These dependencies are not based specifically on symbols, but rather
+# the subsystem dependency tree defined within the event subsystem C files
+# themselves. Should that tree change, these dependencies should change
+# accordingly.
+DEPDIRS-bdev := accel vmd sock
+DEPDIRS-iscsi := scsi
+DEPDIRS-nbd := bdev
+DEPDIRS-nvmf := bdev
+DEPDIRS-scsi := bdev
+DEPDIRS-vhost := scsi
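+
+# For example, "DEPDIRS-nbd := bdev" makes the bdev event subsystem build
+# before nbd, mirroring the SPDK_SUBSYSTEM_DEPEND(nbd, bdev) declaration in
+# nbd.c below.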
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/event/subsystems/accel/Makefile b/src/spdk/module/event/subsystems/accel/Makefile
new file mode 100644
index 000000000..6c8045984
--- /dev/null
+++ b/src/spdk/module/event/subsystems/accel/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = accel.c
+LIBNAME = event_accel
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/accel/accel.c b/src/spdk/module/event/subsystems/accel/accel.c
new file mode 100644
index 000000000..957a49686
--- /dev/null
+++ b/src/spdk/module/event/subsystems/accel/accel.c
@@ -0,0 +1,71 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/accel_engine.h"
+
+#include "spdk_internal/event.h"
+#include "spdk/env.h"
+
+static void
+accel_engine_subsystem_initialize(void)
+{
+ int rc;
+
+ rc = spdk_accel_engine_initialize();
+
+ spdk_subsystem_init_next(rc);
+}
+
+static void
+accel_engine_subsystem_finish_done(void *cb_arg)
+{
+ spdk_subsystem_fini_next();
+}
+
+static void
+accel_engine_subsystem_finish(void)
+{
+ spdk_accel_engine_finish(accel_engine_subsystem_finish_done, NULL);
+}
+
+static struct spdk_subsystem g_spdk_subsystem_accel = {
+ .name = "accel",
+ .init = accel_engine_subsystem_initialize,
+ .fini = accel_engine_subsystem_finish,
+ .config = spdk_accel_engine_config_text,
+ .write_config_json = spdk_accel_write_config_json,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_accel);
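+
+/* Every event subsystem in this tree follows the same pattern: .init must
+ * eventually call spdk_subsystem_init_next(rc) and .fini must eventually
+ * call spdk_subsystem_fini_next(), so the framework can walk the dependency
+ * graph one subsystem at a time. A minimal synchronous sketch (hypothetical
+ * "foo" subsystem):
+ *
+ *   static void foo_init(void) { spdk_subsystem_init_next(0); }
+ *   static void foo_fini(void) { spdk_subsystem_fini_next(); }
+ *
+ *   static struct spdk_subsystem g_foo = {
+ *           .name = "foo", .init = foo_init, .fini = foo_fini,
+ *   };
+ *   SPDK_SUBSYSTEM_REGISTER(g_foo);
+ */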
diff --git a/src/spdk/module/event/subsystems/bdev/Makefile b/src/spdk/module/event/subsystems/bdev/Makefile
new file mode 100644
index 000000000..4503e327a
--- /dev/null
+++ b/src/spdk/module/event/subsystems/bdev/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = bdev.c
+LIBNAME = event_bdev
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/bdev/bdev.c b/src/spdk/module/event/subsystems/bdev/bdev.c
new file mode 100644
index 000000000..5776cf273
--- /dev/null
+++ b/src/spdk/module/event/subsystems/bdev/bdev.c
@@ -0,0 +1,84 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+
+#include "spdk_internal/event.h"
+#include "spdk/env.h"
+
+static void
+bdev_initialize_complete(void *cb_arg, int rc)
+{
+ spdk_subsystem_init_next(rc);
+}
+
+static void
+bdev_subsystem_initialize(void)
+{
+ spdk_bdev_initialize(bdev_initialize_complete, NULL);
+}
+
+static void
+bdev_subsystem_finish_done(void *cb_arg)
+{
+ spdk_subsystem_fini_next();
+}
+
+static void
+bdev_subsystem_finish(void)
+{
+ spdk_bdev_finish(bdev_subsystem_finish_done, NULL);
+}
+
+static void
+bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
+{
+ spdk_bdev_subsystem_config_json(w);
+}
+
+static struct spdk_subsystem g_spdk_subsystem_bdev = {
+ .name = "bdev",
+ .init = bdev_subsystem_initialize,
+ .fini = bdev_subsystem_finish,
+ .config = spdk_bdev_config_text,
+ .write_config_json = bdev_subsystem_config_json,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_bdev);
+SPDK_SUBSYSTEM_DEPEND(bdev, accel)
+SPDK_SUBSYSTEM_DEPEND(bdev, vmd)
+SPDK_SUBSYSTEM_DEPEND(bdev, sock)
diff --git a/src/spdk/module/event/subsystems/iscsi/Makefile b/src/spdk/module/event/subsystems/iscsi/Makefile
new file mode 100644
index 000000000..f418f8bd9
--- /dev/null
+++ b/src/spdk/module/event/subsystems/iscsi/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I$(SPDK_ROOT_DIR)/lib
+C_SRCS = iscsi.c
+LIBNAME = event_iscsi
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/iscsi/iscsi.c b/src/spdk/module/event/subsystems/iscsi/iscsi.c
new file mode 100644
index 000000000..cecefd0a5
--- /dev/null
+++ b/src/spdk/module/event/subsystems/iscsi/iscsi.c
@@ -0,0 +1,80 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "iscsi/iscsi.h"
+
+#include "spdk_internal/event.h"
+
+static void
+iscsi_subsystem_init_complete(void *cb_arg, int rc)
+{
+ spdk_subsystem_init_next(rc);
+}
+
+static void
+iscsi_subsystem_init(void)
+{
+ spdk_iscsi_init(iscsi_subsystem_init_complete, NULL);
+}
+
+static void
+iscsi_subsystem_fini_done(void *arg)
+{
+ spdk_subsystem_fini_next();
+}
+
+static void
+iscsi_subsystem_fini(void)
+{
+ spdk_iscsi_fini(iscsi_subsystem_fini_done, NULL);
+}
+
+static void
+iscsi_subsystem_config_json(struct spdk_json_write_ctx *w)
+{
+ spdk_iscsi_config_json(w);
+}
+
+static struct spdk_subsystem g_spdk_subsystem_iscsi = {
+ .name = "iscsi",
+ .init = iscsi_subsystem_init,
+ .fini = iscsi_subsystem_fini,
+ .config = spdk_iscsi_config_text,
+ .write_config_json = iscsi_subsystem_config_json,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_iscsi);
+SPDK_SUBSYSTEM_DEPEND(iscsi, scsi)
+SPDK_SUBSYSTEM_DEPEND(iscsi, sock)
diff --git a/src/spdk/module/event/subsystems/nbd/Makefile b/src/spdk/module/event/subsystems/nbd/Makefile
new file mode 100644
index 000000000..6991ce76d
--- /dev/null
+++ b/src/spdk/module/event/subsystems/nbd/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = nbd.c
+LIBNAME = event_nbd
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/nbd/nbd.c b/src/spdk/module/event/subsystems/nbd/nbd.c
new file mode 100644
index 000000000..dc1c3cfa3
--- /dev/null
+++ b/src/spdk/module/event/subsystems/nbd/nbd.c
@@ -0,0 +1,72 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/nbd.h"
+
+#include "spdk_internal/event.h"
+
+static void
+nbd_subsystem_init(void)
+{
+ int rc;
+
+ rc = spdk_nbd_init();
+
+ spdk_subsystem_init_next(rc);
+}
+
+static void
+nbd_subsystem_fini(void)
+{
+ spdk_nbd_fini();
+ spdk_subsystem_fini_next();
+}
+
+static void
+nbd_subsystem_write_config_json(struct spdk_json_write_ctx *w)
+{
+ spdk_nbd_write_config_json(w);
+}
+
+static struct spdk_subsystem g_spdk_subsystem_nbd = {
+ .name = "nbd",
+ .init = nbd_subsystem_init,
+ .fini = nbd_subsystem_fini,
+ .config = NULL,
+ .write_config_json = nbd_subsystem_write_config_json,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nbd);
+SPDK_SUBSYSTEM_DEPEND(nbd, bdev)
diff --git a/src/spdk/module/event/subsystems/net/Makefile b/src/spdk/module/event/subsystems/net/Makefile
new file mode 100644
index 000000000..b90cffae4
--- /dev/null
+++ b/src/spdk/module/event/subsystems/net/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = net.c
+LIBNAME = event_net
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/net/net.c b/src/spdk/module/event/subsystems/net/net.c
new file mode 100644
index 000000000..76694238a
--- /dev/null
+++ b/src/spdk/module/event/subsystems/net/net.c
@@ -0,0 +1,98 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/net.h"
+
+#include "spdk_internal/event.h"
+
+static void
+interface_subsystem_init(void)
+{
+ int rc;
+
+ rc = spdk_interface_init();
+
+ spdk_subsystem_init_next(rc);
+}
+
+static void
+interface_subsystem_destroy(void)
+{
+ spdk_interface_destroy();
+ spdk_subsystem_fini_next();
+}
+
+static struct spdk_subsystem g_spdk_subsystem_interface = {
+ .name = "interface",
+ .init = interface_subsystem_init,
+ .fini = interface_subsystem_destroy,
+ .config = NULL,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_interface);
+
+static void
+net_start_complete(void *cb_arg, int rc)
+{
+ spdk_subsystem_init_next(rc);
+}
+
+static void
+net_subsystem_start(void)
+{
+ spdk_net_framework_start(net_start_complete, NULL);
+}
+
+static void
+net_fini_done(void *cb_arg)
+{
+ spdk_subsystem_fini_next();
+}
+
+static void
+net_subsystem_fini(void)
+{
+ spdk_net_framework_fini(net_fini_done, NULL);
+}
+
+static struct spdk_subsystem g_spdk_subsystem_net_framework = {
+ .name = "net_framework",
+ .init = net_subsystem_start,
+ .fini = net_subsystem_fini,
+ .config = NULL,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_net_framework);
+SPDK_SUBSYSTEM_DEPEND(net_framework, interface)
diff --git a/src/spdk/module/event/subsystems/nvmf/Makefile b/src/spdk/module/event/subsystems/nvmf/Makefile
new file mode 100644
index 000000000..b51962d2f
--- /dev/null
+++ b/src/spdk/module/event/subsystems/nvmf/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = conf.c nvmf_rpc.c nvmf_tgt.c
+LIBNAME = event_nvmf
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/nvmf/conf.c b/src/spdk/module/event/subsystems/nvmf/conf.c
new file mode 100644
index 000000000..b92a92acc
--- /dev/null
+++ b/src/spdk/module/event/subsystems/nvmf/conf.c
@@ -0,0 +1,709 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2018 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "event_nvmf.h"
+
+#include "spdk/conf.h"
+#include "spdk/log.h"
+#include "spdk/bdev.h"
+#include "spdk/nvme.h"
+#include "spdk/nvmf.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#define SPDK_NVMF_MAX_NAMESPACES (1 << 14)
+
+struct spdk_nvmf_tgt_conf *g_spdk_nvmf_tgt_conf = NULL;
+uint32_t g_spdk_nvmf_tgt_max_subsystems = 0;
+
+static int
+nvmf_add_discovery_subsystem(void)
+{
+ struct spdk_nvmf_subsystem *subsystem;
+
+ subsystem = spdk_nvmf_subsystem_create(g_spdk_nvmf_tgt, SPDK_NVMF_DISCOVERY_NQN,
+ SPDK_NVMF_SUBTYPE_DISCOVERY, 0);
+ if (subsystem == NULL) {
+ SPDK_ERRLOG("Failed creating discovery nvmf library subsystem\n");
+ return -1;
+ }
+
+ spdk_nvmf_subsystem_set_allow_any_host(subsystem, true);
+
+ return 0;
+}
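+
+/* SPDK_NVMF_DISCOVERY_NQN is the well-known discovery NQN defined by the
+ * NVMe-oF specification ("nqn.2014-08.org.nvmexpress.discovery"); every
+ * target exposes it and any host may connect to it, hence allow_any_host
+ * above.
+ */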
+
+static void
+nvmf_read_config_file_tgt_max_subsystems(struct spdk_conf_section *sp,
+ int *deprecated_values)
+{
+ int tgt_max_subsystems;
+ int deprecated;
+
+ tgt_max_subsystems = spdk_conf_section_get_intval(sp, "MaxSubsystems");
+ if (tgt_max_subsystems >= 0) {
+ g_spdk_nvmf_tgt_max_subsystems = tgt_max_subsystems;
+ }
+
+ deprecated = spdk_conf_section_get_intval(sp, "MaxQueueDepth");
+ if (deprecated >= 0) {
+ *deprecated_values = -1;
+ }
+
+ deprecated = spdk_conf_section_get_intval(sp, "MaxQueuesPerSession");
+ if (deprecated >= 0) {
+ *deprecated_values = -1;
+ }
+
+ deprecated = spdk_conf_section_get_intval(sp, "InCapsuleDataSize");
+ if (deprecated >= 0) {
+ *deprecated_values = -1;
+ }
+
+ deprecated = spdk_conf_section_get_intval(sp, "MaxIOSize");
+ if (deprecated >= 0) {
+ *deprecated_values = -1;
+ }
+
+ deprecated = spdk_conf_section_get_intval(sp, "IOUnitSize");
+ if (deprecated >= 0) {
+ *deprecated_values = -1;
+ }
+}
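+
+/* Illustrative legacy [Nvmf] section consumed here (value hypothetical):
+ *   [Nvmf]
+ *     MaxSubsystems 1024
+ * The other keys probed above are only detected so that a deprecation
+ * warning can be printed; they are now per-transport options.
+ */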
+
+static int
+nvmf_read_config_file_tgt_conf(struct spdk_conf_section *sp,
+ struct spdk_nvmf_tgt_conf *conf)
+{
+ int acceptor_poll_rate;
+ const char *conn_scheduler;
+ int rc = 0;
+
+ acceptor_poll_rate = spdk_conf_section_get_intval(sp, "AcceptorPollRate");
+ if (acceptor_poll_rate >= 0) {
+ conf->acceptor_poll_rate = acceptor_poll_rate;
+ }
+
+ conn_scheduler = spdk_conf_section_get_val(sp, "ConnectionScheduler");
+
+ if (conn_scheduler) {
+ SPDK_NOTICELOG("The ConnectionScheduler option is no longer valid. Ignoring it.\n");
+ }
+
+ conf->admin_passthru.identify_ctrlr = spdk_conf_section_get_boolval(sp,
+ "AdminCmdPassthruIdentifyCtrlr", false);
+
+ return rc;
+}
+
+static int
+nvmf_parse_tgt_max_subsystems(void)
+{
+ struct spdk_conf_section *sp;
+ int deprecated_values = 0;
+
+ sp = spdk_conf_find_section(NULL, "Nvmf");
+ if (sp != NULL) {
+ nvmf_read_config_file_tgt_max_subsystems(sp, &deprecated_values);
+ }
+
+ return deprecated_values;
+}
+
+static struct spdk_nvmf_tgt_conf *
+nvmf_parse_tgt_conf(void)
+{
+ struct spdk_nvmf_tgt_conf *conf;
+ struct spdk_conf_section *sp;
+ int rc;
+
+ conf = calloc(1, sizeof(*conf));
+ if (!conf) {
+ SPDK_ERRLOG("calloc() failed for target conf\n");
+ return NULL;
+ }
+
+ conf->acceptor_poll_rate = ACCEPT_TIMEOUT_US;
+ conf->admin_passthru.identify_ctrlr = false;
+
+ sp = spdk_conf_find_section(NULL, "Nvmf");
+ if (sp != NULL) {
+ rc = nvmf_read_config_file_tgt_conf(sp, conf);
+ if (rc) {
+ free(conf);
+ return NULL;
+ }
+ }
+
+ return conf;
+}
+
+static int
+nvmf_parse_nvmf_tgt(void)
+{
+ int rc;
+ int using_deprecated_options;
+ struct spdk_nvmf_target_opts opts = {
+ .name = "nvmf_tgt",
+ .max_subsystems = 0
+ };
+
+ if (!g_spdk_nvmf_tgt_max_subsystems) {
+ using_deprecated_options = nvmf_parse_tgt_max_subsystems();
+ if (using_deprecated_options < 0) {
+ SPDK_ERRLOG("Deprecated options detected for the NVMe-oF target.\n"
+ "The following options are no longer controlled by the target\n"
+ "and should be set in the transport on a per-transport basis:\n"
+ "MaxQueueDepth, MaxQueuesPerSession, InCapsuleDataSize, MaxIOSize, IOUnitSize\n"
+ "This can be accomplished by setting the options through the create_nvmf_transport RPC.\n"
+ "You may also continue to configure these options in the conf file under each transport.\n");
+ }
+ }
+
+ if (!g_spdk_nvmf_tgt_conf) {
+ g_spdk_nvmf_tgt_conf = nvmf_parse_tgt_conf();
+ if (!g_spdk_nvmf_tgt_conf) {
+ SPDK_ERRLOG("nvmf_parse_tgt_conf() failed\n");
+ return -1;
+ }
+ }
+
+ opts.max_subsystems = g_spdk_nvmf_tgt_max_subsystems;
+ g_spdk_nvmf_tgt = spdk_nvmf_tgt_create(&opts);
+
+ g_spdk_nvmf_tgt_max_subsystems = 0;
+
+ if (!g_spdk_nvmf_tgt) {
+ SPDK_ERRLOG("spdk_nvmf_tgt_create() failed\n");
+ return -1;
+ }
+
+ rc = nvmf_add_discovery_subsystem();
+ if (rc != 0) {
+ SPDK_ERRLOG("nvmf_add_discovery_subsystem failed\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+static int
+nvmf_tgt_parse_listen_ip_addr(char *address,
+ struct spdk_nvme_transport_id *trid)
+{
+ char *host;
+ char *port;
+
+ if (spdk_parse_ip_addr(address, &host, &port) < 0) {
+ SPDK_ERRLOG("Unable to parse listen address '%s'\n", address);
+ return -1;
+ }
+
+ if (strchr(host, ':')) {
+ trid->adrfam = SPDK_NVMF_ADRFAM_IPV6;
+ } else {
+ trid->adrfam = SPDK_NVMF_ADRFAM_IPV4;
+ }
+
+ snprintf(trid->traddr, sizeof(trid->traddr), "%s", host);
+ if (port) {
+ snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%s", port);
+ }
+
+ return 0;
+}
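+
+/* Example (hypothetical input): "192.168.0.10:4420" yields adrfam=IPV4,
+ * traddr="192.168.0.10", trsvcid="4420". An address such as "[::1]:4420"
+ * is classified as IPv6 because the parsed host part still contains ':'.
+ */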
+
+static int
+nvmf_tgt_parse_listen_fc_addr(const char *address,
+ struct spdk_nvme_transport_id *trid)
+{
+ /* The FC transport address must have the fixed form "nn-0xWWNN:pn-0xWWPN",
+ * where WWNN and WWPN are each 16 hex digits, for a total length of
+ * exactly 43 bytes.
+ */
+ if (strlen(address) != 43 || strncmp(address, "nn-0x", 5) ||
+ strncmp(&address[21], ":pn-0x", 6)) {
+ SPDK_ERRLOG("Unable to parse fc address '%s'\n", address);
+ return -1;
+ }
+
+ trid->adrfam = SPDK_NVMF_ADRFAM_FC;
+ snprintf(trid->trsvcid, sizeof(trid->trsvcid), "none");
+ snprintf(trid->traddr, sizeof(trid->traddr), "%s", address);
+
+ return 0;
+}
+
+static void
+nvmf_tgt_listen_done(void *cb_arg, int status)
+{
+ /* TODO: Config parsing should wait for this operation to finish. */
+
+ if (status) {
+ SPDK_ERRLOG("Failed to listen on transport address\n");
+ }
+}
+
+static int
+nvmf_parse_subsystem(struct spdk_conf_section *sp)
+{
+ const char *nqn, *mode;
+ size_t i;
+ int ret = -1;
+ int lcore;
+ bool allow_any_host;
+ bool allow_any_listener = true;
+ const char *sn;
+ const char *mn;
+ struct spdk_nvmf_subsystem *subsystem;
+ int num_ns;
+
+ nqn = spdk_conf_section_get_val(sp, "NQN");
+ if (nqn == NULL) {
+ SPDK_ERRLOG("Subsystem missing NQN\n");
+ return -1;
+ }
+
+ mode = spdk_conf_section_get_val(sp, "Mode");
+ lcore = spdk_conf_section_get_intval(sp, "Core");
+ num_ns = spdk_conf_section_get_intval(sp, "MaxNamespaces");
+
+ if (num_ns < 1) {
+ num_ns = 0;
+ } else if (num_ns > SPDK_NVMF_MAX_NAMESPACES) {
+ num_ns = SPDK_NVMF_MAX_NAMESPACES;
+ }
+
+ /* Mode is no longer a valid parameter; if it is present, print an
+ * informational message so users know to remove it.
+ */
+ if (mode) {
+ SPDK_NOTICELOG("Mode present in the [Subsystem] section of the config file.\n"
+ "Mode was removed as a valid parameter.\n");
+ if (strcasecmp(mode, "Virtual") == 0) {
+ SPDK_NOTICELOG("Your mode value is 'Virtual' which is now the only possible mode.\n"
+ "Your configuration file will work as expected.\n");
+ } else {
+ SPDK_NOTICELOG("Please remove Mode from your configuration file.\n");
+ return -1;
+ }
+ }
+
+ /* Core is no longer a valid parameter; if it is present, print an
+ * informational message so users know to remove it.
+ */
+ if (lcore >= 0) {
+ SPDK_NOTICELOG("Core present in the [Subsystem] section of the config file.\n"
+ "Core was removed as an option. Subsystems can now run on all available cores.\n");
+ SPDK_NOTICELOG("Please remove Core from your configuration file. Ignoring it and continuing.\n");
+ }
+
+ sn = spdk_conf_section_get_val(sp, "SN");
+ if (sn == NULL) {
+ SPDK_ERRLOG("Subsystem %s: missing serial number\n", nqn);
+ return -1;
+ }
+
+ subsystem = spdk_nvmf_subsystem_create(g_spdk_nvmf_tgt, nqn, SPDK_NVMF_SUBTYPE_NVME, num_ns);
+ if (subsystem == NULL) {
+ goto done;
+ }
+
+ if (spdk_nvmf_subsystem_set_sn(subsystem, sn)) {
+ SPDK_ERRLOG("Subsystem %s: invalid serial number '%s'\n", nqn, sn);
+ spdk_nvmf_subsystem_destroy(subsystem);
+ subsystem = NULL;
+ goto done;
+ }
+
+ mn = spdk_conf_section_get_val(sp, "MN");
+ if (mn == NULL) {
+ SPDK_NOTICELOG(
+ "Subsystem %s: missing model number, will use default\n",
+ nqn);
+ }
+
+ if (mn != NULL) {
+ if (spdk_nvmf_subsystem_set_mn(subsystem, mn)) {
+ SPDK_ERRLOG("Subsystem %s: invalid model number '%s'\n", nqn, mn);
+ spdk_nvmf_subsystem_destroy(subsystem);
+ subsystem = NULL;
+ goto done;
+ }
+ }
+
+ for (i = 0; ; i++) {
+ struct spdk_nvmf_ns_opts ns_opts;
+ struct spdk_bdev *bdev;
+ const char *bdev_name;
+ const char *uuid_str;
+ char *nsid_str;
+
+ bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0);
+ if (!bdev_name) {
+ break;
+ }
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("Could not find namespace bdev '%s'\n", bdev_name);
+ spdk_nvmf_subsystem_destroy(subsystem);
+ subsystem = NULL;
+ goto done;
+ }
+
+ spdk_nvmf_ns_opts_get_defaults(&ns_opts, sizeof(ns_opts));
+
+ nsid_str = spdk_conf_section_get_nmval(sp, "Namespace", i, 1);
+ if (nsid_str) {
+ char *end;
+ unsigned long nsid_ul = strtoul(nsid_str, &end, 0);
+
+ if (*end != '\0' || nsid_ul == 0 || nsid_ul >= UINT32_MAX) {
+ SPDK_ERRLOG("Invalid NSID %s\n", nsid_str);
+ spdk_nvmf_subsystem_destroy(subsystem);
+ subsystem = NULL;
+ goto done;
+ }
+
+ ns_opts.nsid = (uint32_t)nsid_ul;
+ }
+
+ uuid_str = spdk_conf_section_get_nmval(sp, "Namespace", i, 2);
+ if (uuid_str) {
+ if (spdk_uuid_parse(&ns_opts.uuid, uuid_str)) {
+ SPDK_ERRLOG("Invalid UUID %s\n", uuid_str);
+ spdk_nvmf_subsystem_destroy(subsystem);
+ subsystem = NULL;
+ goto done;
+ }
+ }
+
+ if (spdk_nvmf_subsystem_add_ns(subsystem, bdev, &ns_opts, sizeof(ns_opts), NULL) == 0) {
+ SPDK_ERRLOG("Unable to add namespace\n");
+ spdk_nvmf_subsystem_destroy(subsystem);
+ subsystem = NULL;
+ goto done;
+ }
+ }
+
+ /* Parse Listen sections */
+ for (i = 0; ; i++) {
+ struct spdk_nvme_transport_id trid = {{0}};
+ const char *transport;
+ const char *address;
+ char *address_dup;
+
+ transport = spdk_conf_section_get_nmval(sp, "Listen", i, 0);
+ if (!transport) {
+ break;
+ }
+
+ if (spdk_nvme_transport_id_populate_trstring(&trid, transport)) {
+ SPDK_ERRLOG("Invalid listen address transport type '%s'\n", transport);
+ continue;
+ }
+
+ if (spdk_nvme_transport_id_parse_trtype(&trid.trtype, transport)) {
+ SPDK_ERRLOG("Invalid listen address transport type '%s'\n", transport);
+ continue;
+ }
+
+ address = spdk_conf_section_get_nmval(sp, "Listen", i, 1);
+ if (!address) {
+ break;
+ }
+
+ address_dup = strdup(address);
+ if (!address_dup) {
+ break;
+ }
+
+ if (trid.trtype == SPDK_NVME_TRANSPORT_RDMA ||
+ trid.trtype == SPDK_NVME_TRANSPORT_TCP) {
+ ret = nvmf_tgt_parse_listen_ip_addr(address_dup, &trid);
+ } else if (trid.trtype == SPDK_NVME_TRANSPORT_FC) {
+ ret = nvmf_tgt_parse_listen_fc_addr(address_dup, &trid);
+ }
+
+ free(address_dup);
+
+ if (ret) {
+ continue;
+ }
+
+ if (spdk_nvmf_tgt_listen(g_spdk_nvmf_tgt, &trid)) {
+ SPDK_ERRLOG("Failed to listen on transport address\n");
+ }
+
+ spdk_nvmf_subsystem_add_listener(subsystem, &trid, nvmf_tgt_listen_done, NULL);
+ allow_any_listener = false;
+ }
+
+ spdk_nvmf_subsystem_allow_any_listener(subsystem, allow_any_listener);
+
+ /* Parse Host sections */
+ for (i = 0; ; i++) {
+ const char *host = spdk_conf_section_get_nval(sp, "Host", i);
+
+ if (!host) {
+ break;
+ }
+
+ spdk_nvmf_subsystem_add_host(subsystem, host);
+ }
+
+ allow_any_host = spdk_conf_section_get_boolval(sp, "AllowAnyHost", false);
+ spdk_nvmf_subsystem_set_allow_any_host(subsystem, allow_any_host);
+
+done:
+ return (subsystem != NULL) ? 0 : -1;
+}
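+
+/* For reference, a minimal sketch of the legacy INI-style section this
+ * parser consumes. The SN/MN/Namespace/Listen/Host/AllowAnyHost key names
+ * come from the spdk_conf_section_get_* calls above; the NQN line and all
+ * values are illustrative only:
+ *
+ *   [Subsystem1]
+ *     NQN nqn.2016-06.io.spdk:cnode1
+ *     SN SPDK00000000000001
+ *     MN SPDK_Controller1
+ *     AllowAnyHost No
+ *     Host nqn.2016-06.io.spdk:init
+ *     Listen TCP 127.0.0.1:4420
+ *     Namespace Malloc0 1
+ */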
+
+static int
+nvmf_parse_subsystems(void)
+{
+ int rc = 0;
+ struct spdk_conf_section *sp;
+
+ sp = spdk_conf_first_section(NULL);
+ while (sp != NULL) {
+ if (spdk_conf_section_match_prefix(sp, "Subsystem")) {
+ rc = nvmf_parse_subsystem(sp);
+ if (rc < 0) {
+ return -1;
+ }
+ }
+ sp = spdk_conf_next_section(sp);
+ }
+ return 0;
+}
+
+struct nvmf_parse_transport_ctx {
+ struct spdk_conf_section *sp;
+ nvmf_parse_conf_done_fn cb_fn;
+};
+
+static void nvmf_parse_transport(struct nvmf_parse_transport_ctx *ctx);
+
+static void
+nvmf_tgt_add_transport_done(void *cb_arg, int status)
+{
+ struct nvmf_parse_transport_ctx *ctx = cb_arg;
+ int rc;
+
+ if (status < 0) {
+ SPDK_ERRLOG("Add transport to target failed (%d).\n", status);
+ ctx->cb_fn(status);
+ free(ctx);
+ return;
+ }
+
+ /* find next transport */
+ ctx->sp = spdk_conf_next_section(ctx->sp);
+ while (ctx->sp) {
+ if (spdk_conf_section_match_prefix(ctx->sp, "Transport")) {
+ nvmf_parse_transport(ctx);
+ return;
+ }
+ ctx->sp = spdk_conf_next_section(ctx->sp);
+ }
+
+ /* done with transports, parse Subsystem sections */
+ rc = nvmf_parse_subsystems();
+
+ ctx->cb_fn(rc);
+ free(ctx);
+}
+
+static void
+nvmf_parse_transport(struct nvmf_parse_transport_ctx *ctx)
+{
+ const char *type;
+ struct spdk_nvmf_transport_opts opts = { 0 };
+ enum spdk_nvme_transport_type trtype;
+ struct spdk_nvmf_transport *transport;
+ bool bval;
+ int val;
+
+ type = spdk_conf_section_get_val(ctx->sp, "Type");
+ if (type == NULL) {
+ SPDK_ERRLOG("Transport missing Type\n");
+ ctx->cb_fn(-1);
+ free(ctx);
+ return;
+ }
+
+ spdk_nvme_transport_id_parse_trtype(&trtype, type);
+
+ if (spdk_nvmf_tgt_get_transport(g_spdk_nvmf_tgt, type)) {
+ SPDK_ERRLOG("Duplicate transport type '%s'\n", type);
+ ctx->cb_fn(-1);
+ free(ctx);
+ return;
+ }
+
+ if (!spdk_nvmf_transport_opts_init(type, &opts)) {
+ SPDK_ERRLOG("Failed to initialize transport options for '%s'\n", type);
+ ctx->cb_fn(-1);
+ free(ctx);
+ return;
+ }
+
+ val = spdk_conf_section_get_intval(ctx->sp, "MaxQueueDepth");
+ if (val >= 0) {
+ opts.max_queue_depth = val;
+ }
+ val = spdk_conf_section_get_intval(ctx->sp, "MaxQueuesPerSession");
+ if (val >= 0) {
+ opts.max_qpairs_per_ctrlr = val;
+ }
+ val = spdk_conf_section_get_intval(ctx->sp, "InCapsuleDataSize");
+ if (val >= 0) {
+ opts.in_capsule_data_size = val;
+ }
+ val = spdk_conf_section_get_intval(ctx->sp, "MaxIOSize");
+ if (val >= 0) {
+ opts.max_io_size = val;
+ }
+ val = spdk_conf_section_get_intval(ctx->sp, "IOUnitSize");
+ if (val >= 0) {
+ opts.io_unit_size = val;
+ }
+ val = spdk_conf_section_get_intval(ctx->sp, "MaxAQDepth");
+ if (val >= 0) {
+ opts.max_aq_depth = val;
+ }
+ val = spdk_conf_section_get_intval(ctx->sp, "NumSharedBuffers");
+ if (val >= 0) {
+ opts.num_shared_buffers = val;
+ }
+ val = spdk_conf_section_get_intval(ctx->sp, "BufCacheSize");
+ if (val >= 0) {
+ opts.buf_cache_size = val;
+ }
+
+ if (trtype == SPDK_NVME_TRANSPORT_RDMA) {
+ val = spdk_conf_section_get_intval(ctx->sp, "MaxSRQDepth");
+ if (val >= 0) {
+ opts.max_srq_depth = val;
+ }
+ bval = spdk_conf_section_get_boolval(ctx->sp, "NoSRQ", false);
+ opts.no_srq = bval;
+ }
+
+ if (trtype == SPDK_NVME_TRANSPORT_TCP) {
+ bval = spdk_conf_section_get_boolval(ctx->sp, "C2HSuccess", true);
+ opts.c2h_success = bval;
+
+ val = spdk_conf_section_get_intval(ctx->sp, "SockPriority");
+ if (val >= 0) {
+ opts.sock_priority = val;
+ }
+ }
+
+ bval = spdk_conf_section_get_boolval(ctx->sp, "DifInsertOrStrip", false);
+ opts.dif_insert_or_strip = bval;
+
+ transport = spdk_nvmf_transport_create(type, &opts);
+ if (transport == NULL) {
+ SPDK_ERRLOG("Failed to create transport '%s'\n", type);
+ ctx->cb_fn(-1);
+ free(ctx);
+ return;
+ }
+
+ spdk_nvmf_tgt_add_transport(g_spdk_nvmf_tgt, transport, nvmf_tgt_add_transport_done, ctx);
+}
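+
+/* For reference, a sketch of a legacy [Transport] section covering the
+ * keys parsed above. Values are illustrative; MaxSRQDepth/NoSRQ only
+ * apply to RDMA, and C2HSuccess/SockPriority only to TCP:
+ *
+ *   [Transport]
+ *     Type TCP
+ *     MaxQueueDepth 128
+ *     MaxQueuesPerSession 128
+ *     InCapsuleDataSize 4096
+ *     MaxIOSize 131072
+ *     IOUnitSize 131072
+ *     MaxAQDepth 128
+ *     NumSharedBuffers 511
+ *     BufCacheSize 32
+ *     C2HSuccess Yes
+ *     SockPriority 0
+ *     DifInsertOrStrip No
+ */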
+
+static int
+nvmf_parse_transports(nvmf_parse_conf_done_fn cb_fn)
+{
+ struct nvmf_parse_transport_ctx *ctx;
+
+ ctx = calloc(1, sizeof(struct nvmf_parse_transport_ctx));
+ if (!ctx) {
+ SPDK_ERRLOG("Failed alloc of context memory for parse transports\n");
+ return -ENOMEM;
+ }
+
+ ctx->cb_fn = cb_fn;
+ ctx->sp = spdk_conf_first_section(NULL);
+
+ while (ctx->sp != NULL) {
+ if (spdk_conf_section_match_prefix(ctx->sp, "Transport")) {
+ nvmf_parse_transport(ctx);
+ return 0;
+ }
+ ctx->sp = spdk_conf_next_section(ctx->sp);
+ }
+
+ /* if we get here, there are no transports defined in conf file */
+ free(ctx);
+ cb_fn(0);
+ return 0;
+}
+
+int
+nvmf_parse_conf(nvmf_parse_conf_done_fn cb_fn)
+{
+ int rc;
+
+ if (cb_fn == NULL) {
+ SPDK_ERRLOG("Callback function is NULL\n");
+ return -1;
+ }
+
+ /* NVMf section */
+ rc = nvmf_parse_nvmf_tgt();
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* Transport sections */
+ rc = nvmf_parse_transports(cb_fn);
+ if (rc < 0) {
+ return rc;
+ }
+
+ return 0;
+}
diff --git a/src/spdk/module/event/subsystems/nvmf/event_nvmf.h b/src/spdk/module/event/subsystems/nvmf/event_nvmf.h
new file mode 100644
index 000000000..58d3f713b
--- /dev/null
+++ b/src/spdk/module/event/subsystems/nvmf/event_nvmf.h
@@ -0,0 +1,67 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef NVMF_TGT_H
+#define NVMF_TGT_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/nvmf.h"
+#include "spdk/queue.h"
+
+#include "spdk_internal/event.h"
+#include "spdk_internal/log.h"
+
+#define ACCEPT_TIMEOUT_US 10000 /* 10ms */
+
+struct spdk_nvmf_admin_passthru_conf {
+ bool identify_ctrlr;
+};
+
+struct spdk_nvmf_tgt_conf {
+ uint32_t acceptor_poll_rate;
+ uint32_t conn_sched; /* Deprecated. */
+ struct spdk_nvmf_admin_passthru_conf admin_passthru;
+};
+
+extern struct spdk_nvmf_tgt_conf *g_spdk_nvmf_tgt_conf;
+
+extern uint32_t g_spdk_nvmf_tgt_max_subsystems;
+
+extern struct spdk_nvmf_tgt *g_spdk_nvmf_tgt;
+
+typedef void (*nvmf_parse_conf_done_fn)(int status);
+
+int nvmf_parse_conf(nvmf_parse_conf_done_fn cb_fn);
+
+#endif
diff --git a/src/spdk/module/event/subsystems/nvmf/nvmf_rpc.c b/src/spdk/module/event/subsystems/nvmf/nvmf_rpc.c
new file mode 100644
index 000000000..b16ec6686
--- /dev/null
+++ b/src/spdk/module/event/subsystems/nvmf/nvmf_rpc.c
@@ -0,0 +1,153 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "event_nvmf.h"
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
+static const struct spdk_json_object_decoder nvmf_rpc_subsystem_tgt_opts_decoder[] = {
+ {"max_subsystems", 0, spdk_json_decode_uint32, true}
+};
+
+static void
+rpc_nvmf_set_max_subsystems(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ uint32_t max_subsystems = 0;
+
+ if (g_spdk_nvmf_tgt_max_subsystems != 0) {
+ SPDK_ERRLOG("this RPC must not be called more than once.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Must not call more than once");
+ return;
+ }
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_tgt_opts_decoder,
+ SPDK_COUNTOF(nvmf_rpc_subsystem_tgt_opts_decoder), &max_subsystems)) {
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+ }
+
+ g_spdk_nvmf_tgt_max_subsystems = max_subsystems;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("nvmf_set_max_subsystems", rpc_nvmf_set_max_subsystems,
+ SPDK_RPC_STARTUP)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_set_max_subsystems, set_nvmf_target_max_subsystems)
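+
+/* Example request for this startup-time RPC (illustrative values):
+ *
+ *   {"jsonrpc": "2.0", "id": 1, "method": "nvmf_set_max_subsystems",
+ *    "params": {"max_subsystems": 1024}}
+ */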
+
+static int
+decode_conn_sched(const struct spdk_json_val *val, void *out)
+{
+ *(uint32_t *)out = 0;
+
+ SPDK_NOTICELOG("conn_sched is no longer a supported parameter. Ignoring.\n");
+
+ return 0;
+}
+
+static const struct spdk_json_object_decoder admin_passthru_decoder[] = {
+ {"identify_ctrlr", offsetof(struct spdk_nvmf_admin_passthru_conf, identify_ctrlr), spdk_json_decode_bool}
+};
+
+static int
+decode_admin_passthru(const struct spdk_json_val *val, void *out)
+{
+ struct spdk_nvmf_admin_passthru_conf *req = (struct spdk_nvmf_admin_passthru_conf *)out;
+
+ if (spdk_json_decode_object(val, admin_passthru_decoder,
+ SPDK_COUNTOF(admin_passthru_decoder),
+ req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static const struct spdk_json_object_decoder nvmf_rpc_subsystem_tgt_conf_decoder[] = {
+ {"acceptor_poll_rate", offsetof(struct spdk_nvmf_tgt_conf, acceptor_poll_rate), spdk_json_decode_uint32, true},
+ {"conn_sched", offsetof(struct spdk_nvmf_tgt_conf, conn_sched), decode_conn_sched, true},
+ {"admin_cmd_passthru", offsetof(struct spdk_nvmf_tgt_conf, admin_passthru), decode_admin_passthru, true}
+};
+
+static void
+rpc_nvmf_set_config(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_nvmf_tgt_conf *conf;
+ struct spdk_json_write_ctx *w;
+
+ if (g_spdk_nvmf_tgt_conf != NULL) {
+ SPDK_ERRLOG("this RPC must not be called more than once.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Must not call more than once");
+ return;
+ }
+
+ conf = calloc(1, sizeof(*conf));
+ if (conf == NULL) {
+ SPDK_ERRLOG("calloc() failed for target config\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Out of memory");
+ return;
+ }
+
+ conf->acceptor_poll_rate = ACCEPT_TIMEOUT_US;
+ conf->admin_passthru.identify_ctrlr = false;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_tgt_conf_decoder,
+ SPDK_COUNTOF(nvmf_rpc_subsystem_tgt_conf_decoder), conf)) {
+ free(conf);
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+ }
+
+ g_spdk_nvmf_tgt_conf = conf;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("nvmf_set_config", rpc_nvmf_set_config, SPDK_RPC_STARTUP)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_set_config, set_nvmf_target_config)
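+
+/* Example request covering the decoders above (illustrative values;
+ * conn_sched is still accepted but ignored):
+ *
+ *   {"jsonrpc": "2.0", "id": 1, "method": "nvmf_set_config",
+ *    "params": {"acceptor_poll_rate": 10000,
+ *               "admin_cmd_passthru": {"identify_ctrlr": true}}}
+ */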
diff --git a/src/spdk/module/event/subsystems/nvmf/nvmf_tgt.c b/src/spdk/module/event/subsystems/nvmf/nvmf_tgt.c
new file mode 100644
index 000000000..0ffac50c6
--- /dev/null
+++ b/src/spdk/module/event/subsystems/nvmf/nvmf_tgt.c
@@ -0,0 +1,476 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "event_nvmf.h"
+
+#include "spdk/bdev.h"
+#include "spdk/event.h"
+#include "spdk/thread.h"
+#include "spdk/log.h"
+#include "spdk/nvme.h"
+#include "spdk/nvmf_cmd.h"
+#include "spdk/util.h"
+
+enum nvmf_tgt_state {
+ NVMF_TGT_INIT_NONE = 0,
+ NVMF_TGT_INIT_PARSE_CONFIG,
+ NVMF_TGT_INIT_CREATE_POLL_GROUPS,
+ NVMF_TGT_INIT_START_SUBSYSTEMS,
+ NVMF_TGT_INIT_START_ACCEPTOR,
+ NVMF_TGT_RUNNING,
+ NVMF_TGT_FINI_STOP_SUBSYSTEMS,
+ NVMF_TGT_FINI_DESTROY_POLL_GROUPS,
+ NVMF_TGT_FINI_STOP_ACCEPTOR,
+ NVMF_TGT_FINI_FREE_RESOURCES,
+ NVMF_TGT_STOPPED,
+ NVMF_TGT_ERROR,
+};
+
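+/* Lifecycle sketch, as implemented by nvmf_tgt_advance_state() below:
+ *
+ *   init: INIT_NONE -> INIT_PARSE_CONFIG -> INIT_CREATE_POLL_GROUPS ->
+ *         INIT_START_SUBSYSTEMS -> INIT_START_ACCEPTOR -> RUNNING
+ *   fini: RUNNING -> FINI_STOP_SUBSYSTEMS -> FINI_DESTROY_POLL_GROUPS ->
+ *         FINI_STOP_ACCEPTOR -> FINI_FREE_RESOURCES -> STOPPED
+ *
+ * A config parse failure lands in NVMF_TGT_ERROR; shutting down from that
+ * state skips straight to FINI_FREE_RESOURCES (see nvmf_shutdown_cb()).
+ */
+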
+struct nvmf_tgt_poll_group {
+ struct spdk_nvmf_poll_group *group;
+ struct spdk_thread *thread;
+ TAILQ_ENTRY(nvmf_tgt_poll_group) link;
+};
+
+struct spdk_nvmf_tgt *g_spdk_nvmf_tgt = NULL;
+
+static enum nvmf_tgt_state g_tgt_state;
+
+static struct spdk_thread *g_tgt_init_thread = NULL;
+static struct spdk_thread *g_tgt_fini_thread = NULL;
+
+static TAILQ_HEAD(, nvmf_tgt_poll_group) g_poll_groups = TAILQ_HEAD_INITIALIZER(g_poll_groups);
+static size_t g_num_poll_groups = 0;
+
+static struct spdk_poller *g_acceptor_poller = NULL;
+
+static void nvmf_tgt_advance_state(void);
+
+static void
+nvmf_shutdown_cb(void *arg1)
+{
+ /* Still in initialization state, defer shutdown operation */
+ if (g_tgt_state < NVMF_TGT_RUNNING) {
+ spdk_thread_send_msg(spdk_get_thread(), nvmf_shutdown_cb, NULL);
+ return;
+ } else if (g_tgt_state != NVMF_TGT_RUNNING && g_tgt_state != NVMF_TGT_ERROR) {
+ /* Already in Shutdown status, ignore the signal */
+ return;
+ }
+
+ if (g_tgt_state == NVMF_TGT_ERROR) {
+ /* Parse configuration error */
+ g_tgt_state = NVMF_TGT_FINI_FREE_RESOURCES;
+ } else {
+ g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS;
+ }
+ nvmf_tgt_advance_state();
+}
+
+static void
+nvmf_subsystem_fini(void)
+{
+ nvmf_shutdown_cb(NULL);
+}
+
+static int
+acceptor_poll(void *arg)
+{
+ struct spdk_nvmf_tgt *tgt = arg;
+ uint32_t count;
+
+ count = spdk_nvmf_tgt_accept(tgt);
+
+ if (count > 0) {
+ return SPDK_POLLER_BUSY;
+ } else {
+ return SPDK_POLLER_IDLE;
+ }
+}
+
+static void
+_nvmf_tgt_destroy_poll_group_done(void *ctx)
+{
+ assert(g_num_poll_groups > 0);
+
+ if (--g_num_poll_groups == 0) {
+ g_tgt_state = NVMF_TGT_FINI_STOP_ACCEPTOR;
+ nvmf_tgt_advance_state();
+ }
+}
+
+static void
+nvmf_tgt_destroy_poll_group_done(void *cb_arg, int status)
+{
+ struct nvmf_tgt_poll_group *pg = cb_arg;
+
+ free(pg);
+
+ spdk_thread_send_msg(g_tgt_fini_thread, _nvmf_tgt_destroy_poll_group_done, NULL);
+
+ spdk_thread_exit(spdk_get_thread());
+}
+
+static void
+nvmf_tgt_destroy_poll_group(void *ctx)
+{
+ struct nvmf_tgt_poll_group *pg = ctx;
+
+ spdk_nvmf_poll_group_destroy(pg->group, nvmf_tgt_destroy_poll_group_done, pg);
+}
+
+static void
+nvmf_tgt_destroy_poll_groups(void)
+{
+ struct nvmf_tgt_poll_group *pg, *tpg;
+
+ g_tgt_fini_thread = spdk_get_thread();
+ assert(g_tgt_fini_thread != NULL);
+
+ TAILQ_FOREACH_SAFE(pg, &g_poll_groups, link, tpg) {
+ TAILQ_REMOVE(&g_poll_groups, pg, link);
+ spdk_thread_send_msg(pg->thread, nvmf_tgt_destroy_poll_group, pg);
+ }
+}
+
+static void
+nvmf_tgt_create_poll_group_done(void *ctx)
+{
+ struct nvmf_tgt_poll_group *pg = ctx;
+
+ TAILQ_INSERT_TAIL(&g_poll_groups, pg, link);
+
+ assert(g_num_poll_groups < spdk_env_get_core_count());
+
+ if (++g_num_poll_groups == spdk_env_get_core_count()) {
+ g_tgt_state = NVMF_TGT_INIT_START_SUBSYSTEMS;
+ nvmf_tgt_advance_state();
+ }
+}
+
+static void
+nvmf_tgt_create_poll_group(void *ctx)
+{
+ struct nvmf_tgt_poll_group *pg;
+
+ pg = calloc(1, sizeof(*pg));
+ if (!pg) {
+ SPDK_ERRLOG("Not enough memory to allocate poll groups\n");
+ spdk_app_stop(-ENOMEM);
+ return;
+ }
+
+ pg->thread = spdk_get_thread();
+ pg->group = spdk_nvmf_poll_group_create(g_spdk_nvmf_tgt);
+
+ spdk_thread_send_msg(g_tgt_init_thread, nvmf_tgt_create_poll_group_done, pg);
+}
+
+static void
+nvmf_tgt_create_poll_groups(void)
+{
+ struct spdk_cpuset tmp_cpumask = {};
+ uint32_t i;
+ char thread_name[32];
+ struct spdk_thread *thread;
+
+ g_tgt_init_thread = spdk_get_thread();
+ assert(g_tgt_init_thread != NULL);
+
+ SPDK_ENV_FOREACH_CORE(i) {
+ spdk_cpuset_zero(&tmp_cpumask);
+ spdk_cpuset_set_cpu(&tmp_cpumask, i, true);
+ snprintf(thread_name, sizeof(thread_name), "nvmf_tgt_poll_group_%u", i);
+
+ thread = spdk_thread_create(thread_name, &tmp_cpumask);
+ assert(thread != NULL);
+
+ spdk_thread_send_msg(thread, nvmf_tgt_create_poll_group, NULL);
+ }
+}
+
+static void
+nvmf_tgt_subsystem_started(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ subsystem = spdk_nvmf_subsystem_get_next(subsystem);
+
+ if (subsystem) {
+ spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL);
+ return;
+ }
+
+ g_tgt_state = NVMF_TGT_INIT_START_ACCEPTOR;
+ nvmf_tgt_advance_state();
+}
+
+static void
+nvmf_tgt_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ subsystem = spdk_nvmf_subsystem_get_next(subsystem);
+
+ if (subsystem) {
+ spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL);
+ return;
+ }
+
+ g_tgt_state = NVMF_TGT_FINI_DESTROY_POLL_GROUPS;
+ nvmf_tgt_advance_state();
+}
+
+static void
+nvmf_tgt_destroy_done(void *ctx, int status)
+{
+ g_tgt_state = NVMF_TGT_STOPPED;
+
+ free(g_spdk_nvmf_tgt_conf);
+ g_spdk_nvmf_tgt_conf = NULL;
+ nvmf_tgt_advance_state();
+}
+
+static void
+nvmf_tgt_parse_conf_done(int status)
+{
+ g_tgt_state = (status == 0) ? NVMF_TGT_INIT_CREATE_POLL_GROUPS : NVMF_TGT_ERROR;
+ nvmf_tgt_advance_state();
+}
+
+static void
+nvmf_tgt_parse_conf_start(void *ctx)
+{
+ if (nvmf_parse_conf(nvmf_tgt_parse_conf_done)) {
+ SPDK_ERRLOG("nvmf_parse_conf() failed\n");
+ g_tgt_state = NVMF_TGT_ERROR;
+ nvmf_tgt_advance_state();
+ }
+}
+
+static void
+fixup_identify_ctrlr(struct spdk_nvmf_request *req)
+{
+ uint32_t length;
+ int rc;
+ struct spdk_nvme_ctrlr_data *nvme_cdata;
+ struct spdk_nvme_ctrlr_data nvmf_cdata = {};
+ struct spdk_nvmf_ctrlr *ctrlr = spdk_nvmf_request_get_ctrlr(req);
+ struct spdk_nvme_cpl *rsp = spdk_nvmf_request_get_response(req);
+
+ /* This is the identify data from the NVMe drive */
+ spdk_nvmf_request_get_data(req, (void **)&nvme_cdata, &length);
+
+ /* Get the NVMF identify data */
+ rc = spdk_nvmf_ctrlr_identify_ctrlr(ctrlr, &nvmf_cdata);
+ if (rc != SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) {
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return;
+ }
+
+ /* Fixup NVMF identify data with NVMe identify data */
+
+ /* Serial Number (SN) */
+ memcpy(&nvmf_cdata.sn[0], &nvme_cdata->sn[0], sizeof(nvmf_cdata.sn));
+ /* Model Number (MN) */
+ memcpy(&nvmf_cdata.mn[0], &nvme_cdata->mn[0], sizeof(nvmf_cdata.mn));
+ /* Firmware Revision (FR) */
+ memcpy(&nvmf_cdata.fr[0], &nvme_cdata->fr[0], sizeof(nvmf_cdata.fr));
+ /* IEEE OUI Identifier (IEEE) */
+ memcpy(&nvmf_cdata.ieee[0], &nvme_cdata->ieee[0], sizeof(nvmf_cdata.ieee));
+ /* FRU Globally Unique Identifier (FGUID) */
+
+ /* Copy the fixed up data back to the response */
+ memcpy(nvme_cdata, &nvmf_cdata, length);
+}
+
+static int
+nvmf_custom_identify_hdlr(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = spdk_nvmf_request_get_cmd(req);
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *desc;
+ struct spdk_io_channel *ch;
+ struct spdk_nvmf_subsystem *subsys;
+ int rc;
+
+ if (cmd->cdw10_bits.identify.cns != SPDK_NVME_IDENTIFY_CTRLR) {
+ return -1; /* continue */
+ }
+
+ subsys = spdk_nvmf_request_get_subsystem(req);
+ if (subsys == NULL) {
+ return -1;
+ }
+
+ /* Only process this request if it has exactly one namespace */
+ if (spdk_nvmf_subsystem_get_max_nsid(subsys) != 1) {
+ return -1;
+ }
+
+ /* Forward to the first namespace if it supports NVMe admin commands */
+ rc = spdk_nvmf_request_get_bdev(1, req, &bdev, &desc, &ch);
+ if (rc) {
+ /* No bdev found for this namespace. Continue. */
+ return -1;
+ }
+
+ if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN)) {
+ return -1;
+ }
+
+ return spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(bdev, desc, ch, req, fixup_identify_ctrlr);
+}
+
+static void
+nvmf_tgt_advance_state(void)
+{
+ enum nvmf_tgt_state prev_state;
+ int rc = -1;
+
+ do {
+ prev_state = g_tgt_state;
+
+ switch (g_tgt_state) {
+ case NVMF_TGT_INIT_NONE: {
+ g_tgt_state = NVMF_TGT_INIT_PARSE_CONFIG;
+ break;
+ }
+ case NVMF_TGT_INIT_PARSE_CONFIG:
+ /* Send a message to ourselves to invoke the config parser. This
+ * guarantees the parse-done callback cannot run before this
+ * function returns, which would confuse the state machine. */
+ spdk_thread_send_msg(spdk_get_thread(), nvmf_tgt_parse_conf_start, NULL);
+ break;
+ case NVMF_TGT_INIT_CREATE_POLL_GROUPS:
+ /* Config parsed */
+ if (g_spdk_nvmf_tgt_conf->admin_passthru.identify_ctrlr) {
+ SPDK_NOTICELOG("Custom identify ctrlr handler enabled\n");
+ spdk_nvmf_set_custom_admin_cmd_hdlr(SPDK_NVME_OPC_IDENTIFY, nvmf_custom_identify_hdlr);
+ }
+ /* Create poll group threads, and send a message to each thread
+ * and create a poll group.
+ */
+ nvmf_tgt_create_poll_groups();
+ break;
+ case NVMF_TGT_INIT_START_SUBSYSTEMS: {
+ struct spdk_nvmf_subsystem *subsystem;
+
+ subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt);
+
+ if (subsystem) {
+ spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL);
+ } else {
+ g_tgt_state = NVMF_TGT_INIT_START_ACCEPTOR;
+ }
+ break;
+ }
+ case NVMF_TGT_INIT_START_ACCEPTOR:
+ g_acceptor_poller = SPDK_POLLER_REGISTER(acceptor_poll, g_spdk_nvmf_tgt,
+ g_spdk_nvmf_tgt_conf->acceptor_poll_rate);
+ g_tgt_state = NVMF_TGT_RUNNING;
+ break;
+ case NVMF_TGT_RUNNING:
+ spdk_subsystem_init_next(0);
+ break;
+ case NVMF_TGT_FINI_STOP_SUBSYSTEMS: {
+ struct spdk_nvmf_subsystem *subsystem;
+
+ subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt);
+
+ if (subsystem) {
+ spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL);
+ } else {
+ g_tgt_state = NVMF_TGT_FINI_DESTROY_POLL_GROUPS;
+ }
+ break;
+ }
+ case NVMF_TGT_FINI_DESTROY_POLL_GROUPS:
+ /* Send a message to each poll group thread, and terminate the thread */
+ nvmf_tgt_destroy_poll_groups();
+ break;
+ case NVMF_TGT_FINI_STOP_ACCEPTOR:
+ spdk_poller_unregister(&g_acceptor_poller);
+ g_tgt_state = NVMF_TGT_FINI_FREE_RESOURCES;
+ break;
+ case NVMF_TGT_FINI_FREE_RESOURCES:
+ spdk_nvmf_tgt_destroy(g_spdk_nvmf_tgt, nvmf_tgt_destroy_done, NULL);
+ break;
+ case NVMF_TGT_STOPPED:
+ spdk_subsystem_fini_next();
+ return;
+ case NVMF_TGT_ERROR:
+ spdk_subsystem_init_next(rc);
+ return;
+ }
+
+ } while (g_tgt_state != prev_state);
+}
+
+static void
+nvmf_subsystem_init(void)
+{
+ g_tgt_state = NVMF_TGT_INIT_NONE;
+ nvmf_tgt_advance_state();
+}
+
+static void
+nvmf_subsystem_write_config_json(struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_array_begin(w);
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "nvmf_set_config");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_uint32(w, "acceptor_poll_rate", g_spdk_nvmf_tgt_conf->acceptor_poll_rate);
+ spdk_json_write_named_object_begin(w, "admin_cmd_passthru");
+ spdk_json_write_named_bool(w, "identify_ctrlr",
+ g_spdk_nvmf_tgt_conf->admin_passthru.identify_ctrlr);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+
+ spdk_nvmf_tgt_write_config_json(w, g_spdk_nvmf_tgt);
+ spdk_json_write_array_end(w);
+}
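+
+/* The emitted array looks roughly like this (values depend on the live
+ * config; spdk_nvmf_tgt_write_config_json() appends further entries for
+ * transports and subsystems):
+ *
+ *   [{"method": "nvmf_set_config",
+ *     "params": {"acceptor_poll_rate": 10000,
+ *                "admin_cmd_passthru": {"identify_ctrlr": false}}},
+ *    ...]
+ */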
+
+static struct spdk_subsystem g_spdk_subsystem_nvmf = {
+ .name = "nvmf",
+ .init = nvmf_subsystem_init,
+ .fini = nvmf_subsystem_fini,
+ .write_config_json = nvmf_subsystem_write_config_json,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nvmf)
+SPDK_SUBSYSTEM_DEPEND(nvmf, bdev)
+SPDK_SUBSYSTEM_DEPEND(nvmf, sock)
diff --git a/src/spdk/module/event/subsystems/scsi/Makefile b/src/spdk/module/event/subsystems/scsi/Makefile
new file mode 100644
index 000000000..5d7fc3038
--- /dev/null
+++ b/src/spdk/module/event/subsystems/scsi/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = scsi.c
+LIBNAME = event_scsi
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/scsi/scsi.c b/src/spdk/module/event/subsystems/scsi/scsi.c
new file mode 100644
index 000000000..f63dc783b
--- /dev/null
+++ b/src/spdk/module/event/subsystems/scsi/scsi.c
@@ -0,0 +1,65 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/scsi.h"
+
+#include "spdk_internal/event.h"
+
+static void
+scsi_subsystem_init(void)
+{
+ int rc;
+
+ rc = spdk_scsi_init();
+
+ spdk_subsystem_init_next(rc);
+}
+
+static void
+scsi_subsystem_fini(void)
+{
+ spdk_scsi_fini();
+ spdk_subsystem_fini_next();
+}
+
+static struct spdk_subsystem g_spdk_subsystem_scsi = {
+ .name = "scsi",
+ .init = scsi_subsystem_init,
+ .fini = scsi_subsystem_fini,
+ .config = NULL,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_scsi);
+SPDK_SUBSYSTEM_DEPEND(scsi, bdev)
diff --git a/src/spdk/module/event/subsystems/sock/Makefile b/src/spdk/module/event/subsystems/sock/Makefile
new file mode 100644
index 000000000..5a137d88d
--- /dev/null
+++ b/src/spdk/module/event/subsystems/sock/Makefile
@@ -0,0 +1,44 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 1
+SO_MINOR := 0
+
+C_SRCS = sock.c
+LIBNAME = event_sock
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/sock/sock.c b/src/spdk/module/event/subsystems/sock/sock.c
new file mode 100644
index 000000000..fdcb2160a
--- /dev/null
+++ b/src/spdk/module/event/subsystems/sock/sock.c
@@ -0,0 +1,62 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/sock.h"
+#include "spdk_internal/event.h"
+
+static void
+sock_subsystem_init(void)
+{
+ spdk_subsystem_init_next(0);
+}
+
+static void
+sock_subsystem_fini(void)
+{
+ spdk_subsystem_fini_next();
+}
+
+static void
+sock_subsystem_write_config_json(struct spdk_json_write_ctx *w)
+{
+ spdk_sock_write_config_json(w);
+}
+
+static struct spdk_subsystem g_spdk_subsystem_sock = {
+ .name = "sock",
+ .init = sock_subsystem_init,
+ .fini = sock_subsystem_fini,
+ .write_config_json = sock_subsystem_write_config_json,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_sock);
diff --git a/src/spdk/module/event/subsystems/vhost/Makefile b/src/spdk/module/event/subsystems/vhost/Makefile
new file mode 100644
index 000000000..a31bba91f
--- /dev/null
+++ b/src/spdk/module/event/subsystems/vhost/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = vhost.c
+LIBNAME = event_vhost
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/vhost/vhost.c b/src/spdk/module/event/subsystems/vhost/vhost.c
new file mode 100644
index 000000000..a0c386d98
--- /dev/null
+++ b/src/spdk/module/event/subsystems/vhost/vhost.c
@@ -0,0 +1,73 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/vhost.h"
+
+#include "spdk_internal/event.h"
+
+static void
+vhost_subsystem_init_done(int rc)
+{
+ spdk_subsystem_init_next(rc);
+}
+
+static void
+vhost_subsystem_init(void)
+{
+ spdk_vhost_init(vhost_subsystem_init_done);
+}
+
+static void
+vhost_subsystem_fini_done(void)
+{
+ spdk_subsystem_fini_next();
+}
+
+static void
+vhost_subsystem_fini(void)
+{
+ spdk_vhost_fini(vhost_subsystem_fini_done);
+}
+
+static struct spdk_subsystem g_spdk_subsystem_vhost = {
+ .name = "vhost",
+ .init = vhost_subsystem_init,
+ .fini = vhost_subsystem_fini,
+ .config = NULL,
+ .write_config_json = spdk_vhost_config_json,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vhost);
+SPDK_SUBSYSTEM_DEPEND(vhost, scsi)
diff --git a/src/spdk/module/event/subsystems/vmd/Makefile b/src/spdk/module/event/subsystems/vmd/Makefile
new file mode 100644
index 000000000..2089a2b9d
--- /dev/null
+++ b/src/spdk/module/event/subsystems/vmd/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = vmd.c vmd_rpc.c
+LIBNAME = event_vmd
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/event/subsystems/vmd/event_vmd.h b/src/spdk/module/event/subsystems/vmd/event_vmd.h
new file mode 100644
index 000000000..2f4b93f95
--- /dev/null
+++ b/src/spdk/module/event/subsystems/vmd/event_vmd.h
@@ -0,0 +1,39 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef EVENT_VMD_H
+#define EVENT_VMD_H
+
+int vmd_subsystem_init(void);
+
+#endif
diff --git a/src/spdk/module/event/subsystems/vmd/vmd.c b/src/spdk/module/event/subsystems/vmd/vmd.c
new file mode 100644
index 000000000..55eb47019
--- /dev/null
+++ b/src/spdk/module/event/subsystems/vmd/vmd.c
@@ -0,0 +1,132 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/conf.h"
+#include "spdk/thread.h"
+#include "spdk/likely.h"
+
+#include "spdk/vmd.h"
+
+#include "spdk_internal/event.h"
+#include "event_vmd.h"
+
+static struct spdk_poller *g_hotplug_poller;
+static bool g_enabled;
+
+static int
+vmd_hotplug_monitor(void *ctx)
+{
+ return spdk_vmd_hotplug_monitor();
+}
+
+int
+vmd_subsystem_init(void)
+{
+ int rc;
+
+ if (g_enabled) {
+ SPDK_ERRLOG("The initialization has already been performed\n");
+ return -EBUSY;
+ }
+
+ rc = spdk_vmd_init();
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Failed to initialize the VMD library\n");
+ return rc;
+ }
+
+ assert(g_hotplug_poller == NULL);
+
+ g_hotplug_poller = SPDK_POLLER_REGISTER(vmd_hotplug_monitor, NULL, 1000000ULL);
+ if (g_hotplug_poller == NULL) {
+ SPDK_ERRLOG("Failed to register hotplug monitor poller\n");
+ return -ENOMEM;
+ }
+
+ g_enabled = true;
+
+ return 0;
+}
+
+static void
+_vmd_subsystem_init(void)
+{
+ struct spdk_conf_section *sp;
+ int rc = 0;
+
+ sp = spdk_conf_find_section(NULL, "Vmd");
+ if (sp != NULL) {
+ if (spdk_conf_section_get_boolval(sp, "Enable", false)) {
+ rc = vmd_subsystem_init();
+ }
+ }
+
+ spdk_subsystem_init_next(rc);
+}
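+
+/* A sketch of the legacy config section this checks for (boolean syntax
+ * illustrative):
+ *
+ *   [Vmd]
+ *     Enable True
+ */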
+
+static void
+vmd_subsystem_fini(void)
+{
+ spdk_poller_unregister(&g_hotplug_poller);
+
+ spdk_vmd_fini();
+
+ spdk_subsystem_fini_next();
+}
+
+static void
+vmd_write_config_json(struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_array_begin(w);
+
+ if (g_enabled) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "enable_vmd");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_array_end(w);
+}
+
+static struct spdk_subsystem g_spdk_subsystem_vmd = {
+ .name = "vmd",
+ .init = _vmd_subsystem_init,
+ .fini = vmd_subsystem_fini,
+ .config = NULL,
+ .write_config_json = vmd_write_config_json,
+};
+
+SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vmd);
diff --git a/src/spdk/module/event/subsystems/vmd/vmd_rpc.c b/src/spdk/module/event/subsystems/vmd/vmd_rpc.c
new file mode 100644
index 000000000..35843e688
--- /dev/null
+++ b/src/spdk/module/event/subsystems/vmd/vmd_rpc.c
@@ -0,0 +1,55 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/vmd.h"
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+#include "event_vmd.h"
+
+static void
+rpc_vmd_enable(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ rc = vmd_subsystem_init();
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, rc == 0);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+SPDK_RPC_REGISTER("enable_vmd", rpc_vmd_enable, SPDK_RPC_STARTUP)
diff --git a/src/spdk/module/sock/Makefile b/src/spdk/module/sock/Makefile
new file mode 100644
index 000000000..865743d06
--- /dev/null
+++ b/src/spdk/module/sock/Makefile
@@ -0,0 +1,48 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+DIRS-y = posix
+ifeq ($(OS), Linux)
+DIRS-$(CONFIG_URING) += uring
+endif
+DIRS-$(CONFIG_VPP) += vpp
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/module/sock/posix/Makefile b/src/spdk/module/sock/posix/Makefile
new file mode 100644
index 000000000..9783e024d
--- /dev/null
+++ b/src/spdk/module/sock/posix/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+LIBNAME = sock_posix
+C_SRCS = posix.c
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/sock/posix/posix.c b/src/spdk/module/sock/posix/posix.c
new file mode 100644
index 000000000..4eb1bf106
--- /dev/null
+++ b/src/spdk/module/sock/posix/posix.c
@@ -0,0 +1,1405 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#if defined(__linux__)
+#include <sys/epoll.h>
+#include <linux/errqueue.h>
+#elif defined(__FreeBSD__)
+#include <sys/event.h>
+#endif
+
+#include "spdk/log.h"
+#include "spdk/pipe.h"
+#include "spdk/sock.h"
+#include "spdk/util.h"
+#include "spdk/likely.h"
+#include "spdk_internal/sock.h"
+
+#define MAX_TMPBUF 1024
+#define PORTNUMLEN 32
+#define MIN_SO_RCVBUF_SIZE (2 * 1024 * 1024)
+#define MIN_SO_SNDBUF_SIZE (2 * 1024 * 1024)
+#define IOV_BATCH_SIZE 64
+
+#if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY)
+#define SPDK_ZEROCOPY
+#endif
+
+struct spdk_posix_sock {
+ struct spdk_sock base;
+ int fd;
+
+ uint32_t sendmsg_idx;
+ bool zcopy;
+
+ struct spdk_pipe *recv_pipe;
+ void *recv_buf;
+ int recv_buf_sz;
+ bool pending_recv;
+ int so_priority;
+
+ TAILQ_ENTRY(spdk_posix_sock) link;
+};
+
+struct spdk_posix_sock_group_impl {
+ struct spdk_sock_group_impl base;
+ int fd;
+ TAILQ_HEAD(, spdk_posix_sock) pending_recv;
+};
+
+static struct spdk_sock_impl_opts g_spdk_posix_sock_impl_opts = {
+ .recv_buf_size = MIN_SO_RCVBUF_SIZE,
+ .send_buf_size = MIN_SO_SNDBUF_SIZE,
+ .enable_recv_pipe = true,
+ .enable_zerocopy_send = true
+};
+
+static int
+get_addr_str(struct sockaddr *sa, char *host, size_t hlen)
+{
+ const char *result = NULL;
+
+ if (sa == NULL || host == NULL) {
+ return -1;
+ }
+
+ switch (sa->sa_family) {
+ case AF_INET:
+ result = inet_ntop(AF_INET, &(((struct sockaddr_in *)sa)->sin_addr),
+ host, hlen);
+ break;
+ case AF_INET6:
+ result = inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)sa)->sin6_addr),
+ host, hlen);
+ break;
+ default:
+ break;
+ }
+
+ if (result != NULL) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+#define __posix_sock(sock) (struct spdk_posix_sock *)sock
+#define __posix_group_impl(group) (struct spdk_posix_sock_group_impl *)group
+
+static int
+posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport,
+ char *caddr, int clen, uint16_t *cport)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ int rc;
+
+ assert(sock != NULL);
+
+ memset(&sa, 0, sizeof sa);
+ salen = sizeof sa;
+ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+ return -1;
+ }
+
+ switch (sa.ss_family) {
+ case AF_UNIX:
+ /* Acceptable connection types that don't have IPs */
+ return 0;
+ case AF_INET:
+ case AF_INET6:
+ /* Code below will get IP addresses */
+ break;
+ default:
+ /* Unsupported socket family */
+ return -1;
+ }
+
+ rc = get_addr_str((struct sockaddr *)&sa, saddr, slen);
+ if (rc != 0) {
+		SPDK_ERRLOG("get_addr_str() failed (errno=%d)\n", errno);
+ return -1;
+ }
+
+ if (sport) {
+ if (sa.ss_family == AF_INET) {
+ *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
+ } else if (sa.ss_family == AF_INET6) {
+ *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
+ }
+ }
+
+ memset(&sa, 0, sizeof sa);
+ salen = sizeof sa;
+ rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno);
+ return -1;
+ }
+
+ rc = get_addr_str((struct sockaddr *)&sa, caddr, clen);
+ if (rc != 0) {
+		SPDK_ERRLOG("get_addr_str() failed (errno=%d)\n", errno);
+ return -1;
+ }
+
+ if (cport) {
+ if (sa.ss_family == AF_INET) {
+ *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
+ } else if (sa.ss_family == AF_INET6) {
+ *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
+ }
+ }
+
+ return 0;
+}
+
+enum posix_sock_create_type {
+ SPDK_SOCK_CREATE_LISTEN,
+ SPDK_SOCK_CREATE_CONNECT,
+};
+
+static int
+posix_sock_alloc_pipe(struct spdk_posix_sock *sock, int sz)
+{
+ uint8_t *new_buf;
+ struct spdk_pipe *new_pipe;
+ struct iovec siov[2];
+ struct iovec diov[2];
+ int sbytes;
+ ssize_t bytes;
+
+ if (sock->recv_buf_sz == sz) {
+ return 0;
+ }
+
+ /* If the new size is 0, just free the pipe */
+ if (sz == 0) {
+ spdk_pipe_destroy(sock->recv_pipe);
+ free(sock->recv_buf);
+ sock->recv_pipe = NULL;
+ sock->recv_buf = NULL;
+ return 0;
+ } else if (sz < MIN_SOCK_PIPE_SIZE) {
+		SPDK_ERRLOG("The size of the pipe must be at least %d\n", MIN_SOCK_PIPE_SIZE);
+ return -1;
+ }
+
+ /* Round up to next 64 byte multiple */
+ new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t));
+ if (!new_buf) {
+ SPDK_ERRLOG("socket recv buf allocation failed\n");
+ return -ENOMEM;
+ }
+
+ new_pipe = spdk_pipe_create(new_buf, sz + 1);
+ if (new_pipe == NULL) {
+ SPDK_ERRLOG("socket pipe allocation failed\n");
+ free(new_buf);
+ return -ENOMEM;
+ }
+
+ if (sock->recv_pipe != NULL) {
+ /* Pull all of the data out of the old pipe */
+ sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
+ if (sbytes > sz) {
+ /* Too much data to fit into the new pipe size */
+ spdk_pipe_destroy(new_pipe);
+ free(new_buf);
+ return -EINVAL;
+ }
+
+ sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov);
+ assert(sbytes == sz);
+
+ bytes = spdk_iovcpy(siov, 2, diov, 2);
+ spdk_pipe_writer_advance(new_pipe, bytes);
+
+ spdk_pipe_destroy(sock->recv_pipe);
+ free(sock->recv_buf);
+ }
+
+ sock->recv_buf_sz = sz;
+ sock->recv_buf = new_buf;
+ sock->recv_pipe = new_pipe;
+
+ return 0;
+}
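+
+/* Illustrative sketch, not compiled into the module: the basic produce/consume
+ * cycle on an spdk_pipe, which is what the recv pipe above is resized around.
+ * It assumes only the public spdk/pipe.h API already used in this file. */
+#if 0
+static void
+pipe_usage_sketch(void)
+{
+	static uint8_t buf[4097]; /* sz + 1, as in posix_sock_alloc_pipe() */
+	struct spdk_pipe *pipe = spdk_pipe_create(buf, sizeof(buf));
+	struct iovec iov[2];
+	int n;
+
+	/* Producer: reserve writable space (may wrap, hence two iovs), fill
+	 * it, then commit the bytes actually written. */
+	n = spdk_pipe_writer_get_buffer(pipe, 4096, iov);
+	/* ... copy up to n bytes into iov[0]/iov[1] ... */
+	spdk_pipe_writer_advance(pipe, n);
+
+	/* Consumer: view readable space, drain it, then release it. */
+	n = spdk_pipe_reader_get_buffer(pipe, 4096, iov);
+	/* ... consume up to n bytes from iov[0]/iov[1] ... */
+	spdk_pipe_reader_advance(pipe, n);
+
+	spdk_pipe_destroy(pipe);
+}
+#endif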
+
+static int
+posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ int rc;
+
+ assert(sock != NULL);
+
+ if (g_spdk_posix_sock_impl_opts.enable_recv_pipe) {
+ rc = posix_sock_alloc_pipe(sock, sz);
+ if (rc) {
+ return rc;
+ }
+ }
+
+ /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE */
+ if (sz < MIN_SO_RCVBUF_SIZE) {
+ sz = MIN_SO_RCVBUF_SIZE;
+ }
+
+ rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
+ if (rc < 0) {
+ return rc;
+ }
+
+ return 0;
+}
+
+static int
+posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ int rc;
+
+ assert(sock != NULL);
+
+ if (sz < MIN_SO_SNDBUF_SIZE) {
+ sz = MIN_SO_SNDBUF_SIZE;
+ }
+
+ rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
+ if (rc < 0) {
+ return rc;
+ }
+
+ return 0;
+}
+
+static struct spdk_posix_sock *
+posix_sock_alloc(int fd, bool enable_zero_copy)
+{
+ struct spdk_posix_sock *sock;
+#ifdef SPDK_ZEROCOPY
+ int rc;
+ int flag;
+#endif
+
+ sock = calloc(1, sizeof(*sock));
+ if (sock == NULL) {
+ SPDK_ERRLOG("sock allocation failed\n");
+ return NULL;
+ }
+
+ sock->fd = fd;
+
+#ifdef SPDK_ZEROCOPY
+ if (!enable_zero_copy || !g_spdk_posix_sock_impl_opts.enable_zerocopy_send) {
+ return sock;
+ }
+
+ /* Try to turn on zero copy sends */
+ flag = 1;
+ rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag));
+ if (rc == 0) {
+ sock->zcopy = true;
+ }
+#endif
+
+ return sock;
+}
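+
+/* Illustrative sketch, not compiled into the module: the plain-socket sequence
+ * behind the zero copy opt-in above. SO_ZEROCOPY requires Linux >= 4.14; once
+ * the setsockopt() succeeds, later sendmsg() calls may pass MSG_ZEROCOPY. */
+#if 0
+static bool
+try_enable_zcopy_sketch(int fd)
+{
+	int one = 1;
+
+	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) != 0) {
+		/* Old kernel or unsupported socket type; fall back to copies. */
+		return false;
+	}
+
+	/* From here on, sendmsg(fd, &msg, MSG_ZEROCOPY) pins the user pages
+	 * and reports completion on the error queue (see _sock_check_zcopy). */
+	return true;
+}
+#endif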
+
+static bool
+sock_is_loopback(int fd)
+{
+ struct ifaddrs *addrs, *tmp;
+ struct sockaddr_storage sa = {};
+ socklen_t salen;
+ struct ifreq ifr = {};
+ char ip_addr[256], ip_addr_tmp[256];
+ int rc;
+ bool is_loopback = false;
+
+ salen = sizeof(sa);
+ rc = getsockname(fd, (struct sockaddr *)&sa, &salen);
+ if (rc != 0) {
+ return is_loopback;
+ }
+
+ memset(ip_addr, 0, sizeof(ip_addr));
+ rc = get_addr_str((struct sockaddr *)&sa, ip_addr, sizeof(ip_addr));
+ if (rc != 0) {
+ return is_loopback;
+ }
+
+	rc = getifaddrs(&addrs);
+	if (rc != 0) {
+		return is_loopback;
+	}
+ for (tmp = addrs; tmp != NULL; tmp = tmp->ifa_next) {
+ if (tmp->ifa_addr && (tmp->ifa_flags & IFF_UP) &&
+ (tmp->ifa_addr->sa_family == sa.ss_family)) {
+ memset(ip_addr_tmp, 0, sizeof(ip_addr_tmp));
+ rc = get_addr_str(tmp->ifa_addr, ip_addr_tmp, sizeof(ip_addr_tmp));
+ if (rc != 0) {
+ continue;
+ }
+
+ if (strncmp(ip_addr, ip_addr_tmp, sizeof(ip_addr)) == 0) {
+				snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", tmp->ifa_name);
+ ioctl(fd, SIOCGIFFLAGS, &ifr);
+ if (ifr.ifr_flags & IFF_LOOPBACK) {
+ is_loopback = true;
+ }
+ goto end;
+ }
+ }
+ }
+
+end:
+ freeifaddrs(addrs);
+ return is_loopback;
+}
+
+static struct spdk_sock *
+posix_sock_create(const char *ip, int port,
+ enum posix_sock_create_type type,
+ struct spdk_sock_opts *opts)
+{
+ struct spdk_posix_sock *sock;
+ char buf[MAX_TMPBUF];
+ char portnum[PORTNUMLEN];
+ char *p;
+ struct addrinfo hints, *res, *res0;
+ int fd, flag;
+ int val = 1;
+ int rc, sz;
+ bool enable_zero_copy = true;
+
+ if (ip == NULL) {
+ return NULL;
+ }
+ if (ip[0] == '[') {
+ snprintf(buf, sizeof(buf), "%s", ip + 1);
+ p = strchr(buf, ']');
+ if (p != NULL) {
+ *p = '\0';
+ }
+ ip = (const char *) &buf[0];
+ }
+
+ snprintf(portnum, sizeof portnum, "%d", port);
+ memset(&hints, 0, sizeof hints);
+ hints.ai_family = PF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_flags = AI_NUMERICSERV;
+ hints.ai_flags |= AI_PASSIVE;
+ hints.ai_flags |= AI_NUMERICHOST;
+ rc = getaddrinfo(ip, portnum, &hints, &res0);
+ if (rc != 0) {
+		SPDK_ERRLOG("getaddrinfo() failed: %s (%d)\n", gai_strerror(rc), rc);
+ return NULL;
+ }
+
+	/* try each resolved address until one can be set up */
+ fd = -1;
+ for (res = res0; res != NULL; res = res->ai_next) {
+retry:
+ fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
+ if (fd < 0) {
+ /* error */
+ continue;
+ }
+
+ sz = g_spdk_posix_sock_impl_opts.recv_buf_size;
+ rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
+ if (rc) {
+ /* Not fatal */
+ }
+
+ sz = g_spdk_posix_sock_impl_opts.send_buf_size;
+ rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
+ if (rc) {
+ /* Not fatal */
+ }
+
+ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val);
+ if (rc != 0) {
+ close(fd);
+ /* error */
+ continue;
+ }
+ rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val);
+ if (rc != 0) {
+ close(fd);
+ /* error */
+ continue;
+ }
+
+#if defined(SO_PRIORITY)
+ if (opts != NULL && opts->priority) {
+ rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val);
+ if (rc != 0) {
+ close(fd);
+ /* error */
+ continue;
+ }
+ }
+#endif
+
+ if (res->ai_family == AF_INET6) {
+ rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val);
+ if (rc != 0) {
+ close(fd);
+ /* error */
+ continue;
+ }
+ }
+
+ if (type == SPDK_SOCK_CREATE_LISTEN) {
+ rc = bind(fd, res->ai_addr, res->ai_addrlen);
+ if (rc != 0) {
+ SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno);
+ switch (errno) {
+ case EINTR:
+					/* interrupted by a signal; retry */
+ close(fd);
+ goto retry;
+ case EADDRNOTAVAIL:
+ SPDK_ERRLOG("IP address %s not available. "
+ "Verify IP address in config file "
+ "and make sure setup script is "
+ "run before starting spdk app.\n", ip);
+ /* FALLTHROUGH */
+ default:
+ /* try next family */
+ close(fd);
+ fd = -1;
+ continue;
+ }
+ }
+ /* bind OK */
+ rc = listen(fd, 512);
+ if (rc != 0) {
+ SPDK_ERRLOG("listen() failed, errno = %d\n", errno);
+ close(fd);
+ fd = -1;
+ break;
+ }
+ } else if (type == SPDK_SOCK_CREATE_CONNECT) {
+ rc = connect(fd, res->ai_addr, res->ai_addrlen);
+ if (rc != 0) {
+ SPDK_ERRLOG("connect() failed, errno = %d\n", errno);
+ /* try next family */
+ close(fd);
+ fd = -1;
+ continue;
+ }
+ }
+
+ flag = fcntl(fd, F_GETFL);
+ if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
+ close(fd);
+ fd = -1;
+ break;
+ }
+ break;
+ }
+ freeaddrinfo(res0);
+
+ if (fd < 0) {
+ return NULL;
+ }
+
+ if (type == SPDK_SOCK_CREATE_LISTEN) {
+ /* Only enable zero copy for non-loopback sockets. */
+ enable_zero_copy = !sock_is_loopback(fd);
+ } else if (type == SPDK_SOCK_CREATE_CONNECT) {
+ /* Disable zero copy for client sockets until support is added */
+ enable_zero_copy = false;
+ }
+
+ sock = posix_sock_alloc(fd, enable_zero_copy);
+ if (sock == NULL) {
+ SPDK_ERRLOG("sock allocation failed\n");
+ close(fd);
+ return NULL;
+ }
+
+ if (opts != NULL) {
+ sock->so_priority = opts->priority;
+ }
+ return &sock->base;
+}
+
+static struct spdk_sock *
+posix_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts)
+{
+ return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts);
+}
+
+static struct spdk_sock *
+posix_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts)
+{
+ return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts);
+}
+
+static struct spdk_sock *
+posix_sock_accept(struct spdk_sock *_sock)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ int rc, fd;
+ struct spdk_posix_sock *new_sock;
+ int flag;
+
+ memset(&sa, 0, sizeof(sa));
+ salen = sizeof(sa);
+
+ assert(sock != NULL);
+
+ rc = accept(sock->fd, (struct sockaddr *)&sa, &salen);
+
+ if (rc == -1) {
+ return NULL;
+ }
+
+ fd = rc;
+
+ flag = fcntl(fd, F_GETFL);
+ if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
+ close(fd);
+ return NULL;
+ }
+
+#if defined(SO_PRIORITY)
+	/* SO_PRIORITY is not inherited, so set it explicitly on the accepted fd */
+ if (sock->base.opts.priority) {
+ rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int));
+ if (rc != 0) {
+ close(fd);
+ return NULL;
+ }
+ }
+#endif
+
+ /* Inherit the zero copy feature from the listen socket */
+ new_sock = posix_sock_alloc(fd, sock->zcopy);
+ if (new_sock == NULL) {
+ close(fd);
+ return NULL;
+ }
+ new_sock->so_priority = sock->base.opts.priority;
+
+ return &new_sock->base;
+}
+
+static int
+posix_sock_close(struct spdk_sock *_sock)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+
+ assert(TAILQ_EMPTY(&_sock->pending_reqs));
+
+ /* If the socket fails to close, the best choice is to
+ * leak the fd but continue to free the rest of the sock
+ * memory. */
+ close(sock->fd);
+
+ spdk_pipe_destroy(sock->recv_pipe);
+ free(sock->recv_buf);
+ free(sock);
+
+ return 0;
+}
+
+#ifdef SPDK_ZEROCOPY
+static int
+_sock_check_zcopy(struct spdk_sock *sock)
+{
+ struct spdk_posix_sock *psock = __posix_sock(sock);
+ struct msghdr msgh = {};
+ uint8_t buf[sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)];
+ ssize_t rc;
+ struct sock_extended_err *serr;
+ struct cmsghdr *cm;
+ uint32_t idx;
+ struct spdk_sock_request *req, *treq;
+ bool found;
+
+ msgh.msg_control = buf;
+ msgh.msg_controllen = sizeof(buf);
+
+ while (true) {
+ rc = recvmsg(psock->fd, &msgh, MSG_ERRQUEUE);
+
+ if (rc < 0) {
+ if (errno == EWOULDBLOCK || errno == EAGAIN) {
+ return 0;
+ }
+
+ if (!TAILQ_EMPTY(&sock->pending_reqs)) {
+ SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries\n");
+ } else {
+ SPDK_WARNLOG("Recvmsg yielded an error!\n");
+ }
+ return 0;
+ }
+
+ cm = CMSG_FIRSTHDR(&msgh);
+ if (!cm || cm->cmsg_level != SOL_IP || cm->cmsg_type != IP_RECVERR) {
+ SPDK_WARNLOG("Unexpected cmsg level or type!\n");
+ return 0;
+ }
+
+ serr = (struct sock_extended_err *)CMSG_DATA(cm);
+ if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+			SPDK_WARNLOG("Unexpected extended error origin or errno\n");
+ return 0;
+ }
+
+ /* Most of the time, the pending_reqs array is in the exact
+ * order we need such that all of the requests to complete are
+ * in order, in the front. It is guaranteed that all requests
+ * belonging to the same sendmsg call are sequential, so once
+ * we encounter one match we can stop looping as soon as a
+ * non-match is found.
+ */
+ for (idx = serr->ee_info; idx <= serr->ee_data; idx++) {
+ found = false;
+ TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) {
+ if (req->internal.offset == idx) {
+ found = true;
+
+ rc = spdk_sock_request_put(sock, req, 0);
+ if (rc < 0) {
+ return rc;
+ }
+
+ } else if (found) {
+ break;
+ }
+ }
+
+ }
+ }
+
+ return 0;
+}
+#endif
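+
+/* Illustrative sketch, not compiled into the module: the kernel's MSG_ZEROCOPY
+ * completion protocol that _sock_check_zcopy() implements. Each zero copy
+ * sendmsg() is numbered 0, 1, 2, ... and completions arrive on the error queue
+ * as a sock_extended_err carrying an inclusive [ee_info, ee_data] range. */
+#if 0
+static void
+zcopy_completion_sketch(int fd)
+{
+	char control[CMSG_SPACE(sizeof(struct sock_extended_err))];
+	struct msghdr msg = { .msg_control = control, .msg_controllen = sizeof(control) };
+	struct cmsghdr *cm;
+	struct sock_extended_err *serr;
+
+	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0) {
+		return; /* typically EAGAIN: no completions pending */
+	}
+
+	cm = CMSG_FIRSTHDR(&msg);
+	if (cm == NULL || cm->cmsg_level != SOL_IP || cm->cmsg_type != IP_RECVERR) {
+		return; /* not a zero copy notification (IPv4 case shown) */
+	}
+
+	serr = (struct sock_extended_err *)CMSG_DATA(cm);
+	if (serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
+		/* sends serr->ee_info through serr->ee_data have completed and
+		 * their buffers may now be reused or freed */
+	}
+}
+#endif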
+
+static int
+_sock_flush(struct spdk_sock *sock)
+{
+ struct spdk_posix_sock *psock = __posix_sock(sock);
+ struct msghdr msg = {};
+ int flags;
+ struct iovec iovs[IOV_BATCH_SIZE];
+ int iovcnt;
+ int retval;
+ struct spdk_sock_request *req;
+ int i;
+ ssize_t rc;
+ unsigned int offset;
+ size_t len;
+
+ /* Can't flush from within a callback or we end up with recursive calls */
+ if (sock->cb_cnt > 0) {
+ return 0;
+ }
+
+ /* Gather an iov */
+ iovcnt = 0;
+ req = TAILQ_FIRST(&sock->queued_reqs);
+ while (req) {
+ offset = req->internal.offset;
+
+ for (i = 0; i < req->iovcnt; i++) {
+ /* Consume any offset first */
+ if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
+ offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
+ continue;
+ }
+
+ iovs[iovcnt].iov_base = SPDK_SOCK_REQUEST_IOV(req, i)->iov_base + offset;
+ iovs[iovcnt].iov_len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
+ iovcnt++;
+
+ offset = 0;
+
+ if (iovcnt >= IOV_BATCH_SIZE) {
+ break;
+ }
+ }
+
+ if (iovcnt >= IOV_BATCH_SIZE) {
+ break;
+ }
+
+ req = TAILQ_NEXT(req, internal.link);
+ }
+
+ if (iovcnt == 0) {
+ return 0;
+ }
+
+ /* Perform the vectored write */
+ msg.msg_iov = iovs;
+ msg.msg_iovlen = iovcnt;
+#ifdef SPDK_ZEROCOPY
+ if (psock->zcopy) {
+ flags = MSG_ZEROCOPY;
+ } else
+#endif
+ {
+ flags = 0;
+ }
+ rc = sendmsg(psock->fd, &msg, flags);
+ if (rc <= 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return 0;
+ }
+ return rc;
+ }
+
+ psock->sendmsg_idx++;
+
+ /* Consume the requests that were actually written */
+ req = TAILQ_FIRST(&sock->queued_reqs);
+ while (req) {
+ offset = req->internal.offset;
+
+ for (i = 0; i < req->iovcnt; i++) {
+ /* Advance by the offset first */
+ if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
+ offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
+ continue;
+ }
+
+ /* Calculate the remaining length of this element */
+ len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
+
+ if (len > (size_t)rc) {
+ /* This element was partially sent. */
+ req->internal.offset += rc;
+ return 0;
+ }
+
+ offset = 0;
+ req->internal.offset += len;
+ rc -= len;
+ }
+
+ /* Handled a full request. */
+ spdk_sock_request_pend(sock, req);
+
+ if (!psock->zcopy) {
+ /* The sendmsg syscall above isn't currently asynchronous,
+ * so it's already done. */
+ retval = spdk_sock_request_put(sock, req, 0);
+ if (retval) {
+ break;
+ }
+ } else {
+ /* Re-use the offset field to hold the sendmsg call index. The
+ * index is 0 based, so subtract one here because we've already
+ * incremented above. */
+ req->internal.offset = psock->sendmsg_idx - 1;
+ }
+
+ if (rc == 0) {
+ break;
+ }
+
+ req = TAILQ_FIRST(&sock->queued_reqs);
+ }
+
+ return 0;
+}
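+
+/* Illustrative sketch, not compiled into the module: the offset accounting the
+ * two loops in _sock_flush() share. Given a request's iovs, the cumulative
+ * bytes already sent, and the bytes a short sendmsg() just wrote, it returns
+ * the new cumulative offset (the sum of all iov_lens once fully sent). */
+#if 0
+static size_t
+advance_offset_sketch(struct iovec *iovs, int iovcnt, size_t sent_so_far, size_t rc)
+{
+	size_t skip = sent_so_far;
+	size_t len;
+	int i;
+
+	for (i = 0; i < iovcnt; i++) {
+		len = iovs[i].iov_len;
+
+		if (skip >= len) {
+			/* this element was finished by an earlier send */
+			skip -= len;
+			continue;
+		}
+
+		if (len - skip > rc) {
+			/* partial send within this element; resume here next time */
+			return sent_so_far + rc;
+		}
+
+		rc -= len - skip;
+		sent_so_far += len - skip;
+		skip = 0;
+	}
+
+	return sent_so_far; /* request fully sent */
+}
+#endif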
+
+static int
+posix_sock_flush(struct spdk_sock *_sock)
+{
+ return _sock_flush(_sock);
+}
+
+static ssize_t
+posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int diovcnt)
+{
+ struct iovec siov[2];
+ int sbytes;
+ ssize_t bytes;
+ struct spdk_posix_sock_group_impl *group;
+
+ sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
+ if (sbytes < 0) {
+ errno = EINVAL;
+ return -1;
+ } else if (sbytes == 0) {
+ errno = EAGAIN;
+ return -1;
+ }
+
+ bytes = spdk_iovcpy(siov, 2, diov, diovcnt);
+
+ if (bytes == 0) {
+ /* The only way this happens is if diov is 0 length */
+ errno = EINVAL;
+ return -1;
+ }
+
+ spdk_pipe_reader_advance(sock->recv_pipe, bytes);
+
+ /* If we drained the pipe, take it off the level-triggered list */
+ if (sock->base.group_impl && spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
+ group = __posix_group_impl(sock->base.group_impl);
+ TAILQ_REMOVE(&group->pending_recv, sock, link);
+ sock->pending_recv = false;
+ }
+
+ return bytes;
+}
+
+static inline ssize_t
+posix_sock_read(struct spdk_posix_sock *sock)
+{
+ struct iovec iov[2];
+ int bytes;
+ struct spdk_posix_sock_group_impl *group;
+
+ bytes = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov);
+
+ if (bytes > 0) {
+ bytes = readv(sock->fd, iov, 2);
+ if (bytes > 0) {
+ spdk_pipe_writer_advance(sock->recv_pipe, bytes);
+ if (sock->base.group_impl) {
+ group = __posix_group_impl(sock->base.group_impl);
+ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+ sock->pending_recv = true;
+ }
+ }
+ }
+
+ return bytes;
+}
+
+static ssize_t
+posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ int rc, i;
+ size_t len;
+
+ if (sock->recv_pipe == NULL) {
+ return readv(sock->fd, iov, iovcnt);
+ }
+
+ len = 0;
+ for (i = 0; i < iovcnt; i++) {
+ len += iov[i].iov_len;
+ }
+
+ if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
+ /* If the user is receiving a sufficiently large amount of data,
+ * receive directly to their buffers. */
+ if (len >= MIN_SOCK_PIPE_SIZE) {
+ return readv(sock->fd, iov, iovcnt);
+ }
+
+ /* Otherwise, do a big read into our pipe */
+ rc = posix_sock_read(sock);
+ if (rc <= 0) {
+ return rc;
+ }
+ }
+
+ return posix_sock_recv_from_pipe(sock, iov, iovcnt);
+}
+
+static ssize_t
+posix_sock_recv(struct spdk_sock *sock, void *buf, size_t len)
+{
+ struct iovec iov[1];
+
+ iov[0].iov_base = buf;
+ iov[0].iov_len = len;
+
+ return posix_sock_readv(sock, iov, 1);
+}
+
+static ssize_t
+posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ int rc;
+
+ /* In order to process a writev, we need to flush any asynchronous writes
+ * first. */
+ rc = _sock_flush(_sock);
+ if (rc < 0) {
+ return rc;
+ }
+
+ if (!TAILQ_EMPTY(&_sock->queued_reqs)) {
+ /* We weren't able to flush all requests */
+ errno = EAGAIN;
+ return -1;
+ }
+
+ return writev(sock->fd, iov, iovcnt);
+}
+
+static void
+posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req)
+{
+ int rc;
+
+ spdk_sock_request_queue(sock, req);
+
+ /* If there are a sufficient number queued, just flush them out immediately. */
+ if (sock->queued_iovcnt >= IOV_BATCH_SIZE) {
+ rc = _sock_flush(sock);
+ if (rc) {
+ spdk_sock_abort_requests(sock);
+ }
+ }
+}
+
+static int
+posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ int val;
+ int rc;
+
+ assert(sock != NULL);
+
+ val = nbytes;
+ rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val);
+ if (rc != 0) {
+ return -1;
+ }
+ return 0;
+}
+
+static bool
+posix_sock_is_ipv6(struct spdk_sock *_sock)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ int rc;
+
+ assert(sock != NULL);
+
+ memset(&sa, 0, sizeof sa);
+ salen = sizeof sa;
+ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+ return false;
+ }
+
+ return (sa.ss_family == AF_INET6);
+}
+
+static bool
+posix_sock_is_ipv4(struct spdk_sock *_sock)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ int rc;
+
+ assert(sock != NULL);
+
+ memset(&sa, 0, sizeof sa);
+ salen = sizeof sa;
+ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+ return false;
+ }
+
+ return (sa.ss_family == AF_INET);
+}
+
+static bool
+posix_sock_is_connected(struct spdk_sock *_sock)
+{
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ uint8_t byte;
+ int rc;
+
+ rc = recv(sock->fd, &byte, 1, MSG_PEEK);
+ if (rc == 0) {
+ return false;
+ }
+
+ if (rc < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return true;
+ }
+
+ return false;
+ }
+
+ return true;
+}
+
+static int
+posix_sock_get_placement_id(struct spdk_sock *_sock, int *placement_id)
+{
+ int rc = -1;
+
+#if defined(SO_INCOMING_NAPI_ID)
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ socklen_t salen = sizeof(int);
+
+ rc = getsockopt(sock->fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, placement_id, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getsockopt() failed (errno=%d)\n", errno);
+ }
+
+#endif
+ return rc;
+}
+
+static struct spdk_sock_group_impl *
+posix_sock_group_impl_create(void)
+{
+ struct spdk_posix_sock_group_impl *group_impl;
+ int fd;
+
+#if defined(__linux__)
+ fd = epoll_create1(0);
+#elif defined(__FreeBSD__)
+ fd = kqueue();
+#endif
+ if (fd == -1) {
+ return NULL;
+ }
+
+ group_impl = calloc(1, sizeof(*group_impl));
+ if (group_impl == NULL) {
+ SPDK_ERRLOG("group_impl allocation failed\n");
+ close(fd);
+ return NULL;
+ }
+
+ group_impl->fd = fd;
+ TAILQ_INIT(&group_impl->pending_recv);
+
+ return &group_impl->base;
+}
+
+static int
+posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock)
+{
+ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ int rc;
+
+#if defined(__linux__)
+ struct epoll_event event;
+
+ memset(&event, 0, sizeof(event));
+ /* EPOLLERR is always on even if we don't set it, but be explicit for clarity */
+ event.events = EPOLLIN | EPOLLERR;
+ event.data.ptr = sock;
+
+ rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event);
+#elif defined(__FreeBSD__)
+ struct kevent event;
+ struct timespec ts = {0};
+
+ EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock);
+
+ rc = kevent(group->fd, &event, 1, NULL, 0, &ts);
+#endif
+
+ /* switched from another polling group due to scheduling */
+ if (spdk_unlikely(sock->recv_pipe != NULL &&
+ (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) {
+ assert(sock->pending_recv == false);
+ sock->pending_recv = true;
+ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+ }
+
+ return rc;
+}
+
+static int
+posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock)
+{
+ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
+ struct spdk_posix_sock *sock = __posix_sock(_sock);
+ int rc;
+
+ if (sock->recv_pipe != NULL) {
+ if (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0) {
+ TAILQ_REMOVE(&group->pending_recv, sock, link);
+ sock->pending_recv = false;
+ }
+ assert(sock->pending_recv == false);
+ }
+
+#if defined(__linux__)
+ struct epoll_event event;
+
+	/* The event parameter is ignored, but some old kernel versions still require it. */
+ rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event);
+#elif defined(__FreeBSD__)
+ struct kevent event;
+ struct timespec ts = {0};
+
+ EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+
+ rc = kevent(group->fd, &event, 1, NULL, 0, &ts);
+ if (rc == 0 && event.flags & EV_ERROR) {
+ rc = -1;
+ errno = event.data;
+ }
+#endif
+
+ spdk_sock_abort_requests(_sock);
+
+ return rc;
+}
+
+static int
+posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
+ struct spdk_sock **socks)
+{
+ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
+ struct spdk_sock *sock, *tmp;
+ int num_events, i, rc;
+ struct spdk_posix_sock *psock, *ptmp;
+#if defined(__linux__)
+ struct epoll_event events[MAX_EVENTS_PER_POLL];
+#elif defined(__FreeBSD__)
+ struct kevent events[MAX_EVENTS_PER_POLL];
+ struct timespec ts = {0};
+#endif
+
+ /* This must be a TAILQ_FOREACH_SAFE because while flushing,
+ * a completion callback could remove the sock from the
+ * group. */
+ TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) {
+ rc = _sock_flush(sock);
+ if (rc) {
+ spdk_sock_abort_requests(sock);
+ }
+ }
+
+#if defined(__linux__)
+ num_events = epoll_wait(group->fd, events, max_events, 0);
+#elif defined(__FreeBSD__)
+ num_events = kevent(group->fd, NULL, 0, events, max_events, &ts);
+#endif
+
+ if (num_events == -1) {
+ return -1;
+ } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) {
+ uint8_t byte;
+
+ sock = TAILQ_FIRST(&_group->socks);
+ psock = __posix_sock(sock);
+		/* A recv is done here to busy poll the queue associated with the
+		 * first socket in the list and potentially reap incoming data.
+		 */
+ if (psock->so_priority) {
+ recv(psock->fd, &byte, 1, MSG_PEEK);
+ }
+ }
+
+ for (i = 0; i < num_events; i++) {
+#if defined(__linux__)
+ sock = events[i].data.ptr;
+ psock = __posix_sock(sock);
+
+#ifdef SPDK_ZEROCOPY
+ if (events[i].events & EPOLLERR) {
+ rc = _sock_check_zcopy(sock);
+ /* If the socket was closed or removed from
+ * the group in response to a send ack, don't
+ * add it to the array here. */
+ if (rc || sock->cb_fn == NULL) {
+ continue;
+ }
+ }
+#endif
+ if ((events[i].events & EPOLLIN) == 0) {
+ continue;
+ }
+
+#elif defined(__FreeBSD__)
+ sock = events[i].udata;
+ psock = __posix_sock(sock);
+#endif
+
+ /* If the socket does not already have recv pending, add it now */
+ if (!psock->pending_recv) {
+ psock->pending_recv = true;
+ TAILQ_INSERT_TAIL(&group->pending_recv, psock, link);
+ }
+ }
+
+ num_events = 0;
+
+ TAILQ_FOREACH_SAFE(psock, &group->pending_recv, link, ptmp) {
+ if (num_events == max_events) {
+ break;
+ }
+
+ socks[num_events++] = &psock->base;
+ }
+
+ /* Cycle the pending_recv list so that each time we poll things aren't
+ * in the same order. */
+ for (i = 0; i < num_events; i++) {
+ psock = __posix_sock(socks[i]);
+
+ TAILQ_REMOVE(&group->pending_recv, psock, link);
+
+ if (psock->recv_pipe == NULL || spdk_pipe_reader_bytes_available(psock->recv_pipe) == 0) {
+ psock->pending_recv = false;
+ } else {
+ TAILQ_INSERT_TAIL(&group->pending_recv, psock, link);
+ }
+
+ }
+
+ return num_events;
+}
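+
+/* Illustrative sketch, not compiled into the module: the minimal Linux epoll
+ * pattern the poller above builds on - register each fd with a back-pointer in
+ * data.ptr, then recover the socket from every returned event. The pending_recv
+ * list exists on top of this because epoll cannot see data already buffered in
+ * a socket's user-space recv_pipe. */
+#if 0
+static void
+epoll_pattern_sketch(int epfd, struct spdk_posix_sock *sock)
+{
+	struct epoll_event ev = { .events = EPOLLIN, .data.ptr = sock };
+	struct epoll_event out[16];
+	int i, n;
+
+	epoll_ctl(epfd, EPOLL_CTL_ADD, sock->fd, &ev);
+
+	n = epoll_wait(epfd, out, 16, 0); /* timeout 0: poll, don't block */
+	for (i = 0; i < n; i++) {
+		struct spdk_posix_sock *ready = out[i].data.ptr;
+		/* ... append `ready` to the pending_recv list ... */
+		(void)ready;
+	}
+}
+#endif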
+
+static int
+posix_sock_group_impl_close(struct spdk_sock_group_impl *_group)
+{
+ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group);
+ int rc;
+
+ rc = close(group->fd);
+ free(group);
+ return rc;
+}
+
+static int
+posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len)
+{
+ if (!opts || !len) {
+ errno = EINVAL;
+ return -1;
+ }
+
+#define FIELD_OK(field) \
+ offsetof(struct spdk_sock_impl_opts, field) + sizeof(opts->field) <= *len
+
+#define GET_FIELD(field) \
+ if (FIELD_OK(field)) { \
+ opts->field = g_spdk_posix_sock_impl_opts.field; \
+ }
+
+ GET_FIELD(recv_buf_size);
+ GET_FIELD(send_buf_size);
+ GET_FIELD(enable_recv_pipe);
+ GET_FIELD(enable_zerocopy_send);
+
+#undef GET_FIELD
+#undef FIELD_OK
+
+ *len = spdk_min(*len, sizeof(g_spdk_posix_sock_impl_opts));
+ return 0;
+}
+
+static int
+posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len)
+{
+ if (!opts) {
+ errno = EINVAL;
+ return -1;
+ }
+
+#define FIELD_OK(field) \
+ offsetof(struct spdk_sock_impl_opts, field) + sizeof(opts->field) <= len
+
+#define SET_FIELD(field) \
+ if (FIELD_OK(field)) { \
+ g_spdk_posix_sock_impl_opts.field = opts->field; \
+ }
+
+ SET_FIELD(recv_buf_size);
+ SET_FIELD(send_buf_size);
+ SET_FIELD(enable_recv_pipe);
+ SET_FIELD(enable_zerocopy_send);
+
+#undef SET_FIELD
+#undef FIELD_OK
+
+ return 0;
+}
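+
+/* Illustrative sketch, not compiled into the module: how the FIELD_OK() length
+ * checks above let callers built against an older, shorter spdk_sock_impl_opts
+ * still work. It assumes the public spdk_sock_impl_get_opts()/_set_opts()
+ * wrappers from spdk/sock.h, which dispatch to the handlers above. */
+#if 0
+static void
+opts_usage_sketch(void)
+{
+	struct spdk_sock_impl_opts opts;
+	size_t len = sizeof(opts);
+
+	/* Only the fields that fit within `len` are filled in / applied. */
+	if (spdk_sock_impl_get_opts("posix", &opts, &len) == 0) {
+		opts.enable_zerocopy_send = false;
+		spdk_sock_impl_set_opts("posix", &opts, len);
+	}
+}
+#endif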
+
+static struct spdk_net_impl g_posix_net_impl = {
+ .name = "posix",
+ .getaddr = posix_sock_getaddr,
+ .connect = posix_sock_connect,
+ .listen = posix_sock_listen,
+ .accept = posix_sock_accept,
+ .close = posix_sock_close,
+ .recv = posix_sock_recv,
+ .readv = posix_sock_readv,
+ .writev = posix_sock_writev,
+ .writev_async = posix_sock_writev_async,
+ .flush = posix_sock_flush,
+ .set_recvlowat = posix_sock_set_recvlowat,
+ .set_recvbuf = posix_sock_set_recvbuf,
+ .set_sendbuf = posix_sock_set_sendbuf,
+ .is_ipv6 = posix_sock_is_ipv6,
+ .is_ipv4 = posix_sock_is_ipv4,
+ .is_connected = posix_sock_is_connected,
+ .get_placement_id = posix_sock_get_placement_id,
+ .group_impl_create = posix_sock_group_impl_create,
+ .group_impl_add_sock = posix_sock_group_impl_add_sock,
+ .group_impl_remove_sock = posix_sock_group_impl_remove_sock,
+ .group_impl_poll = posix_sock_group_impl_poll,
+ .group_impl_close = posix_sock_group_impl_close,
+ .get_opts = posix_sock_impl_get_opts,
+ .set_opts = posix_sock_impl_set_opts,
+};
+
+SPDK_NET_IMPL_REGISTER(posix, &g_posix_net_impl, DEFAULT_SOCK_PRIORITY);
diff --git a/src/spdk/module/sock/uring/Makefile b/src/spdk/module/sock/uring/Makefile
new file mode 100644
index 000000000..2d0e7c4e2
--- /dev/null
+++ b/src/spdk/module/sock/uring/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 1
+SO_MINOR := 0
+
+LIBNAME = sock_uring
+C_SRCS = uring.c
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/sock/uring/uring.c b/src/spdk/module/sock/uring/uring.c
new file mode 100644
index 000000000..3066f2d16
--- /dev/null
+++ b/src/spdk/module/sock/uring/uring.c
@@ -0,0 +1,1328 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/config.h"
+
+#include <sys/epoll.h>
+#include <liburing.h>
+
+#include "spdk/barrier.h"
+#include "spdk/likely.h"
+#include "spdk/log.h"
+#include "spdk/pipe.h"
+#include "spdk/sock.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/sock.h"
+#include "spdk_internal/assert.h"
+
+#define MAX_TMPBUF 1024
+#define PORTNUMLEN 32
+#define SO_RCVBUF_SIZE (2 * 1024 * 1024)
+#define SO_SNDBUF_SIZE (2 * 1024 * 1024)
+#define SPDK_SOCK_GROUP_QUEUE_DEPTH 4096
+#define IOV_BATCH_SIZE 64
+
+enum spdk_sock_task_type {
+ SPDK_SOCK_TASK_POLLIN = 0,
+ SPDK_SOCK_TASK_WRITE,
+ SPDK_SOCK_TASK_CANCEL,
+};
+
+enum spdk_uring_sock_task_status {
+ SPDK_URING_SOCK_TASK_NOT_IN_USE = 0,
+ SPDK_URING_SOCK_TASK_IN_PROCESS,
+};
+
+struct spdk_uring_task {
+ enum spdk_uring_sock_task_status status;
+ enum spdk_sock_task_type type;
+ struct spdk_uring_sock *sock;
+ struct msghdr msg;
+ struct iovec iovs[IOV_BATCH_SIZE];
+ int iov_cnt;
+ struct spdk_sock_request *last_req;
+ STAILQ_ENTRY(spdk_uring_task) link;
+};
+
+struct spdk_uring_sock {
+ struct spdk_sock base;
+ int fd;
+ struct spdk_uring_sock_group_impl *group;
+ struct spdk_uring_task write_task;
+ struct spdk_uring_task pollin_task;
+ struct spdk_uring_task cancel_task;
+ struct spdk_pipe *recv_pipe;
+ void *recv_buf;
+ int recv_buf_sz;
+ bool pending_recv;
+ int connection_status;
+ TAILQ_ENTRY(spdk_uring_sock) link;
+};
+
+struct spdk_uring_sock_group_impl {
+ struct spdk_sock_group_impl base;
+ struct io_uring uring;
+ uint32_t io_inflight;
+ uint32_t io_queued;
+ uint32_t io_avail;
+ TAILQ_HEAD(, spdk_uring_sock) pending_recv;
+};
+
+#define SPDK_URING_SOCK_REQUEST_IOV(req) ((struct iovec *)((uint8_t *)req + sizeof(struct spdk_sock_request)))
+
+static int
+get_addr_str(struct sockaddr *sa, char *host, size_t hlen)
+{
+ const char *result = NULL;
+
+ if (sa == NULL || host == NULL) {
+ return -1;
+ }
+
+ switch (sa->sa_family) {
+ case AF_INET:
+ result = inet_ntop(AF_INET, &(((struct sockaddr_in *)sa)->sin_addr),
+ host, hlen);
+ break;
+ case AF_INET6:
+ result = inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)sa)->sin6_addr),
+ host, hlen);
+ break;
+ default:
+ break;
+ }
+
+ if (result != NULL) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+#define __uring_sock(sock) (struct spdk_uring_sock *)sock
+#define __uring_group_impl(group) (struct spdk_uring_sock_group_impl *)group
+
+static int
+uring_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport,
+ char *caddr, int clen, uint16_t *cport)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ int rc;
+
+ assert(sock != NULL);
+
+ memset(&sa, 0, sizeof sa);
+ salen = sizeof sa;
+ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+ return -1;
+ }
+
+ switch (sa.ss_family) {
+ case AF_UNIX:
+ /* Acceptable connection types that don't have IPs */
+ return 0;
+ case AF_INET:
+ case AF_INET6:
+ /* Code below will get IP addresses */
+ break;
+ default:
+ /* Unsupported socket family */
+ return -1;
+ }
+
+ rc = get_addr_str((struct sockaddr *)&sa, saddr, slen);
+ if (rc != 0) {
+		SPDK_ERRLOG("get_addr_str() failed (errno=%d)\n", errno);
+ return -1;
+ }
+
+ if (sport) {
+ if (sa.ss_family == AF_INET) {
+ *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
+ } else if (sa.ss_family == AF_INET6) {
+ *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
+ }
+ }
+
+ memset(&sa, 0, sizeof sa);
+ salen = sizeof sa;
+ rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno);
+ return -1;
+ }
+
+ rc = get_addr_str((struct sockaddr *)&sa, caddr, clen);
+ if (rc != 0) {
+		SPDK_ERRLOG("get_addr_str() failed (errno=%d)\n", errno);
+ return -1;
+ }
+
+ if (cport) {
+ if (sa.ss_family == AF_INET) {
+ *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port);
+ } else if (sa.ss_family == AF_INET6) {
+ *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port);
+ }
+ }
+
+ return 0;
+}
+
+enum uring_sock_create_type {
+ SPDK_SOCK_CREATE_LISTEN,
+ SPDK_SOCK_CREATE_CONNECT,
+};
+
+static int
+uring_sock_alloc_pipe(struct spdk_uring_sock *sock, int sz)
+{
+ uint8_t *new_buf;
+ struct spdk_pipe *new_pipe;
+ struct iovec siov[2];
+ struct iovec diov[2];
+ int sbytes;
+ ssize_t bytes;
+
+ if (sock->recv_buf_sz == sz) {
+ return 0;
+ }
+
+ /* If the new size is 0, just free the pipe */
+ if (sz == 0) {
+ spdk_pipe_destroy(sock->recv_pipe);
+ free(sock->recv_buf);
+ sock->recv_pipe = NULL;
+ sock->recv_buf = NULL;
+ return 0;
+ } else if (sz < MIN_SOCK_PIPE_SIZE) {
+		SPDK_ERRLOG("The size of the pipe must be at least %d\n", MIN_SOCK_PIPE_SIZE);
+ return -1;
+ }
+
+ /* Round up to next 64 byte multiple */
+ new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t));
+ if (!new_buf) {
+ SPDK_ERRLOG("socket recv buf allocation failed\n");
+ return -ENOMEM;
+ }
+
+ new_pipe = spdk_pipe_create(new_buf, sz + 1);
+ if (new_pipe == NULL) {
+ SPDK_ERRLOG("socket pipe allocation failed\n");
+ free(new_buf);
+ return -ENOMEM;
+ }
+
+ if (sock->recv_pipe != NULL) {
+ /* Pull all of the data out of the old pipe */
+ sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
+ if (sbytes > sz) {
+ /* Too much data to fit into the new pipe size */
+ spdk_pipe_destroy(new_pipe);
+ free(new_buf);
+ return -EINVAL;
+ }
+
+ sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov);
+ assert(sbytes == sz);
+
+ bytes = spdk_iovcpy(siov, 2, diov, 2);
+ spdk_pipe_writer_advance(new_pipe, bytes);
+
+ spdk_pipe_destroy(sock->recv_pipe);
+ free(sock->recv_buf);
+ }
+
+ sock->recv_buf_sz = sz;
+ sock->recv_buf = new_buf;
+ sock->recv_pipe = new_pipe;
+
+ return 0;
+}
+
+static int
+uring_sock_set_recvbuf(struct spdk_sock *_sock, int sz)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ int rc;
+
+ assert(sock != NULL);
+
+#ifndef __aarch64__
+	/* On ARM systems this buffering does not help, so it is skipped there. The
+	 * size of the pipe is purely derived from benchmarks; it seems to work well. */
+ rc = uring_sock_alloc_pipe(sock, sz);
+ if (rc) {
+ SPDK_ERRLOG("unable to allocate sufficient recvbuf with sz=%d on sock=%p\n", sz, _sock);
+ return rc;
+ }
+#endif
+
+ if (sz < SO_RCVBUF_SIZE) {
+ sz = SO_RCVBUF_SIZE;
+ }
+
+ rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
+ if (rc < 0) {
+ return rc;
+ }
+
+ return 0;
+}
+
+static int
+uring_sock_set_sendbuf(struct spdk_sock *_sock, int sz)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ int rc;
+
+ assert(sock != NULL);
+
+ if (sz < SO_SNDBUF_SIZE) {
+ sz = SO_SNDBUF_SIZE;
+ }
+
+ rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz));
+ if (rc < 0) {
+ return rc;
+ }
+
+ return 0;
+}
+
+static struct spdk_uring_sock *
+uring_sock_alloc(int fd)
+{
+ struct spdk_uring_sock *sock;
+
+ sock = calloc(1, sizeof(*sock));
+ if (sock == NULL) {
+ SPDK_ERRLOG("sock allocation failed\n");
+ return NULL;
+ }
+
+ sock->fd = fd;
+ return sock;
+}
+
+static struct spdk_sock *
+uring_sock_create(const char *ip, int port,
+ enum uring_sock_create_type type,
+ struct spdk_sock_opts *opts)
+{
+ struct spdk_uring_sock *sock;
+ char buf[MAX_TMPBUF];
+ char portnum[PORTNUMLEN];
+ char *p;
+ struct addrinfo hints, *res, *res0;
+ int fd, flag;
+ int val = 1;
+ int rc;
+
+ if (ip == NULL) {
+ return NULL;
+ }
+ if (ip[0] == '[') {
+ snprintf(buf, sizeof(buf), "%s", ip + 1);
+ p = strchr(buf, ']');
+ if (p != NULL) {
+ *p = '\0';
+ }
+ ip = (const char *) &buf[0];
+ }
+
+ snprintf(portnum, sizeof portnum, "%d", port);
+ memset(&hints, 0, sizeof hints);
+ hints.ai_family = PF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_flags = AI_NUMERICSERV;
+ hints.ai_flags |= AI_PASSIVE;
+ hints.ai_flags |= AI_NUMERICHOST;
+ rc = getaddrinfo(ip, portnum, &hints, &res0);
+ if (rc != 0) {
+		SPDK_ERRLOG("getaddrinfo() failed: %s (%d)\n", gai_strerror(rc), rc);
+ return NULL;
+ }
+
+	/* try each resolved address until one can be set up */
+ fd = -1;
+ for (res = res0; res != NULL; res = res->ai_next) {
+retry:
+ fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
+ if (fd < 0) {
+ /* error */
+ continue;
+ }
+
+ val = SO_RCVBUF_SIZE;
+ rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof val);
+ if (rc) {
+ /* Not fatal */
+ }
+
+ val = SO_SNDBUF_SIZE;
+ rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof val);
+ if (rc) {
+ /* Not fatal */
+ }
+
+ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val);
+ if (rc != 0) {
+ close(fd);
+ /* error */
+ continue;
+ }
+ rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val);
+ if (rc != 0) {
+ close(fd);
+ /* error */
+ continue;
+ }
+
+#if defined(SO_PRIORITY)
+ if (opts != NULL && opts->priority) {
+ rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val);
+ if (rc != 0) {
+ close(fd);
+ /* error */
+ continue;
+ }
+ }
+#endif
+ if (res->ai_family == AF_INET6) {
+ rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val);
+ if (rc != 0) {
+ close(fd);
+ /* error */
+ continue;
+ }
+ }
+
+ if (type == SPDK_SOCK_CREATE_LISTEN) {
+ rc = bind(fd, res->ai_addr, res->ai_addrlen);
+ if (rc != 0) {
+ SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno);
+ switch (errno) {
+ case EINTR:
+					/* interrupted by a signal; retry */
+ close(fd);
+ goto retry;
+ case EADDRNOTAVAIL:
+ SPDK_ERRLOG("IP address %s not available. "
+ "Verify IP address in config file "
+ "and make sure setup script is "
+ "run before starting spdk app.\n", ip);
+ /* FALLTHROUGH */
+ default:
+ /* try next family */
+ close(fd);
+ fd = -1;
+ continue;
+ }
+ }
+ /* bind OK */
+ rc = listen(fd, 512);
+ if (rc != 0) {
+ SPDK_ERRLOG("listen() failed, errno = %d\n", errno);
+ close(fd);
+ fd = -1;
+ break;
+ }
+ } else if (type == SPDK_SOCK_CREATE_CONNECT) {
+ rc = connect(fd, res->ai_addr, res->ai_addrlen);
+ if (rc != 0) {
+ SPDK_ERRLOG("connect() failed, errno = %d\n", errno);
+ /* try next family */
+ close(fd);
+ fd = -1;
+ continue;
+ }
+ }
+
+ flag = fcntl(fd, F_GETFL);
+ if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
+ close(fd);
+ fd = -1;
+ break;
+ }
+ break;
+ }
+ freeaddrinfo(res0);
+
+ if (fd < 0) {
+ return NULL;
+ }
+
+ sock = uring_sock_alloc(fd);
+ if (sock == NULL) {
+ SPDK_ERRLOG("sock allocation failed\n");
+ close(fd);
+ return NULL;
+ }
+
+ return &sock->base;
+}
+
+static struct spdk_sock *
+uring_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts)
+{
+ return uring_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts);
+}
+
+static struct spdk_sock *
+uring_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts)
+{
+ return uring_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts);
+}
+
+static struct spdk_sock *
+uring_sock_accept(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ int rc, fd;
+ struct spdk_uring_sock *new_sock;
+ int flag;
+
+ memset(&sa, 0, sizeof(sa));
+ salen = sizeof(sa);
+
+ assert(sock != NULL);
+
+ rc = accept(sock->fd, (struct sockaddr *)&sa, &salen);
+
+ if (rc == -1) {
+ return NULL;
+ }
+
+ fd = rc;
+
+ flag = fcntl(fd, F_GETFL);
+ if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno);
+ close(fd);
+ return NULL;
+ }
+
+#if defined(SO_PRIORITY)
+	/* SO_PRIORITY is not inherited, so set it explicitly on the accepted fd */
+ if (sock->base.opts.priority) {
+ rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int));
+ if (rc != 0) {
+ close(fd);
+ return NULL;
+ }
+ }
+#endif
+
+ new_sock = uring_sock_alloc(fd);
+ if (new_sock == NULL) {
+ close(fd);
+ return NULL;
+ }
+
+ return &new_sock->base;
+}
+
+static int
+uring_sock_close(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ int rc;
+
+ assert(TAILQ_EMPTY(&_sock->pending_reqs));
+ assert(sock->group == NULL);
+
+ spdk_pipe_destroy(sock->recv_pipe);
+ free(sock->recv_buf);
+ rc = close(sock->fd);
+ if (rc == 0) {
+ free(sock);
+ }
+
+ return rc;
+}
+
+static ssize_t
+uring_sock_recv_from_pipe(struct spdk_uring_sock *sock, struct iovec *diov, int diovcnt)
+{
+ struct iovec siov[2];
+ int sbytes;
+ ssize_t bytes;
+ struct spdk_uring_sock_group_impl *group;
+
+ sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov);
+ if (sbytes < 0) {
+ errno = EINVAL;
+ return -1;
+ } else if (sbytes == 0) {
+ errno = EAGAIN;
+ return -1;
+ }
+
+ bytes = spdk_iovcpy(siov, 2, diov, diovcnt);
+
+ if (bytes == 0) {
+ /* The only way this happens is if diov is 0 length */
+ errno = EINVAL;
+ return -1;
+ }
+
+ spdk_pipe_reader_advance(sock->recv_pipe, bytes);
+
+ /* If we drained the pipe, take it off the level-triggered list */
+ if (sock->base.group_impl && spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
+ group = __uring_group_impl(sock->base.group_impl);
+ TAILQ_REMOVE(&group->pending_recv, sock, link);
+ sock->pending_recv = false;
+ }
+
+ return bytes;
+}
+
+static inline ssize_t
+uring_sock_read(struct spdk_uring_sock *sock)
+{
+ struct iovec iov[2];
+ int bytes;
+ struct spdk_uring_sock_group_impl *group;
+
+ bytes = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov);
+
+ if (bytes > 0) {
+ bytes = readv(sock->fd, iov, 2);
+ if (bytes > 0) {
+ spdk_pipe_writer_advance(sock->recv_pipe, bytes);
+ if (sock->base.group_impl) {
+ group = __uring_group_impl(sock->base.group_impl);
+ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+ sock->pending_recv = true;
+ }
+ }
+ }
+
+ return bytes;
+}
+
+static ssize_t
+uring_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ int rc, i;
+ size_t len;
+
+ if (sock->recv_pipe == NULL) {
+ return readv(sock->fd, iov, iovcnt);
+ }
+
+ len = 0;
+ for (i = 0; i < iovcnt; i++) {
+ len += iov[i].iov_len;
+ }
+
+ if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
+ /* If the user is receiving a sufficiently large amount of data,
+ * receive directly to their buffers. */
+ if (len >= MIN_SOCK_PIPE_SIZE) {
+ return readv(sock->fd, iov, iovcnt);
+ }
+
+ /* Otherwise, do a big read into our pipe */
+ rc = uring_sock_read(sock);
+ if (rc <= 0) {
+ return rc;
+ }
+ }
+
+ return uring_sock_recv_from_pipe(sock, iov, iovcnt);
+}
+
+static ssize_t
+uring_sock_recv(struct spdk_sock *sock, void *buf, size_t len)
+{
+ struct iovec iov[1];
+
+ iov[0].iov_base = buf;
+ iov[0].iov_len = len;
+
+ return uring_sock_readv(sock, iov, 1);
+}
+
+static ssize_t
+uring_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+
+ if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
+ errno = EAGAIN;
+ return -1;
+ }
+
+ return writev(sock->fd, iov, iovcnt);
+}
+
+static int
+sock_prep_reqs(struct spdk_sock *_sock, struct iovec *iovs, int index,
+ struct spdk_sock_request **last_req)
+{
+ int iovcnt, i;
+ struct spdk_sock_request *req;
+ unsigned int offset;
+
+ /* Gather an iov */
+ iovcnt = index;
+ if (spdk_unlikely(iovcnt >= IOV_BATCH_SIZE)) {
+ goto end;
+ }
+
+ if (last_req != NULL && *last_req != NULL) {
+ req = TAILQ_NEXT(*last_req, internal.link);
+ } else {
+ req = TAILQ_FIRST(&_sock->queued_reqs);
+ }
+
+ while (req) {
+ offset = req->internal.offset;
+
+ for (i = 0; i < req->iovcnt; i++) {
+ /* Consume any offset first */
+ if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
+ offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
+ continue;
+ }
+
+ iovs[iovcnt].iov_base = SPDK_SOCK_REQUEST_IOV(req, i)->iov_base + offset;
+ iovs[iovcnt].iov_len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
+ iovcnt++;
+
+ offset = 0;
+
+ if (iovcnt >= IOV_BATCH_SIZE) {
+ break;
+ }
+ }
+ if (iovcnt >= IOV_BATCH_SIZE) {
+ break;
+ }
+
+ if (last_req != NULL) {
+ *last_req = req;
+ }
+ req = TAILQ_NEXT(req, internal.link);
+ }
+
+end:
+ return iovcnt;
+}
+
+static int
+sock_complete_reqs(struct spdk_sock *_sock, ssize_t rc)
+{
+ struct spdk_sock_request *req;
+ int i, retval;
+ unsigned int offset;
+ size_t len;
+
+ /* Consume the requests that were actually written */
+ req = TAILQ_FIRST(&_sock->queued_reqs);
+ while (req) {
+ offset = req->internal.offset;
+
+ for (i = 0; i < req->iovcnt; i++) {
+ /* Advance by the offset first */
+ if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
+ offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
+ continue;
+ }
+
+ /* Calculate the remaining length of this element */
+ len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
+
+ if (len > (size_t)rc) {
+ /* This element was partially sent. */
+ req->internal.offset += rc;
+ return 0;
+ }
+
+ offset = 0;
+ req->internal.offset += len;
+ rc -= len;
+ }
+
+ /* Handled a full request. */
+ spdk_sock_request_pend(_sock, req);
+
+ retval = spdk_sock_request_put(_sock, req, 0);
+ if (retval) {
+ return retval;
+ }
+
+ if (rc == 0) {
+ break;
+ }
+
+ req = TAILQ_FIRST(&_sock->queued_reqs);
+ }
+
+ return 0;
+}
+
+static void
+_sock_flush(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct spdk_uring_task *task = &sock->write_task;
+ uint32_t iovcnt;
+ struct io_uring_sqe *sqe;
+
+ if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
+ return;
+ }
+
+ iovcnt = sock_prep_reqs(&sock->base, task->iovs, task->iov_cnt, &task->last_req);
+ if (!iovcnt) {
+ return;
+ }
+
+ task->iov_cnt = iovcnt;
+ assert(sock->group != NULL);
+ task->msg.msg_iov = task->iovs;
+ task->msg.msg_iovlen = task->iov_cnt;
+
+ sock->group->io_queued++;
+
+ sqe = io_uring_get_sqe(&sock->group->uring);
+ io_uring_prep_sendmsg(sqe, sock->fd, &sock->write_task.msg, 0);
+ io_uring_sqe_set_data(sqe, task);
+ task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
+}
+
+static void
+_sock_prep_pollin(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct spdk_uring_task *task = &sock->pollin_task;
+ struct io_uring_sqe *sqe;
+
+	/* Do not prepare a pollin event if one is already in flight or the socket already has data pending */
+ if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS || sock->pending_recv) {
+ return;
+ }
+
+ assert(sock->group != NULL);
+ sock->group->io_queued++;
+
+ sqe = io_uring_get_sqe(&sock->group->uring);
+ io_uring_prep_poll_add(sqe, sock->fd, POLLIN);
+ io_uring_sqe_set_data(sqe, task);
+ task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
+}
+
+static void
+_sock_prep_cancel_task(struct spdk_sock *_sock, void *user_data)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct spdk_uring_task *task = &sock->cancel_task;
+ struct io_uring_sqe *sqe;
+
+ if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) {
+ return;
+ }
+
+ assert(sock->group != NULL);
+ sock->group->io_queued++;
+
+ sqe = io_uring_get_sqe(&sock->group->uring);
+ io_uring_prep_cancel(sqe, user_data, 0);
+ io_uring_sqe_set_data(sqe, task);
+ task->status = SPDK_URING_SOCK_TASK_IN_PROCESS;
+}
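+
+/* Illustrative sketch, not compiled into the module: the liburing submit and
+ * complete round trip behind the three task types above, without the group's
+ * queue-depth bookkeeping. user_data links each completion back to its task. */
+#if 0
+static void
+uring_round_trip_sketch(struct io_uring *ring, int fd, struct msghdr *msg)
+{
+	struct io_uring_sqe *sqe;
+	struct io_uring_cqe *cqe;
+
+	sqe = io_uring_get_sqe(ring);
+	io_uring_prep_sendmsg(sqe, fd, msg, 0);
+	io_uring_sqe_set_data(sqe, msg); /* recovered below via cqe->user_data */
+
+	io_uring_submit(ring);
+
+	if (io_uring_peek_cqe(ring, &cqe) == 0 && cqe != NULL) {
+		/* cqe->res is the byte count on success or -errno on failure */
+		io_uring_cqe_seen(ring, cqe);
+	}
+}
+#endif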
+
+static int
+sock_uring_group_reap(struct spdk_uring_sock_group_impl *group, int max, int max_read_events,
+ struct spdk_sock **socks)
+{
+ int i, count, ret;
+ struct io_uring_cqe *cqe;
+ struct spdk_uring_sock *sock, *tmp;
+ struct spdk_uring_task *task;
+ int status;
+
+ for (i = 0; i < max; i++) {
+ ret = io_uring_peek_cqe(&group->uring, &cqe);
+ if (ret != 0) {
+ break;
+ }
+
+ if (cqe == NULL) {
+ break;
+ }
+
+ task = (struct spdk_uring_task *)cqe->user_data;
+ assert(task != NULL);
+ sock = task->sock;
+ assert(sock != NULL);
+ assert(sock->group != NULL);
+ assert(sock->group == group);
+ sock->group->io_inflight--;
+ sock->group->io_avail++;
+ status = cqe->res;
+ io_uring_cqe_seen(&group->uring, cqe);
+
+ task->status = SPDK_URING_SOCK_TASK_NOT_IN_USE;
+
+ if (spdk_unlikely(status <= 0)) {
+ if (status == -EAGAIN || status == -EWOULDBLOCK) {
+ continue;
+ }
+ }
+
+ switch (task->type) {
+ case SPDK_SOCK_TASK_POLLIN:
+ if ((status & POLLIN) == POLLIN) {
+ if (sock->base.cb_fn != NULL) {
+ assert(sock->pending_recv == false);
+ sock->pending_recv = true;
+ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+ }
+ }
+ break;
+ case SPDK_SOCK_TASK_WRITE:
+ assert(TAILQ_EMPTY(&sock->base.pending_reqs));
+ task->last_req = NULL;
+ task->iov_cnt = 0;
+			if (spdk_unlikely(status < 0)) {
+ sock->connection_status = status;
+ spdk_sock_abort_requests(&sock->base);
+ } else {
+ sock_complete_reqs(&sock->base, status);
+ }
+
+ break;
+ case SPDK_SOCK_TASK_CANCEL:
+ /* Do nothing */
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ }
+ }
+
+ if (!socks) {
+ return 0;
+ }
+ count = 0;
+ TAILQ_FOREACH_SAFE(sock, &group->pending_recv, link, tmp) {
+ if (count == max_read_events) {
+ break;
+ }
+
+ socks[count++] = &sock->base;
+ }
+
+ /* Cycle the pending_recv list so that each time we poll things aren't
+ * in the same order. */
+ for (i = 0; i < count; i++) {
+ sock = __uring_sock(socks[i]);
+
+ TAILQ_REMOVE(&group->pending_recv, sock, link);
+
+ if (sock->recv_pipe == NULL || spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) {
+ sock->pending_recv = false;
+ } else {
+ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+ }
+ }
+
+ return count;
+}
+
+static int
+_sock_flush_client(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct msghdr msg = {};
+ struct iovec iovs[IOV_BATCH_SIZE];
+ int iovcnt;
+ ssize_t rc;
+
+ /* Can't flush from within a callback or we end up with recursive calls */
+ if (_sock->cb_cnt > 0) {
+ return 0;
+ }
+
+ /* Gather an iov */
+ iovcnt = sock_prep_reqs(_sock, iovs, 0, NULL);
+ if (iovcnt == 0) {
+ return 0;
+ }
+
+ /* Perform the vectored write */
+ msg.msg_iov = iovs;
+ msg.msg_iovlen = iovcnt;
+ rc = sendmsg(sock->fd, &msg, 0);
+ if (rc <= 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return 0;
+ }
+ return rc;
+ }
+
+ sock_complete_reqs(_sock, rc);
+
+ return 0;
+}
+
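+/* Async writes are only queued here; grouped socks are flushed from the
+ * group poll loop, while ungrouped socks fall back to the synchronous
+ * client flush once enough iovecs have accumulated to batch. */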
+static void
+uring_sock_writev_async(struct spdk_sock *_sock, struct spdk_sock_request *req)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ int rc;
+
+ if (spdk_unlikely(sock->connection_status)) {
+ req->cb_fn(req->cb_arg, sock->connection_status);
+ return;
+ }
+
+ spdk_sock_request_queue(_sock, req);
+
+ if (!sock->group) {
+ if (_sock->queued_iovcnt >= IOV_BATCH_SIZE) {
+ rc = _sock_flush_client(_sock);
+ if (rc) {
+ spdk_sock_abort_requests(_sock);
+ }
+ }
+ }
+}
+
+static int
+uring_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ int val;
+ int rc;
+
+ assert(sock != NULL);
+
+ val = nbytes;
+ rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val);
+ if (rc != 0) {
+ return -1;
+ }
+ return 0;
+}
+
+static bool
+uring_sock_is_ipv6(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ int rc;
+
+ assert(sock != NULL);
+
+ memset(&sa, 0, sizeof sa);
+ salen = sizeof sa;
+ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+ return false;
+ }
+
+ return (sa.ss_family == AF_INET6);
+}
+
+static bool
+uring_sock_is_ipv4(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct sockaddr_storage sa;
+ socklen_t salen;
+ int rc;
+
+ assert(sock != NULL);
+
+ memset(&sa, 0, sizeof sa);
+ salen = sizeof sa;
+ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno);
+ return false;
+ }
+
+ return (sa.ss_family == AF_INET);
+}
+
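+/* Peek one byte without consuming it: recv() returning 0 means the peer
+ * performed an orderly shutdown, while EAGAIN/EWOULDBLOCK means the
+ * connection is alive but currently has no data to read. */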
+static bool
+uring_sock_is_connected(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ uint8_t byte;
+ int rc;
+
+ rc = recv(sock->fd, &byte, 1, MSG_PEEK);
+ if (rc == 0) {
+ return false;
+ }
+
+ if (rc < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return true;
+ }
+
+ return false;
+ }
+
+ return true;
+}
+
+static int
+uring_sock_get_placement_id(struct spdk_sock *_sock, int *placement_id)
+{
+ int rc = -1;
+
+#if defined(SO_INCOMING_NAPI_ID)
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ socklen_t salen = sizeof(int);
+
+ rc = getsockopt(sock->fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, placement_id, &salen);
+ if (rc != 0) {
+ SPDK_ERRLOG("getsockopt() failed (errno=%d)\n", errno);
+ }
+
+#endif
+ return rc;
+}
+
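+/* Each polling group owns one io_uring instance. The io_avail, io_queued
+ * and io_inflight counters track queue-depth credits: prepping an SQE
+ * increments io_queued, submission converts queued entries into
+ * io_inflight (consuming io_avail), and reaping a CQE returns the credit
+ * to io_avail. */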
+static struct spdk_sock_group_impl *
+uring_sock_group_impl_create(void)
+{
+ struct spdk_uring_sock_group_impl *group_impl;
+
+ group_impl = calloc(1, sizeof(*group_impl));
+ if (group_impl == NULL) {
+ SPDK_ERRLOG("group_impl allocation failed\n");
+ return NULL;
+ }
+
+ group_impl->io_avail = SPDK_SOCK_GROUP_QUEUE_DEPTH;
+
+ if (io_uring_queue_init(SPDK_SOCK_GROUP_QUEUE_DEPTH, &group_impl->uring, 0) < 0) {
+ SPDK_ERRLOG("uring I/O context setup failure\n");
+ free(group_impl);
+ return NULL;
+ }
+
+ TAILQ_INIT(&group_impl->pending_recv);
+
+ return &group_impl->base;
+}
+
+static int
+uring_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group,
+ struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
+
+ sock->group = group;
+ sock->write_task.sock = sock;
+ sock->write_task.type = SPDK_SOCK_TASK_WRITE;
+
+ sock->pollin_task.sock = sock;
+ sock->pollin_task.type = SPDK_SOCK_TASK_POLLIN;
+
+ sock->cancel_task.sock = sock;
+ sock->cancel_task.type = SPDK_SOCK_TASK_CANCEL;
+
+ /* The sock may have switched from another polling group due to scheduling */
+ if (spdk_unlikely(sock->recv_pipe != NULL &&
+ (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) {
+ assert(sock->pending_recv == false);
+ sock->pending_recv = true;
+ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link);
+ }
+
+ return 0;
+}
+
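+/* One poll cycle: flush pending writes and arm POLLIN for every healthy
+ * sock, submit all prepared SQEs in a single io_uring_submit() call, then
+ * reap completions and return up to max_events read-ready socks. */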
+static int
+uring_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
+ struct spdk_sock **socks)
+{
+ struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
+ int count, ret;
+ int to_complete, to_submit;
+ struct spdk_sock *_sock, *tmp;
+ struct spdk_uring_sock *sock;
+
+ if (spdk_likely(socks)) {
+ TAILQ_FOREACH_SAFE(_sock, &group->base.socks, link, tmp) {
+ sock = __uring_sock(_sock);
+ if (spdk_unlikely(sock->connection_status)) {
+ continue;
+ }
+ _sock_flush(_sock);
+ _sock_prep_pollin(_sock);
+ }
+ }
+
+ to_submit = group->io_queued;
+
+ /* Network sockets cannot be opened with O_DIRECT, so there is no need to
+ * drive the ring with a raw io_uring_enter() call ourselves. */
+ if (to_submit > 0) {
+ /* If there is I/O to submit, use io_uring_submit() here.
+ * It calls io_uring_enter() internally as appropriate. */
+ ret = io_uring_submit(&group->uring);
+ if (ret < 0) {
+ return 1;
+ }
+ group->io_queued = 0;
+ group->io_inflight += to_submit;
+ group->io_avail -= to_submit;
+ }
+
+ count = 0;
+ to_complete = group->io_inflight;
+ if (to_complete > 0) {
+ count = sock_uring_group_reap(group, to_complete, max_events, socks);
+ }
+
+ return count;
+}
+
+static int
+uring_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group,
+ struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+ struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
+
+ if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
+ _sock_prep_cancel_task(_sock, &sock->write_task);
+ /* Since spdk_sock_group_remove_sock is not an asynchronous interface,
+ * we can busy-wait in a while loop here until the cancel completes. */
+ while ((sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
+ (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
+ uring_sock_group_impl_poll(_group, 32, NULL);
+ }
+ }
+
+ if (sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) {
+ _sock_prep_cancel_task(_sock, &sock->pollin_task);
+ /* Since spdk_sock_group_remove_sock is not an asynchronous interface,
+ * we can busy-wait in a while loop here until the cancel completes. */
+ while ((sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) ||
+ (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) {
+ uring_sock_group_impl_poll(_group, 32, NULL);
+ }
+ }
+
+ if (sock->pending_recv) {
+ TAILQ_REMOVE(&group->pending_recv, sock, link);
+ sock->pending_recv = false;
+ }
+ assert(sock->pending_recv == false);
+
+ sock->group = NULL;
+ return 0;
+}
+
+static int
+uring_sock_group_impl_close(struct spdk_sock_group_impl *_group)
+{
+ struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group);
+
+ /* try to reap all the active I/O */
+ while (group->io_inflight) {
+ uring_sock_group_impl_poll(_group, 32, NULL);
+ }
+ assert(group->io_inflight == 0);
+ assert(group->io_avail == SPDK_SOCK_GROUP_QUEUE_DEPTH);
+
+ io_uring_queue_exit(&group->uring);
+
+ free(group);
+ return 0;
+}
+
+static int
+uring_sock_flush(struct spdk_sock *_sock)
+{
+ struct spdk_uring_sock *sock = __uring_sock(_sock);
+
+ if (!sock->group) {
+ return _sock_flush_client(_sock);
+ }
+
+ return 0;
+}
+
+static struct spdk_net_impl g_uring_net_impl = {
+ .name = "uring",
+ .getaddr = uring_sock_getaddr,
+ .connect = uring_sock_connect,
+ .listen = uring_sock_listen,
+ .accept = uring_sock_accept,
+ .close = uring_sock_close,
+ .recv = uring_sock_recv,
+ .readv = uring_sock_readv,
+ .writev = uring_sock_writev,
+ .writev_async = uring_sock_writev_async,
+ .flush = uring_sock_flush,
+ .set_recvlowat = uring_sock_set_recvlowat,
+ .set_recvbuf = uring_sock_set_recvbuf,
+ .set_sendbuf = uring_sock_set_sendbuf,
+ .is_ipv6 = uring_sock_is_ipv6,
+ .is_ipv4 = uring_sock_is_ipv4,
+ .is_connected = uring_sock_is_connected,
+ .get_placement_id = uring_sock_get_placement_id,
+ .group_impl_create = uring_sock_group_impl_create,
+ .group_impl_add_sock = uring_sock_group_impl_add_sock,
+ .group_impl_remove_sock = uring_sock_group_impl_remove_sock,
+ .group_impl_poll = uring_sock_group_impl_poll,
+ .group_impl_close = uring_sock_group_impl_close,
+};
+
+SPDK_NET_IMPL_REGISTER(uring, &g_uring_net_impl, DEFAULT_SOCK_PRIORITY + 1);
diff --git a/src/spdk/module/sock/vpp/Makefile b/src/spdk/module/sock/vpp/Makefile
new file mode 100644
index 000000000..016018c77
--- /dev/null
+++ b/src/spdk/module/sock/vpp/Makefile
@@ -0,0 +1,55 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS += vpp.c
+CFLAGS += -Wno-sign-compare -Wno-error=old-style-definition
+CFLAGS += -Wno-error=strict-prototypes -Wno-error=ignored-qualifiers
+
+GCC_VERSION=$(shell $(CC) -dumpversion | cut -d. -f1)
+
+# disable packed member unaligned-access warnings
+ifeq ($(shell test $(GCC_VERSION) -ge 9 && echo 1), 1)
+CFLAGS += -Wno-error=address-of-packed-member
+endif
+
+LIBNAME = sock_vpp
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/module/sock/vpp/vpp.c b/src/spdk/module/sock/vpp/vpp.c
new file mode 100644
index 000000000..89a92e9d1
--- /dev/null
+++ b/src/spdk/module/sock/vpp/vpp.c
@@ -0,0 +1,1633 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Omit from static analysis. */
+#ifndef __clang_analyzer__
+
+#include "spdk/stdinc.h"
+
+#include "spdk/log.h"
+#include "spdk/sock.h"
+#include "spdk/net.h"
+#include "spdk/string.h"
+#include "spdk_internal/sock.h"
+#include "spdk/queue.h"
+#include "spdk/event.h"
+#include "spdk/thread.h"
+#include "spdk_internal/log.h"
+
+/* _GNU_SOURCE is redefined in the vpp headers with no protection (dlmalloc.h) */
+#undef _GNU_SOURCE
+
+#include <svm/svm_fifo_segment.h>
+#include <vlibmemory/api.h>
+#include <vpp/api/vpe_msg_enum.h>
+#include <vnet/session/application_interface.h>
+
+#define vl_typedefs /* define message structures */
+#include <vpp/api/vpe_all_api_h.h>
+#undef vl_typedefs
+
+/* declare message handlers for each api */
+
+#define vl_endianfun /* define endian conversion functions */
+#include <vpp/api/vpe_all_api_h.h>
+#undef vl_endianfun
+
+/* instantiate all the print functions we know about */
+#define vl_print(handle, ...)
+#define vl_printfun
+#include <vpp/api/vpe_all_api_h.h>
+#undef vl_printfun
+
+#define SPDK_VPP_CLIB_MEM_SIZE (256 << 20)
+#define SPDK_VPP_SESSIONS_MAX 2048
+#define SPDK_VPP_LISTEN_QUEUE_SIZE SPDK_VPP_SESSIONS_MAX
+#define SPDK_VPP_SEGMENT_BASEVA 0x200000000ULL
+#define SPDK_VPP_SEGMENT_TIMEOUT 20
+#define IOV_BATCH_SIZE 64
+
+/* VPP connection state */
+enum spdk_vpp_state {
+ VPP_STATE_START,
+ VPP_STATE_ENABLED,
+ VPP_STATE_ATTACHED,
+ VPP_STATE_READY,
+ VPP_STATE_DISCONNECTING,
+ VPP_STATE_FAILED
+};
+
+/* VPP session state */
+enum spdk_vpp_session_state {
+ VPP_SESSION_STATE_UNUSED = 0,
+ VPP_SESSION_STATE_INIT, /* Initial state */
+ VPP_SESSION_STATE_READY, /* Ready for processing */
+ VPP_SESSION_STATE_DISCONNECT,
+ VPP_SESSION_STATE_CLOSE,
+ VPP_SESSION_STATE_FAILED
+};
+
+struct spdk_vpp_session {
+ struct spdk_sock base;
+
+ /* VPP app session */
+ app_session_t app_session;
+
+ uint32_t id;
+
+ bool is_server; /* Server side session */
+ bool is_listen; /* Session is listener */
+
+ uint64_t handle;
+ uint32_t context;
+
+ /* Listener fields */
+ pthread_mutex_t accept_session_lock;
+ uint32_t *accept_session_index_fifo;
+};
+
+static struct spdk_vpp_main {
+ int my_client_index;
+ enum spdk_vpp_state vpp_state;
+ bool vpp_initialized;
+ struct spdk_thread *init_thread;
+
+ svm_fifo_segment_main_t segment_main;
+ svm_queue_t *vl_input_queue;
+ svm_queue_t *vl_output_queue;
+ svm_msg_q_t *app_event_queue;
+
+ struct spdk_vpp_session sessions[SPDK_VPP_SESSIONS_MAX];
+ pthread_mutex_t session_get_lock;
+
+ struct spdk_poller *vpp_queue_poller;
+ struct spdk_poller *app_queue_poller;
+ struct spdk_poller *timeout_poller;
+} g_svm;
+
+struct spdk_vpp_sock_group_impl {
+ struct spdk_sock_group_impl base;
+ struct spdk_sock *last_sock;
+};
+
+#define __vpp_session(sock) ((struct spdk_vpp_session *)sock)
+#define __vpp_group_impl(group) ((struct spdk_vpp_sock_group_impl *)group)
+
+/******************************************************************************
+ * Session management
+ */
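+/* Sessions live in a fixed, statically allocated table of
+ * SPDK_VPP_SESSIONS_MAX slots; a slot whose state is
+ * VPP_SESSION_STATE_UNUSED is free. Allocation is a linear scan under
+ * session_get_lock, so session ids are simply table indices. */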
+static struct spdk_vpp_session *
+vpp_session_create(void)
+{
+ struct spdk_vpp_session *session;
+ int i;
+
+ pthread_mutex_lock(&g_svm.session_get_lock);
+ for (i = 0; i < SPDK_VPP_SESSIONS_MAX &&
+ g_svm.sessions[i].app_session.session_state != VPP_SESSION_STATE_UNUSED; i++) {
+ /* Empty loop body */
+ }
+ if (i == SPDK_VPP_SESSIONS_MAX ||
+ g_svm.sessions[i].app_session.session_state != VPP_SESSION_STATE_UNUSED) {
+ SPDK_ERRLOG("Cannot allocate space for new session\n");
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+ return NULL;
+ }
+ session = &g_svm.sessions[i];
+ memset(session, 0, sizeof(struct spdk_vpp_session));
+ pthread_mutex_init(&session->accept_session_lock, NULL);
+
+ session->id = i;
+ session->app_session.session_state = VPP_SESSION_STATE_INIT;
+
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Creating new session %p (%d)\n",
+ session, session->id);
+
+ return session;
+}
+
+static struct spdk_vpp_session *
+vpp_session_get(uint32_t id)
+{
+ struct spdk_vpp_session *session = NULL;
+
+ if (id >= SPDK_VPP_SESSIONS_MAX) {
+ return NULL;
+ }
+
+ pthread_mutex_lock(&g_svm.session_get_lock);
+ if (g_svm.sessions[id].app_session.session_state != VPP_SESSION_STATE_UNUSED) {
+ session = &g_svm.sessions[id];
+ }
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+
+ return session;
+}
+
+static struct spdk_vpp_session *
+vpp_session_get_by_handle(uint64_t handle, bool is_listen)
+{
+ struct spdk_vpp_session *session = NULL;
+ int i;
+
+ for (i = 0; i < SPDK_VPP_SESSIONS_MAX; i++) {
+ if (g_svm.sessions[i].app_session.session_state != VPP_SESSION_STATE_UNUSED &&
+ g_svm.sessions[i].app_session.session_state != VPP_SESSION_STATE_DISCONNECT &&
+ g_svm.sessions[i].handle == handle &&
+ g_svm.sessions[i].is_listen == is_listen) {
+ session = &g_svm.sessions[i];
+ break;
+ }
+ }
+
+ return session;
+}
+
+static int
+vpp_session_free(struct spdk_vpp_session *session)
+{
+ if (session == NULL) {
+ SPDK_ERRLOG("Invalid session\n");
+ return -EINVAL;
+ }
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Free session %p (%d)\n", session, session->id);
+
+ pthread_mutex_lock(&g_svm.session_get_lock);
+ session->app_session.session_state = VPP_SESSION_STATE_UNUSED;
+ pthread_mutex_destroy(&session->accept_session_lock);
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+
+ return 0;
+}
+
+static int
+vpp_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport,
+ char *caddr, int clen, uint16_t *cport)
+{
+ struct spdk_vpp_session *session = __vpp_session(_sock);
+ const char *result = NULL;
+
+ assert(session != NULL);
+ assert(g_svm.vpp_initialized);
+
+ if (session->app_session.transport.is_ip4) {
+ result = inet_ntop(AF_INET, &session->app_session.transport.lcl_ip.ip4.as_u8,
+ saddr, slen);
+ } else {
+ result = inet_ntop(AF_INET6, &session->app_session.transport.lcl_ip.ip6.as_u8,
+ saddr, slen);
+ }
+ if (result == NULL) {
+ return -1;
+ }
+
+ if (sport) {
+ *sport = ntohs(session->app_session.transport.lcl_port);
+ }
+
+ if (session->app_session.transport.is_ip4) {
+ result = inet_ntop(AF_INET, &session->app_session.transport.rmt_ip.ip4.as_u8,
+ caddr, clen);
+ } else {
+ result = inet_ntop(AF_INET6, &session->app_session.transport.rmt_ip.ip6.as_u8,
+ caddr, clen);
+ }
+ if (result == NULL) {
+ return -1;
+ }
+
+ if (cport) {
+ *cport = ntohs(session->app_session.transport.rmt_port);
+ }
+
+ return 0;
+}
+
+enum spdk_vpp_create_type {
+ SPDK_SOCK_CREATE_LISTEN,
+ SPDK_SOCK_CREATE_CONNECT,
+};
+
+/******************************************************************************
+ * VPP message handlers
+ */
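+/* Called when VPP hands us a new connection on one of our listeners. The
+ * shared-memory rx/tx fifos are tagged with the local session index, the
+ * peer's address is copied out of the message, and the new session id is
+ * pushed onto the listener's accept fifo for vpp_sock_accept() to drain. */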
+static void
+session_accepted_handler(session_accepted_msg_t *mp)
+{
+ svm_fifo_t *rx_fifo, *tx_fifo;
+ struct spdk_vpp_session *client_session, *listen_session;
+
+ pthread_mutex_lock(&g_svm.session_get_lock);
+ listen_session = vpp_session_get_by_handle(mp->listener_handle, true);
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+ if (!listen_session) {
+ SPDK_ERRLOG("Listener not found\n");
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Listeners handle is %" PRIu64 "\n", mp->listener_handle);
+
+ /* Allocate local session for a client and set it up */
+ client_session = vpp_session_create();
+ if (client_session == NULL) {
+ SPDK_ERRLOG("Cannot create new session\n");
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Accept session %p (%d) on %p (%d/%" PRIu64 ")\n",
+ client_session, client_session->id, listen_session, listen_session->id,
+ listen_session->handle);
+
+ rx_fifo = uword_to_pointer(mp->server_rx_fifo, svm_fifo_t *);
+ rx_fifo->client_session_index = client_session->id;
+ tx_fifo = uword_to_pointer(mp->server_tx_fifo, svm_fifo_t *);
+ tx_fifo->client_session_index = client_session->id;
+
+ client_session->handle = mp->handle;
+ client_session->context = mp->context;
+ client_session->app_session.rx_fifo = rx_fifo;
+ client_session->app_session.tx_fifo = tx_fifo;
+ client_session->app_session.vpp_evt_q = uword_to_pointer(mp->vpp_event_queue_address,
+ svm_msg_q_t *);
+
+ client_session->is_server = true;
+ client_session->app_session.transport.rmt_port = mp->port;
+ client_session->app_session.transport.is_ip4 = mp->is_ip4;
+ memcpy(&client_session->app_session.transport.rmt_ip, mp->ip, sizeof(mp->ip));
+
+ client_session->app_session.transport.lcl_port = listen_session->app_session.transport.lcl_port;
+ memcpy(&client_session->app_session.transport.lcl_ip, &listen_session->app_session.transport.lcl_ip,
+ sizeof(listen_session->app_session.transport.lcl_ip));
+ client_session->app_session.transport.is_ip4 = listen_session->app_session.transport.is_ip4;
+
+ client_session->app_session.session_state = VPP_SESSION_STATE_READY;
+
+ pthread_mutex_lock(&listen_session->accept_session_lock);
+
+ clib_fifo_add1(listen_session->accept_session_index_fifo,
+ client_session->id);
+
+ pthread_mutex_unlock(&listen_session->accept_session_lock);
+}
+
+static void
+session_connected_handler(session_connected_msg_t *mp)
+{
+ struct spdk_vpp_session *session;
+ svm_fifo_t *rx_fifo, *tx_fifo;
+
+ session = vpp_session_get(mp->context);
+ if (session == NULL) {
+ return;
+ }
+
+ if (mp->retval) {
+ SPDK_ERRLOG("Connection failed (%d).\n", ntohl(mp->retval));
+ session->app_session.session_state = VPP_SESSION_STATE_FAILED;
+ return;
+ }
+
+ session->app_session.vpp_evt_q = uword_to_pointer(mp->vpp_event_queue_address,
+ svm_msg_q_t *);
+
+ rx_fifo = uword_to_pointer(mp->server_rx_fifo, svm_fifo_t *);
+ rx_fifo->client_session_index = session->id;
+ tx_fifo = uword_to_pointer(mp->server_tx_fifo, svm_fifo_t *);
+ tx_fifo->client_session_index = session->id;
+
+ session->app_session.rx_fifo = rx_fifo;
+ session->app_session.tx_fifo = tx_fifo;
+ session->handle = mp->handle;
+
+ /* Set lcl addr */
+ session->app_session.transport.is_ip4 = mp->is_ip4;
+ memcpy(&session->app_session.transport.lcl_ip, mp->lcl_ip, sizeof(mp->lcl_ip));
+ session->app_session.transport.lcl_port = mp->lcl_port;
+
+ session->app_session.session_state = VPP_SESSION_STATE_READY;
+}
+
+static void
+session_disconnected_handler(session_disconnected_msg_t *mp)
+{
+ struct spdk_vpp_session *session = 0;
+
+ pthread_mutex_lock(&g_svm.session_get_lock);
+ session = vpp_session_get_by_handle(mp->handle, false);
+ if (session == NULL) {
+ SPDK_ERRLOG("Session with handle=%" PRIu64 " not found.\n",
+ mp->handle);
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+ return;
+ }
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Disconnect session %p (%d) handler\n", session, session->id);
+
+ /* Postpone session deletion so the upper layer can be informed first */
+ session->app_session.session_state = VPP_SESSION_STATE_DISCONNECT;
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+}
+
+static void
+session_reset_handler(session_reset_msg_t *mp)
+{
+ int rv = 0;
+ struct spdk_vpp_session *session = NULL;
+ app_session_evt_t app_evt;
+ session_reset_reply_msg_t *rmp;
+
+ pthread_mutex_lock(&g_svm.session_get_lock);
+ session = vpp_session_get_by_handle(mp->handle, false);
+ if (session == NULL) {
+ SPDK_ERRLOG("Session with handle=%" PRIu64 " not found.\n",
+ mp->handle);
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+ return;
+ }
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Reset session %p (%d) handler\n", session, session->id);
+
+ session->app_session.session_state = VPP_SESSION_STATE_DISCONNECT;
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+
+ app_alloc_ctrl_evt_to_vpp(session->app_session.vpp_evt_q, &app_evt,
+ SESSION_CTRL_EVT_RESET_REPLY);
+ rmp = (session_reset_reply_msg_t *) app_evt.evt->data;
+ rmp->retval = rv;
+ rmp->handle = mp->handle;
+ app_send_ctrl_evt_to_vpp(session->app_session.vpp_evt_q, &app_evt);
+}
+
+static void
+session_bound_handler(session_bound_msg_t *mp)
+{
+ struct spdk_vpp_session *session;
+
+ /* Context should be set to the session index */
+ session = vpp_session_get(mp->context);
+ if (session == NULL) {
+ SPDK_ERRLOG("Cannot find a session by context\n");
+ return;
+ }
+
+ if (mp->retval) {
+ SPDK_ERRLOG("Bind failed (%d).\n", ntohl(mp->retval));
+ session->app_session.session_state = VPP_SESSION_STATE_FAILED;
+ return;
+ }
+
+ /* Set local address */
+ session->app_session.transport.is_ip4 = mp->lcl_is_ip4;
+ memcpy(&session->app_session.transport.lcl_ip, mp->lcl_ip, sizeof(mp->lcl_ip));
+ session->app_session.transport.lcl_port = mp->lcl_port;
+
+ /* Register listener */
+ session->handle = mp->handle;
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Bind session %p (%d/%" PRIu64 ")\n",
+ session, session->id, session->handle);
+
+ /* Session bound, set listen state */
+ session->is_listen = true;
+ session->app_session.session_state = VPP_SESSION_STATE_READY;
+}
+
+static void
+session_unlisten_reply_handler(session_unlisten_reply_msg_t *mp)
+{
+ struct spdk_vpp_session *session;
+
+ if (mp->retval != 0) {
+ SPDK_ERRLOG("Cannot unbind socket\n");
+ return;
+ }
+
+ session = vpp_session_get(mp->context);
+ if (session == NULL) {
+ SPDK_ERRLOG("Cannot find a session by context\n");
+ return;
+ }
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Unbind session %p (%d)\n", session, session->id);
+
+ session->app_session.session_state = VPP_SESSION_STATE_CLOSE;
+}
+
+static void
+handle_mq_event(session_event_t *e)
+{
+ switch (e->event_type) {
+ case SESSION_CTRL_EVT_BOUND:
+ session_bound_handler((session_bound_msg_t *) e->data);
+ break;
+ case SESSION_CTRL_EVT_ACCEPTED:
+ session_accepted_handler((session_accepted_msg_t *) e->data);
+ break;
+ case SESSION_CTRL_EVT_CONNECTED:
+ session_connected_handler((session_connected_msg_t *) e->data);
+ break;
+ case SESSION_CTRL_EVT_DISCONNECTED:
+ session_disconnected_handler((session_disconnected_msg_t *) e->data);
+ break;
+ case SESSION_CTRL_EVT_RESET:
+ session_reset_handler((session_reset_msg_t *) e->data);
+ break;
+ case SESSION_CTRL_EVT_UNLISTEN_REPLY:
+ session_unlisten_reply_handler((session_unlisten_reply_msg_t *) e->data);
+ break;
+ default:
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Unhandled event %u\n", e->event_type);
+ }
+}
+
+static int
+vpp_queue_poller(void *ctx)
+{
+ uword msg;
+
+ if (g_svm.vl_output_queue->cursize > 0 &&
+ !svm_queue_sub_raw(g_svm.vl_output_queue, (u8 *)&msg)) {
+ vl_msg_api_handler((void *)msg);
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+app_queue_poller(void *ctx)
+{
+ session_event_t *e;
+ svm_msg_q_msg_t msg;
+
+ if (!svm_msg_q_is_empty(g_svm.app_event_queue)) {
+ svm_msg_q_sub(g_svm.app_event_queue, &msg, SVM_Q_WAIT, 0);
+ e = svm_msg_q_msg_data(g_svm.app_event_queue, &msg);
+ handle_mq_event(e);
+ svm_msg_q_free_msg(g_svm.app_event_queue, &msg);
+ }
+ return SPDK_POLLER_BUSY;
+}
+
+/* This is required until the sock.c API becomes asynchronous */
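+/* Busy-wait (with a 10 second timeout) until the session reaches the
+ * requested state. Only the init thread pumps the VPP message queues
+ * here, since the pollers are registered on that thread. */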
+static int
+_wait_for_session_state_change(struct spdk_vpp_session *session, enum spdk_vpp_session_state state)
+{
+ time_t start = time(NULL);
+ while (time(NULL) - start < 10) {
+ if (session->app_session.session_state == VPP_SESSION_STATE_FAILED) {
+ errno = EADDRNOTAVAIL;
+ return -1;
+ }
+ if (session->app_session.session_state == state) {
+ errno = 0;
+ return 0;
+ }
+ if (spdk_get_thread() == g_svm.init_thread) {
+ usleep(100000);
+ app_queue_poller(NULL);
+ vpp_queue_poller(NULL);
+ }
+ }
+ /* timeout */
+ errno = ETIMEDOUT;
+ return -1;
+}
+
+static int
+vpp_session_connect(struct spdk_vpp_session *session)
+{
+ vl_api_connect_sock_t *cmp;
+
+ cmp = vl_msg_api_alloc(sizeof(*cmp));
+ if (cmp == NULL) {
+ return -ENOMEM;
+ }
+ memset(cmp, 0, sizeof(*cmp));
+
+ cmp->_vl_msg_id = ntohs(VL_API_CONNECT_SOCK);
+ cmp->client_index = g_svm.my_client_index;
+ cmp->context = session->id;
+
+ cmp->vrf = 0 /* VPPCOM_VRF_DEFAULT */;
+ cmp->is_ip4 = (session->app_session.transport.is_ip4);
+ memcpy(cmp->ip, &session->app_session.transport.rmt_ip, sizeof(cmp->ip));
+ cmp->port = session->app_session.transport.rmt_port;
+ cmp->proto = TRANSPORT_PROTO_TCP;
+ vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&cmp);
+
+ return _wait_for_session_state_change(session, VPP_SESSION_STATE_READY);
+}
+
+static void
+vl_api_disconnect_session_reply_t_handler(vl_api_disconnect_session_reply_t *mp)
+{
+ struct spdk_vpp_session *session;
+
+ if (mp->retval) {
+ SPDK_ERRLOG("Disconnecting session failed (%d).\n", ntohl(mp->retval));
+ return;
+ }
+
+ pthread_mutex_lock(&g_svm.session_get_lock);
+ session = vpp_session_get_by_handle(mp->handle, false);
+ if (session == NULL) {
+ SPDK_ERRLOG("Invalid session handler (%" PRIu64 ").\n", mp->handle);
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+ return;
+ }
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Session disconnected %p (%d)\n", session, session->id);
+ session->app_session.session_state = VPP_SESSION_STATE_CLOSE;
+ pthread_mutex_unlock(&g_svm.session_get_lock);
+}
+
+static int
+vpp_session_disconnect(struct spdk_vpp_session *session)
+{
+ int rv = 0;
+ vl_api_disconnect_session_t *dmp;
+ session_disconnected_reply_msg_t *rmp;
+ app_session_evt_t app_evt;
+
+ if (session->app_session.session_state == VPP_SESSION_STATE_DISCONNECT) {
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Session is already in disconnecting state %p (%d)\n",
+ session, session->id);
+
+ app_alloc_ctrl_evt_to_vpp(session->app_session.vpp_evt_q, &app_evt,
+ SESSION_CTRL_EVT_DISCONNECTED_REPLY);
+ rmp = (session_disconnected_reply_msg_t *) app_evt.evt->data;
+ rmp->retval = rv;
+ rmp->handle = session->handle;
+ rmp->context = session->context;
+ app_send_ctrl_evt_to_vpp(session->app_session.vpp_evt_q, &app_evt);
+
+ return 0;
+ }
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Disconnect session %p (%d)\n", session, session->id);
+
+ dmp = vl_msg_api_alloc(sizeof(*dmp));
+ if (dmp == NULL) {
+ return -ENOMEM;
+ }
+ memset(dmp, 0, sizeof(*dmp));
+ dmp->_vl_msg_id = ntohs(VL_API_DISCONNECT_SESSION);
+ dmp->client_index = g_svm.my_client_index;
+ dmp->handle = session->handle;
+ vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&dmp);
+
+ return _wait_for_session_state_change(session, VPP_SESSION_STATE_CLOSE);
+}
+
+static int
+send_unbind_sock(struct spdk_vpp_session *session)
+{
+ vl_api_unbind_sock_t *ump;
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Unbind session %p (%d) request\n", session, session->id);
+
+ ump = vl_msg_api_alloc(sizeof(*ump));
+ if (ump == NULL) {
+ return -ENOMEM;
+ }
+ memset(ump, 0, sizeof(*ump));
+
+ ump->_vl_msg_id = ntohs(VL_API_UNBIND_SOCK);
+ ump->client_index = g_svm.my_client_index;
+ ump->handle = session->handle;
+ ump->context = session->id;
+ vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&ump);
+
+ return _wait_for_session_state_change(session, VPP_SESSION_STATE_CLOSE);
+}
+
+static int
+vpp_session_listen(struct spdk_vpp_session *session)
+{
+ vl_api_bind_sock_t *bmp;
+
+ if (session->is_listen) {
+ /* Already in the listen state */
+ return 0;
+ }
+
+ clib_fifo_resize(session->accept_session_index_fifo, SPDK_VPP_LISTEN_QUEUE_SIZE);
+
+ session->is_server = 1;
+ bmp = vl_msg_api_alloc(sizeof(*bmp));
+ if (bmp == NULL) {
+ return -ENOMEM;
+ }
+ memset(bmp, 0, sizeof(*bmp));
+
+ bmp->_vl_msg_id = ntohs(VL_API_BIND_SOCK);
+ bmp->client_index = g_svm.my_client_index;
+ bmp->context = session->id;
+ bmp->vrf = 0;
+ bmp->is_ip4 = session->app_session.transport.is_ip4;
+ memcpy(bmp->ip, &session->app_session.transport.lcl_ip, sizeof(bmp->ip));
+ bmp->port = session->app_session.transport.lcl_port;
+ bmp->proto = TRANSPORT_PROTO_TCP;
+
+ vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&bmp);
+
+ return _wait_for_session_state_change(session, VPP_SESSION_STATE_READY);
+}
+
+static struct spdk_sock *
+vpp_sock_create(const char *ip, int port, enum spdk_vpp_create_type type,
+ struct spdk_sock_opts *opts)
+{
+ struct spdk_vpp_session *session;
+ int rc;
+ uint8_t is_ip4 = 0;
+ ip46_address_t addr_buf;
+
+ if (!g_svm.vpp_initialized || ip == NULL) {
+ return NULL;
+ }
+
+ session = vpp_session_create();
+ if (session == NULL) {
+ SPDK_ERRLOG("vpp_session_create() failed\n");
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ /* Check address family */
+ if (inet_pton(AF_INET, ip, &addr_buf.ip4.as_u8)) {
+ is_ip4 = 1;
+ } else if (inet_pton(AF_INET6, ip, &addr_buf.ip6.as_u8)) {
+ is_ip4 = 0;
+ } else {
+ SPDK_ERRLOG("IP address with invalid format\n");
+ errno = EAFNOSUPPORT;
+ goto err;
+ }
+
+ if (type == SPDK_SOCK_CREATE_LISTEN) {
+ session->app_session.transport.is_ip4 = is_ip4;
+ memcpy(&session->app_session.transport.lcl_ip, &addr_buf, sizeof(addr_buf));
+ session->app_session.transport.lcl_port = htons(port);
+
+ rc = vpp_session_listen(session);
+ if (rc != 0) {
+ errno = -rc;
+ SPDK_ERRLOG("session_listen() failed\n");
+ goto err;
+ }
+ } else if (type == SPDK_SOCK_CREATE_CONNECT) {
+ session->app_session.transport.is_ip4 = is_ip4;
+ memcpy(&session->app_session.transport.rmt_ip, &addr_buf, sizeof(addr_buf));
+ session->app_session.transport.rmt_port = htons(port);
+
+ rc = vpp_session_connect(session);
+ if (rc != 0) {
+ SPDK_ERRLOG("session_connect() failed\n");
+ goto err;
+ }
+ }
+
+ return &session->base;
+
+err:
+ vpp_session_free(session);
+ return NULL;
+}
+
+static struct spdk_sock *
+vpp_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts)
+{
+ return vpp_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts);
+}
+
+static struct spdk_sock *
+vpp_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts)
+{
+ return vpp_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts);
+}
+
+static struct spdk_sock *
+vpp_sock_accept(struct spdk_sock *_sock)
+{
+ struct spdk_vpp_session *listen_session = __vpp_session(_sock);
+ struct spdk_vpp_session *client_session = NULL;
+ u32 client_session_index = ~0;
+ uword elts = 0;
+ app_session_evt_t app_evt;
+ session_accepted_reply_msg_t *rmp;
+
+ assert(listen_session != NULL);
+ assert(g_svm.vpp_initialized);
+
+ if (listen_session->app_session.session_state != VPP_SESSION_STATE_READY) {
+ /* Listen session should be in the listen state */
+ errno = EWOULDBLOCK;
+ return NULL;
+ }
+
+ pthread_mutex_lock(&listen_session->accept_session_lock);
+
+ if (listen_session->accept_session_index_fifo != NULL) {
+ elts = clib_fifo_elts(listen_session->accept_session_index_fifo);
+ }
+
+ if (elts == 0) {
+ /* No client sessions */
+ errno = EAGAIN;
+ pthread_mutex_unlock(&listen_session->accept_session_lock);
+ return NULL;
+ }
+
+ clib_fifo_sub1(listen_session->accept_session_index_fifo,
+ client_session_index);
+
+ pthread_mutex_unlock(&listen_session->accept_session_lock);
+
+ client_session = vpp_session_get(client_session_index);
+ if (client_session == NULL) {
+ SPDK_ERRLOG("client session closed or aborted\n");
+ errno = ECONNABORTED;
+ return NULL;
+ }
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Client %p(%" PRIu32 ") accepted.\n",
+ client_session, client_session_index);
+
+ /*
+ * Send accept session reply
+ */
+ app_alloc_ctrl_evt_to_vpp(client_session->app_session.vpp_evt_q, &app_evt,
+ SESSION_CTRL_EVT_ACCEPTED_REPLY);
+ rmp = (session_accepted_reply_msg_t *) app_evt.evt->data;
+ rmp->handle = client_session->handle;
+ rmp->context = client_session->context;
+ app_send_ctrl_evt_to_vpp(client_session->app_session.vpp_evt_q, &app_evt);
+
+ return &client_session->base;
+}
+
+static int
+vpp_sock_close(struct spdk_sock *_sock)
+{
+ struct spdk_vpp_session *session = __vpp_session(_sock);
+
+ assert(session != NULL);
+ assert(g_svm.vpp_initialized);
+
+ if (session->is_listen) {
+ send_unbind_sock(session);
+ } else {
+ vpp_session_disconnect(session);
+ }
+ vpp_session_free(session);
+
+ return 0;
+}
+
+static ssize_t
+vpp_sock_recv(struct spdk_sock *_sock, void *buf, size_t len)
+{
+ struct spdk_vpp_session *session = __vpp_session(_sock);
+ int rc;
+ svm_fifo_t *rx_fifo;
+ uint32_t bytes;
+
+ assert(session != NULL);
+ assert(g_svm.vpp_initialized);
+
+ rx_fifo = session->app_session.rx_fifo;
+
+ bytes = svm_fifo_max_dequeue(rx_fifo);
+ if (bytes > len) {
+ bytes = len;
+ }
+
+ if (bytes == 0) {
+ if (session->app_session.session_state == VPP_SESSION_STATE_DISCONNECT) {
+ /* Socket is disconnected */
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Client %p(%" PRIu32 ") is disconnected.\n",
+ session, session->id);
+ errno = 0;
+ return 0;
+ }
+ errno = EAGAIN;
+ return -1;
+ }
+
+ rc = app_recv_stream_raw(rx_fifo, buf, bytes, 0, 0);
+ if (rc < 0) {
+ errno = -rc;
+ return rc;
+ }
+
+ return rc;
+}
+
+static ssize_t
+vpp_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+ ssize_t total = 0;
+ int i, rc;
+
+ assert(_sock != NULL);
+ assert(g_svm.vpp_initialized);
+
+ for (i = 0; i < iovcnt; ++i) {
+ rc = vpp_sock_recv(_sock, iov[i].iov_base, iov[i].iov_len);
+ if (rc < 0) {
+ if (total > 0) {
+ break;
+ } else {
+ errno = -rc;
+ return -1;
+ }
+ } else {
+ total += rc;
+ if (rc < (ssize_t)iov[i].iov_len) {
+ /* Read less than the buffer provided; no point in continuing. */
+ break;
+ }
+ }
+ }
+ return total;
+}
+
+static ssize_t
+_vpp_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+ struct spdk_vpp_session *session = __vpp_session(_sock);
+ ssize_t total = 0;
+ int i, rc;
+ svm_fifo_t *tx_fifo;
+ session_evt_type_t et;
+
+ assert(session != NULL);
+ assert(g_svm.vpp_initialized);
+
+ tx_fifo = session->app_session.tx_fifo;
+ et = SESSION_IO_EVT_TX;
+
+ for (i = 0; i < iovcnt; ++i) {
+ if (svm_fifo_is_full(tx_fifo)) {
+ errno = EWOULDBLOCK;
+ return -1;
+ }
+
+ /* We use only stream connection for now */
+ rc = app_send_stream_raw(tx_fifo, session->app_session.vpp_evt_q,
+ iov[i].iov_base, iov[i].iov_len, et,
+ 1, SVM_Q_WAIT);
+
+ if (rc < 0) {
+ if (total > 0) {
+ break;
+ } else {
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Buffer overflow\n");
+ errno = EWOULDBLOCK;
+ return -1;
+ }
+ } else {
+ total += rc;
+ if (rc < (ssize_t)iov[i].iov_len) {
+ /* Wrote less than the buffer provided; no point in continuing. */
+ break;
+ }
+ }
+ }
+
+ return total;
+}
+
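+/* Two-pass flush: first gather up to IOV_BATCH_SIZE iovecs from the queued
+ * requests (honoring each request's partial-write offset), then write them
+ * in one vectored call and walk the queue again, retiring fully written
+ * requests and recording the offset of a partially written one. */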
+static int
+_sock_flush(struct spdk_sock *sock)
+{
+ struct iovec iovs[IOV_BATCH_SIZE];
+ int iovcnt;
+ int retval;
+ struct spdk_sock_request *req;
+ int i;
+ ssize_t rc;
+ unsigned int offset;
+ size_t len;
+
+ /* Can't flush from within a callback or we end up with recursive calls */
+ if (sock->cb_cnt > 0) {
+ return 0;
+ }
+
+ /* Gather an iov */
+ iovcnt = 0;
+ req = TAILQ_FIRST(&sock->queued_reqs);
+ while (req) {
+ offset = req->internal.offset;
+
+ for (i = 0; i < req->iovcnt; i++) {
+ /* Consume any offset first */
+ if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
+ offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
+ continue;
+ }
+
+ iovs[iovcnt].iov_base = SPDK_SOCK_REQUEST_IOV(req, i)->iov_base + offset;
+ iovs[iovcnt].iov_len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
+ iovcnt++;
+
+ offset = 0;
+
+ if (iovcnt >= IOV_BATCH_SIZE) {
+ break;
+ }
+ }
+
+ if (iovcnt >= IOV_BATCH_SIZE) {
+ break;
+ }
+
+ req = TAILQ_NEXT(req, internal.link);
+ }
+
+ if (iovcnt == 0) {
+ return 0;
+ }
+
+ /* Perform the vectored write */
+ rc = _vpp_sock_writev(sock, iovs, iovcnt);
+ if (rc <= 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return 0;
+ }
+ return rc;
+ }
+
+ /* Consume the requests that were actually written */
+ req = TAILQ_FIRST(&sock->queued_reqs);
+ while (req) {
+ offset = req->internal.offset;
+
+ for (i = 0; i < req->iovcnt; i++) {
+ /* Advance by the offset first */
+ if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) {
+ offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len;
+ continue;
+ }
+
+ /* Calculate the remaining length of this element */
+ len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset;
+
+ if (len > (size_t)rc) {
+ /* This element was partially sent. */
+ req->internal.offset += rc;
+ return 0;
+ }
+
+ offset = 0;
+ req->internal.offset += len;
+ rc -= len;
+ }
+
+ /* Handled a full request. */
+ req->internal.offset = 0;
+ spdk_sock_request_pend(sock, req);
+
+ /* The _vpp_sock_writev above isn't currently asynchronous,
+ * so it's already done. */
+ retval = spdk_sock_request_put(sock, req, 0);
+
+ if (rc == 0 || retval) {
+ break;
+ }
+
+ req = TAILQ_FIRST(&sock->queued_reqs);
+ }
+
+ return 0;
+}
+
+static ssize_t
+vpp_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt)
+{
+ int rc;
+
+ /* In order to process a writev, we need to flush any asynchronous writes
+ * first. */
+ rc = _sock_flush(_sock);
+ if (rc < 0) {
+ return rc;
+ }
+
+ if (!TAILQ_EMPTY(&_sock->queued_reqs)) {
+ /* We weren't able to flush all requests */
+ errno = EAGAIN;
+ return -1;
+ }
+
+ return _vpp_sock_writev(_sock, iov, iovcnt);
+}
+
+static void
+vpp_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req)
+{
+ int rc;
+
+ spdk_sock_request_queue(sock, req);
+
+ if (sock->group_impl == NULL) {
+ spdk_sock_request_put(sock, req, -ENOTSUP);
+ return;
+ }
+
+ /* If enough requests are queued, just flush them out immediately. */
+ if (sock->queued_iovcnt >= IOV_BATCH_SIZE) {
+ rc = _sock_flush(sock);
+ if (rc) {
+ spdk_sock_abort_requests(sock);
+ }
+ }
+}
+
+static int
+vpp_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes)
+{
+ assert(g_svm.vpp_initialized);
+
+ return 0;
+}
+
+static int
+vpp_sock_set_recvbuf(struct spdk_sock *_sock, int sz)
+{
+ assert(g_svm.vpp_initialized);
+
+ return 0;
+}
+
+static int
+vpp_sock_set_sendbuf(struct spdk_sock *_sock, int sz)
+{
+ assert(g_svm.vpp_initialized);
+
+ return 0;
+}
+
+static bool
+vpp_sock_is_ipv6(struct spdk_sock *_sock)
+{
+ return !__vpp_session(_sock)->app_session.transport.is_ip4;
+}
+
+static bool
+vpp_sock_is_ipv4(struct spdk_sock *_sock)
+{
+ return __vpp_session(_sock)->app_session.transport.is_ip4;
+}
+
+static bool
+vpp_sock_is_connected(struct spdk_sock *_sock)
+{
+ assert(g_svm.vpp_initialized);
+
+ return (__vpp_session(_sock)->app_session.session_state == VPP_SESSION_STATE_READY);
+}
+
+static int
+vpp_sock_get_placement_id(struct spdk_sock *_sock, int *placement_id)
+{
+ return -1;
+}
+
+static struct spdk_sock_group_impl *
+vpp_sock_group_impl_create(void)
+{
+ struct spdk_vpp_sock_group_impl *group_impl;
+
+ if (!g_svm.vpp_initialized) {
+ return NULL;
+ }
+
+ group_impl = calloc(1, sizeof(*group_impl));
+ if (group_impl == NULL) {
+ SPDK_ERRLOG("sock_group allocation failed\n");
+ errno = ENOMEM;
+ return NULL;
+ }
+
+ return &group_impl->base;
+}
+
+static int
+vpp_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group,
+ struct spdk_sock *_sock)
+{
+ /* We expect the higher level to do this for us */
+ return 0;
+}
+
+static int
+vpp_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group,
+ struct spdk_sock *_sock)
+{
+ /* We expect the higher level to do this for us */
+ return 0;
+}
+
+static bool
+vpp_session_read_ready(struct spdk_vpp_session *session)
+{
+ svm_fifo_t *rx_fifo = NULL;
+ uint32_t ready = 0;
+
+ if (session->app_session.session_state == VPP_SESSION_STATE_DISCONNECT) {
+ /* The session is disconnecting; report it as read-ready so the
+ * upper layer closes the connection when the next read fails.
+ */
+ return true;
+ }
+
+ if (session->app_session.session_state == VPP_SESSION_STATE_READY) {
+ rx_fifo = session->app_session.rx_fifo;
+ ready = svm_fifo_max_dequeue(rx_fifo);
+ }
+
+ return ready > 0;
+}
+
+static int
+vpp_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events,
+ struct spdk_sock **socks)
+{
+ int num_events, rc;
+ struct spdk_sock *sock, *tmp;
+ struct spdk_vpp_session *session;
+ struct spdk_vpp_sock_group_impl *group;
+
+ assert(_group != NULL);
+ assert(socks != NULL);
+ assert(g_svm.vpp_initialized);
+
+ group = __vpp_group_impl(_group);
+ num_events = 0;
+
+ /* This must be a TAILQ_FOREACH_SAFE because while flushing,
+ * a completion callback could remove the sock from the
+ * group. */
+ TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) {
+ rc = _sock_flush(sock);
+ if (rc) {
+ spdk_sock_abort_requests(sock);
+ }
+ }
+
+ sock = group->last_sock;
+ if (sock == NULL) {
+ sock = TAILQ_FIRST(&group->base.socks);
+ }
+
+ while (sock != NULL) {
+ session = __vpp_session(sock);
+ if (vpp_session_read_ready(session)) {
+ socks[num_events] = sock;
+ num_events++;
+ if (num_events >= max_events) {
+ sock = TAILQ_NEXT(sock, link);
+ break;
+ }
+ }
+ sock = TAILQ_NEXT(sock, link);
+ }
+ group->last_sock = sock;
+
+ return num_events;
+}
+
+static int
+vpp_sock_group_impl_close(struct spdk_sock_group_impl *_group)
+{
+ free(_group);
+ return 0;
+}
+
+/******************************************************************************
+ * Initialize and attach to the VPP
+ */
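+/* Attach this process to VPP as a session-layer application. The options
+ * array sizes the shared-memory plumbing: 16 MB rx/tx fifos, a 512 MB
+ * initial segment with 256 MB growth increments, and a 256-entry event
+ * queue. */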
+static int
+vpp_app_attach(void)
+{
+ vl_api_application_attach_t *bmp;
+ u32 fifo_size = 16 << 20;
+
+ bmp = vl_msg_api_alloc(sizeof(*bmp));
+ if (bmp == NULL) {
+ return -ENOMEM;
+ }
+ memset(bmp, 0, sizeof(*bmp));
+
+ bmp->_vl_msg_id = ntohs(VL_API_APPLICATION_ATTACH);
+ bmp->client_index = g_svm.my_client_index;
+ bmp->context = ntohl(0xfeedface);
+
+ bmp->options[APP_OPTIONS_FLAGS] = APP_OPTIONS_FLAGS_ACCEPT_REDIRECT;
+ bmp->options[APP_OPTIONS_FLAGS] |= APP_OPTIONS_FLAGS_ADD_SEGMENT;
+
+ bmp->options[APP_OPTIONS_PREALLOC_FIFO_PAIRS] = 16;
+ bmp->options[APP_OPTIONS_RX_FIFO_SIZE] = fifo_size;
+ bmp->options[APP_OPTIONS_TX_FIFO_SIZE] = fifo_size;
+ bmp->options[APP_OPTIONS_ADD_SEGMENT_SIZE] = 256 << 20;
+ bmp->options[APP_OPTIONS_SEGMENT_SIZE] = 512 << 20;
+ bmp->options[APP_OPTIONS_EVT_QUEUE_SIZE] = 256;
+
+ vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&bmp);
+
+ return 0;
+}
+
+static void
+vl_api_session_enable_disable_reply_t_handler(vl_api_session_enable_disable_reply_t *mp)
+{
+ if (mp->retval) {
+ SPDK_ERRLOG("Session enable failed (%d).\n", ntohl(mp->retval));
+ } else {
+ SPDK_NOTICELOG("Session layer enabled\n");
+ g_svm.vpp_state = VPP_STATE_ENABLED;
+ vpp_app_attach();
+ }
+}
+
+static int
+vpp_session_enable(u8 is_enable)
+{
+ vl_api_session_enable_disable_t *bmp;
+
+ bmp = vl_msg_api_alloc(sizeof(*bmp));
+ if (bmp == NULL) {
+ return -ENOMEM;
+ }
+ memset(bmp, 0, sizeof(*bmp));
+
+ bmp->_vl_msg_id = ntohs(VL_API_SESSION_ENABLE_DISABLE);
+ bmp->client_index = g_svm.my_client_index;
+ bmp->context = htonl(0xfeedface);
+ bmp->is_enable = is_enable;
+ vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&bmp);
+
+ return 0;
+}
+
+static void
+vpp_application_attached(void *arg)
+{
+ SPDK_NOTICELOG("VPP net framework initialized.\n");
+ g_svm.vpp_state = VPP_STATE_ATTACHED;
+ g_svm.vpp_initialized = true;
+ g_svm.app_queue_poller = SPDK_POLLER_REGISTER(app_queue_poller, NULL, 100);
+ spdk_net_framework_init_next(0);
+}
+
+static int
+ssvm_segment_attach(char *name, ssvm_segment_type_t type, int fd)
+{
+ svm_fifo_segment_create_args_t a;
+ int rv;
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "Attaching segment %s\n", name);
+
+ clib_memset(&a, 0, sizeof(a));
+ a.segment_name = (char *) name;
+ a.segment_type = type;
+
+ assert(type == SSVM_SEGMENT_MEMFD);
+ a.memfd_fd = fd;
+
+ if ((rv = svm_fifo_segment_attach(&g_svm.segment_main, &a))) {
+ SPDK_ERRLOG("Segment '%s' attach failed (%d).\n", name, rv);
+ return rv;
+ }
+
+ vec_reset_length(a.new_segment_indices);
+ return 0;
+}
+
+static void
+vl_api_application_attach_reply_t_handler(vl_api_application_attach_reply_t *mp)
+{
+ u32 n_fds = 0;
+
+ if (mp->retval) {
+ SPDK_ERRLOG("Application attach to VPP failed (%d)\n",
+ ntohl(mp->retval));
+ goto err;
+ }
+
+ if (mp->segment_name_length == 0) {
+ SPDK_ERRLOG("segment_name_length zero\n");
+ goto err;
+ }
+
+ assert(mp->app_event_queue_address);
+ g_svm.app_event_queue = uword_to_pointer(mp->app_event_queue_address, svm_msg_q_t *);
+
+ if (mp->n_fds) {
+ int fds[mp->n_fds];
+
+ vl_socket_client_recv_fd_msg(fds, mp->n_fds, 5);
+
+ if (mp->fd_flags & SESSION_FD_F_VPP_MQ_SEGMENT) {
+ if (ssvm_segment_attach(0, SSVM_SEGMENT_MEMFD, fds[n_fds++])) {
+ goto err;
+ }
+ }
+
+ if (mp->fd_flags & SESSION_FD_F_MEMFD_SEGMENT) {
+ if (ssvm_segment_attach((char *) mp->segment_name, SSVM_SEGMENT_MEMFD, fds[n_fds++])) {
+ goto err;
+ }
+ }
+
+ if (mp->fd_flags & SESSION_FD_F_MQ_EVENTFD) {
+ svm_msg_q_set_consumer_eventfd(g_svm.app_event_queue, fds[n_fds++]);
+ }
+ }
+
+ spdk_thread_send_msg(g_svm.init_thread, vpp_application_attached, NULL);
+ return;
+err:
+ g_svm.vpp_state = VPP_STATE_FAILED;
+ return;
+}
+
+/* Detach */
+static void
+vpp_application_detached(void *arg)
+{
+ if (!g_svm.vpp_initialized) {
+ return;
+ }
+
+ spdk_poller_unregister(&g_svm.vpp_queue_poller);
+ spdk_poller_unregister(&g_svm.app_queue_poller);
+ spdk_poller_unregister(&g_svm.timeout_poller);
+
+ g_svm.vpp_initialized = false;
+ g_svm.vpp_state = VPP_STATE_START;
+ pthread_mutex_destroy(&g_svm.session_get_lock);
+ vl_socket_client_disconnect();
+
+ SPDK_NOTICELOG("Application detached\n");
+
+ spdk_net_framework_fini_next();
+}
+
+static int
+vpp_application_detached_timeout(void *arg)
+{
+ if (g_svm.vpp_initialized) {
+ /* We need to finish the detach on the initial thread */
+ spdk_thread_send_msg(g_svm.init_thread, vpp_application_detached, NULL);
+ }
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+vl_api_application_detach_reply_t_handler(vl_api_application_detach_reply_t *mp)
+{
+ if (mp->retval) {
+ SPDK_ERRLOG("Application detach from VPP failed (%d).\n", ntohl(mp->retval));
+ g_svm.vpp_state = VPP_STATE_FAILED;
+ }
+
+ /* We need to finish the detach on the initial thread */
+ spdk_thread_send_msg(g_svm.init_thread, vpp_application_detached, NULL);
+}
+
+static int
+vpp_app_detach(void)
+{
+ vl_api_application_detach_t *bmp;
+
+ bmp = vl_msg_api_alloc(sizeof(*bmp));
+ if (bmp == NULL) {
+ return -ENOMEM;
+ }
+ memset(bmp, 0, sizeof(*bmp));
+
+ bmp->_vl_msg_id = ntohs(VL_API_APPLICATION_DETACH);
+ bmp->client_index = g_svm.my_client_index;
+ bmp->context = ntohl(0xfeedface);
+ vl_msg_api_send_shmem(g_svm.vl_input_queue, (u8 *)&bmp);
+
+ g_svm.timeout_poller = SPDK_POLLER_REGISTER(vpp_application_detached_timeout,
+ NULL, 10000000);
+
+ return 0;
+}
+
+static void
+vl_api_map_another_segment_t_handler(vl_api_map_another_segment_t *mp)
+{
+ ssvm_segment_type_t seg_type = SSVM_SEGMENT_SHM;
+ int fd = -1;
+
+ if (mp->fd_flags) {
+ vl_socket_client_recv_fd_msg(&fd, 1, 5);
+ seg_type = SSVM_SEGMENT_MEMFD;
+ }
+
+ if (ssvm_segment_attach((char *) mp->segment_name,
+ seg_type, fd)) {
+ SPDK_ERRLOG("svm_fifo_segment_attach ('%s') failed\n",
+ mp->segment_name);
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_SOCK_VPP, "New segment ('%s') attached\n",
+ mp->segment_name);
+}
+
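+/* Register a reply handler for each VPP API message we care about. The
+ * '_' macro expands once per (ID, name) pair, wiring the message id to its
+ * handler plus the generated endian-swap and print helpers. */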
+static void
+vpp_net_framework_set_handlers(void)
+{
+ /* Set up VPP handlers */
+#define _(N,n) \
+ vl_msg_api_set_handlers(VL_API_##N, #n, \
+ vl_api_##n##_t_handler, \
+ vl_noop_handler, \
+ vl_api_##n##_t_endian, \
+ vl_api_##n##_t_print, \
+ sizeof(vl_api_##n##_t), 1);
+ _(SESSION_ENABLE_DISABLE_REPLY, session_enable_disable_reply) \
+ _(DISCONNECT_SESSION_REPLY, disconnect_session_reply) \
+ _(APPLICATION_ATTACH_REPLY, application_attach_reply) \
+ _(APPLICATION_DETACH_REPLY, application_detach_reply) \
+ _(MAP_ANOTHER_SEGMENT, map_another_segment)
+#undef _
+}
+
+static void
+vpp_net_framework_init(void)
+{
+ char *app_name;
+ api_main_t *am = &api_main;
+
+ clib_mem_init_thread_safe(0, SPDK_VPP_CLIB_MEM_SIZE);
+ svm_fifo_segment_main_init(&g_svm.segment_main, SPDK_VPP_SEGMENT_BASEVA,
+ SPDK_VPP_SEGMENT_TIMEOUT);
+
+ app_name = spdk_sprintf_alloc("SPDK_%d", getpid());
+ if (app_name == NULL) {
+ SPDK_ERRLOG("Cannot alloc memory for SPDK app name\n");
+ return;
+ }
+
+ vpp_net_framework_set_handlers();
+
+ if (vl_socket_client_connect((char *) API_SOCKET_FILE, app_name,
+ 0 /* default rx, tx buffer */)) {
+ SPDK_ERRLOG("Client \"%s\" failed to connect to the socket \"%s\".\n",
+ app_name, API_SOCKET_FILE);
+ goto err;
+ }
+
+ if (vl_socket_client_init_shm(0, 0 /* want_pthread */)) {
+ SPDK_ERRLOG("SHM API initialization failed.\n");
+ vl_socket_client_disconnect();
+ goto err;
+ }
+
+ g_svm.vl_input_queue = am->shmem_hdr->vl_input_queue;
+ g_svm.vl_output_queue = am->vl_input_queue;
+
+ g_svm.my_client_index = am->my_client_index;
+ pthread_mutex_init(&g_svm.session_get_lock, NULL);
+
+ free(app_name);
+
+ g_svm.init_thread = spdk_get_thread();
+ SPDK_NOTICELOG("Enable VPP session\n");
+
+ g_svm.vpp_queue_poller = SPDK_POLLER_REGISTER(vpp_queue_poller, NULL, 100);
+
+ vpp_session_enable(1);
+
+ return;
+
+err:
+ free(app_name);
+ spdk_net_framework_init_next(0);
+}
+
+/******************************************************************************
+ * Register components
+ */
+static struct spdk_net_impl g_vpp_net_impl = {
+ .name = "vpp",
+ .getaddr = vpp_sock_getaddr,
+ .connect = vpp_sock_connect,
+ .listen = vpp_sock_listen,
+ .accept = vpp_sock_accept,
+ .close = vpp_sock_close,
+ .recv = vpp_sock_recv,
+ .readv = vpp_sock_readv,
+ .writev = vpp_sock_writev,
+ .writev_async = vpp_sock_writev_async,
+ .set_recvlowat = vpp_sock_set_recvlowat,
+ .set_recvbuf = vpp_sock_set_recvbuf,
+ .set_sendbuf = vpp_sock_set_sendbuf,
+ .is_ipv6 = vpp_sock_is_ipv6,
+ .is_ipv4 = vpp_sock_is_ipv4,
+ .is_connected = vpp_sock_is_connected,
+ .get_placement_id = vpp_sock_get_placement_id,
+ .group_impl_create = vpp_sock_group_impl_create,
+ .group_impl_add_sock = vpp_sock_group_impl_add_sock,
+ .group_impl_remove_sock = vpp_sock_group_impl_remove_sock,
+ .group_impl_poll = vpp_sock_group_impl_poll,
+ .group_impl_close = vpp_sock_group_impl_close,
+};
+
+SPDK_NET_IMPL_REGISTER(vpp, &g_vpp_net_impl, DEFAULT_SOCK_PRIORITY + 2);
+
+static void
+vpp_net_framework_fini(void)
+{
+ if (g_svm.vpp_initialized) {
+ vpp_app_detach();
+ } else {
+ spdk_net_framework_fini_next();
+ }
+}
+
+static struct spdk_net_framework g_vpp_net_framework = {
+ .name = "vpp",
+ .init = vpp_net_framework_init,
+ .fini = vpp_net_framework_fini,
+};
+
+SPDK_NET_FRAMEWORK_REGISTER(vpp, &g_vpp_net_framework);
+
+SPDK_LOG_REGISTER_COMPONENT("sock_vpp", SPDK_SOCK_VPP)
+
+#endif /* __clang_analyzer__ */