Diffstat (limited to 'src/spdk/lib')
-rw-r--r--  src/spdk/lib/Makefile | 65
-rw-r--r--  src/spdk/lib/accel/Makefile | 46
-rw-r--r--  src/spdk/lib/accel/accel_engine.c | 1044
-rw-r--r--  src/spdk/lib/accel/spdk_accel.map | 33
-rw-r--r--  src/spdk/lib/bdev/Makefile | 50
-rw-r--r--  src/spdk/lib/bdev/bdev.c | 6763
-rw-r--r--  src/spdk/lib/bdev/bdev_internal.h | 50
-rw-r--r--  src/spdk/lib/bdev/bdev_rpc.c | 98
-rw-r--r--  src/spdk/lib/bdev/bdev_zone.c | 201
-rw-r--r--  src/spdk/lib/bdev/part.c | 524
-rw-r--r--  src/spdk/lib/bdev/scsi_nvme.c | 261
-rw-r--r--  src/spdk/lib/bdev/spdk_bdev.map | 154
-rw-r--r--  src/spdk/lib/bdev/vtune.c | 49
-rw-r--r--  src/spdk/lib/blob/Makefile | 45
-rw-r--r--  src/spdk/lib/blob/blob_bs_dev.c | 150
-rw-r--r--  src/spdk/lib/blob/blobstore.c | 7461
-rw-r--r--  src/spdk/lib/blob/blobstore.h | 702
-rw-r--r--  src/spdk/lib/blob/request.c | 521
-rw-r--r--  src/spdk/lib/blob/request.h | 217
-rw-r--r--  src/spdk/lib/blob/spdk_blob.map | 64
-rw-r--r--  src/spdk/lib/blob/zeroes.c | 122
-rw-r--r--  src/spdk/lib/blobfs/Makefile | 45
-rw-r--r--  src/spdk/lib/blobfs/blobfs.c | 2980
-rw-r--r--  src/spdk/lib/blobfs/spdk_blobfs.map | 45
-rw-r--r--  src/spdk/lib/blobfs/tree.c | 181
-rw-r--r--  src/spdk/lib/blobfs/tree.h | 77
-rw-r--r--  src/spdk/lib/conf/Makefile | 45
-rw-r--r--  src/spdk/lib/conf/conf.c | 704
-rw-r--r--  src/spdk/lib/conf/spdk_conf.map | 23
-rw-r--r--  src/spdk/lib/env_dpdk/Makefile | 47
-rw-r--r--  src/spdk/lib/env_dpdk/env.c | 451
-rw-r--r--  src/spdk/lib/env_dpdk/env.mk | 176
-rw-r--r--  src/spdk/lib/env_dpdk/env_internal.h | 98
-rw-r--r--  src/spdk/lib/env_dpdk/init.c | 604
-rw-r--r--  src/spdk/lib/env_dpdk/memory.c | 1442
-rw-r--r--  src/spdk/lib/env_dpdk/pci.c | 1063
-rw-r--r--  src/spdk/lib/env_dpdk/pci_idxd.c | 50
-rw-r--r--  src/spdk/lib/env_dpdk/pci_ioat.c | 98
-rw-r--r--  src/spdk/lib/env_dpdk/pci_virtio.c | 53
-rw-r--r--  src/spdk/lib/env_dpdk/pci_vmd.c | 50
-rw-r--r--  src/spdk/lib/env_dpdk/spdk_env_dpdk.map | 114
-rw-r--r--  src/spdk/lib/env_dpdk/threads.c | 108
-rw-r--r--  src/spdk/lib/env_ocf/.gitignore | 2
-rw-r--r--  src/spdk/lib/env_ocf/Makefile | 108
-rw-r--r--  src/spdk/lib/env_ocf/ocf_env.c | 176
-rw-r--r--  src/spdk/lib/env_ocf/ocf_env.h | 834
-rw-r--r--  src/spdk/lib/env_ocf/ocf_env_headers.h | 43
-rw-r--r--  src/spdk/lib/env_ocf/ocf_env_list.h | 185
-rw-r--r--  src/spdk/lib/event/Makefile | 45
-rw-r--r--  src/spdk/lib/event/app.c | 1177
-rw-r--r--  src/spdk/lib/event/json_config.c | 630
-rw-r--r--  src/spdk/lib/event/reactor.c | 664
-rw-r--r--  src/spdk/lib/event/rpc.c | 87
-rw-r--r--  src/spdk/lib/event/spdk_event.map | 46
-rw-r--r--  src/spdk/lib/event/subsystem.c | 288
-rw-r--r--  src/spdk/lib/ftl/Makefile | 47
-rw-r--r--  src/spdk/lib/ftl/ftl_addr.h | 76
-rw-r--r--  src/spdk/lib/ftl/ftl_band.c | 1097
-rw-r--r--  src/spdk/lib/ftl/ftl_band.h | 287
-rw-r--r--  src/spdk/lib/ftl/ftl_core.c | 2460
-rw-r--r--  src/spdk/lib/ftl/ftl_core.h | 552
-rw-r--r--  src/spdk/lib/ftl/ftl_debug.c | 169
-rw-r--r--  src/spdk/lib/ftl/ftl_debug.h | 73
-rw-r--r--  src/spdk/lib/ftl/ftl_init.c | 1688
-rw-r--r--  src/spdk/lib/ftl/ftl_io.c | 563
-rw-r--r--  src/spdk/lib/ftl/ftl_io.h | 351
-rw-r--r--  src/spdk/lib/ftl/ftl_reloc.c | 860
-rw-r--r--  src/spdk/lib/ftl/ftl_reloc.h | 53
-rw-r--r--  src/spdk/lib/ftl/ftl_restore.c | 1350
-rw-r--r--  src/spdk/lib/ftl/ftl_trace.c | 361
-rw-r--r--  src/spdk/lib/ftl/ftl_trace.h | 84
-rw-r--r--  src/spdk/lib/ftl/spdk_ftl.map | 14
-rw-r--r--  src/spdk/lib/idxd/Makefile | 45
-rw-r--r--  src/spdk/lib/idxd/idxd.c | 1292
-rw-r--r--  src/spdk/lib/idxd/idxd.h | 188
-rw-r--r--  src/spdk/lib/idxd/idxd_spec.h | 503
-rw-r--r--  src/spdk/lib/idxd/spdk_idxd.map | 29
-rw-r--r--  src/spdk/lib/ioat/Makefile | 45
-rw-r--r--  src/spdk/lib/ioat/ioat.c | 775
-rw-r--r--  src/spdk/lib/ioat/ioat_internal.h | 100
-rw-r--r--  src/spdk/lib/ioat/spdk_ioat.map | 17
-rw-r--r--  src/spdk/lib/iscsi/Makefile | 50
-rw-r--r--  src/spdk/lib/iscsi/conn.c | 1714
-rw-r--r--  src/spdk/lib/iscsi/conn.h | 237
-rw-r--r--  src/spdk/lib/iscsi/init_grp.c | 787
-rw-r--r--  src/spdk/lib/iscsi/init_grp.h | 81
-rw-r--r--  src/spdk/lib/iscsi/iscsi.c | 4797
-rw-r--r--  src/spdk/lib/iscsi/iscsi.h | 465
-rw-r--r--  src/spdk/lib/iscsi/iscsi_rpc.c | 1639
-rw-r--r--  src/spdk/lib/iscsi/iscsi_subsystem.c | 1577
-rw-r--r--  src/spdk/lib/iscsi/md5.c | 75
-rw-r--r--  src/spdk/lib/iscsi/md5.h | 52
-rw-r--r--  src/spdk/lib/iscsi/param.c | 1216
-rw-r--r--  src/spdk/lib/iscsi/param.h | 94
-rw-r--r--  src/spdk/lib/iscsi/portal_grp.c | 655
-rw-r--r--  src/spdk/lib/iscsi/portal_grp.h | 90
-rw-r--r--  src/spdk/lib/iscsi/spdk_iscsi.map | 11
-rw-r--r--  src/spdk/lib/iscsi/task.c | 98
-rw-r--r--  src/spdk/lib/iscsi/task.h | 188
-rw-r--r--  src/spdk/lib/iscsi/tgt_node.c | 1607
-rw-r--r--  src/spdk/lib/iscsi/tgt_node.h | 147
-rw-r--r--  src/spdk/lib/json/Makefile | 45
-rw-r--r--  src/spdk/lib/json/json_parse.c | 668
-rw-r--r--  src/spdk/lib/json/json_util.c | 653
-rw-r--r--  src/spdk/lib/json/json_write.c | 687
-rw-r--r--  src/spdk/lib/json/spdk_json.map | 67
-rw-r--r--  src/spdk/lib/jsonrpc/Makefile | 46
-rw-r--r--  src/spdk/lib/jsonrpc/jsonrpc_client.c | 227
-rw-r--r--  src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c | 431
-rw-r--r--  src/spdk/lib/jsonrpc/jsonrpc_internal.h | 166
-rw-r--r--  src/spdk/lib/jsonrpc/jsonrpc_server.c | 361
-rw-r--r--  src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c | 441
-rw-r--r--  src/spdk/lib/jsonrpc/spdk_jsonrpc.map | 28
-rw-r--r--  src/spdk/lib/log/Makefile | 46
-rw-r--r--  src/spdk/lib/log/log.c | 203
-rw-r--r--  src/spdk/lib/log/log_flags.c | 188
-rw-r--r--  src/spdk/lib/log/spdk_log.map | 25
-rw-r--r--  src/spdk/lib/log_rpc/Makefile | 45
-rw-r--r--  src/spdk/lib/log_rpc/log_rpc.c | 340
-rw-r--r--  src/spdk/lib/log_rpc/spdk_log_rpc.map | 3
-rw-r--r--  src/spdk/lib/lvol/Makefile | 45
-rw-r--r--  src/spdk/lib/lvol/lvol.c | 1509
-rw-r--r--  src/spdk/lib/lvol/spdk_lvol.map | 28
-rw-r--r--  src/spdk/lib/nbd/Makefile | 45
-rw-r--r--  src/spdk/lib/nbd/nbd.c | 1093
-rw-r--r--  src/spdk/lib/nbd/nbd_internal.h | 52
-rw-r--r--  src/spdk/lib/nbd/nbd_rpc.c | 422
-rw-r--r--  src/spdk/lib/nbd/spdk_nbd.map | 13
-rw-r--r--  src/spdk/lib/net/Makefile | 46
-rw-r--r--  src/spdk/lib/net/interface.c | 551
-rw-r--r--  src/spdk/lib/net/net_internal.h | 79
-rw-r--r--  src/spdk/lib/net/net_rpc.c | 198
-rw-r--r--  src/spdk/lib/net/spdk_net.map | 9
-rw-r--r--  src/spdk/lib/notify/Makefile | 45
-rw-r--r--  src/spdk/lib/notify/notify.c | 150
-rw-r--r--  src/spdk/lib/notify/notify_rpc.c | 126
-rw-r--r--  src/spdk/lib/notify/spdk_notify.map | 10
-rw-r--r--  src/spdk/lib/nvme/Makefile | 73
-rw-r--r--  src/spdk/lib/nvme/nvme.c | 1423
-rw-r--r--  src/spdk/lib/nvme/nvme_ctrlr.c | 3639
-rw-r--r--  src/spdk/lib/nvme/nvme_ctrlr_cmd.c | 966
-rw-r--r--  src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c | 88
-rw-r--r--  src/spdk/lib/nvme/nvme_cuse.c | 1115
-rw-r--r--  src/spdk/lib/nvme/nvme_cuse.h | 42
-rw-r--r--  src/spdk/lib/nvme/nvme_fabric.c | 475
-rw-r--r--  src/spdk/lib/nvme/nvme_internal.h | 1233
-rw-r--r--  src/spdk/lib/nvme/nvme_io_msg.c | 216
-rw-r--r--  src/spdk/lib/nvme/nvme_io_msg.h | 90
-rw-r--r--  src/spdk/lib/nvme/nvme_ns.c | 401
-rw-r--r--  src/spdk/lib/nvme/nvme_ns_cmd.c | 1074
-rw-r--r--  src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c | 233
-rw-r--r--  src/spdk/lib/nvme/nvme_opal.c | 2566
-rw-r--r--  src/spdk/lib/nvme/nvme_opal_internal.h | 272
-rw-r--r--  src/spdk/lib/nvme/nvme_pcie.c | 2604
-rw-r--r--  src/spdk/lib/nvme/nvme_poll_group.c | 164
-rw-r--r--  src/spdk/lib/nvme/nvme_qpair.c | 1064
-rw-r--r--  src/spdk/lib/nvme/nvme_quirks.c | 155
-rw-r--r--  src/spdk/lib/nvme/nvme_rdma.c | 2852
-rw-r--r--  src/spdk/lib/nvme/nvme_tcp.c | 1973
-rw-r--r--  src/spdk/lib/nvme/nvme_transport.c | 591
-rw-r--r--  src/spdk/lib/nvme/nvme_uevent.c | 213
-rw-r--r--  src/spdk/lib/nvme/nvme_uevent.h | 61
-rw-r--r--  src/spdk/lib/nvme/spdk_nvme.map | 185
-rw-r--r--  src/spdk/lib/nvmf/Makefile | 75
-rw-r--r--  src/spdk/lib/nvmf/ctrlr.c | 3224
-rw-r--r--  src/spdk/lib/nvmf/ctrlr_bdev.c | 761
-rw-r--r--  src/spdk/lib/nvmf/ctrlr_discovery.c | 159
-rw-r--r--  src/spdk/lib/nvmf/fc.c | 3957
-rw-r--r--  src/spdk/lib/nvmf/fc_ls.c | 1678
-rw-r--r--  src/spdk/lib/nvmf/nvmf.c | 1457
-rw-r--r--  src/spdk/lib/nvmf/nvmf_fc.h | 999
-rw-r--r--  src/spdk/lib/nvmf/nvmf_internal.h | 371
-rw-r--r--  src/spdk/lib/nvmf/nvmf_rpc.c | 2012
-rw-r--r--  src/spdk/lib/nvmf/rdma.c | 4313
-rw-r--r--  src/spdk/lib/nvmf/spdk_nvmf.map | 118
-rw-r--r--  src/spdk/lib/nvmf/subsystem.c | 2515
-rw-r--r--  src/spdk/lib/nvmf/tcp.c | 2631
-rw-r--r--  src/spdk/lib/nvmf/transport.c | 572
-rw-r--r--  src/spdk/lib/nvmf/transport.h | 82
-rw-r--r--  src/spdk/lib/rdma/Makefile | 70
-rw-r--r--  src/spdk/lib/rdma/rdma_mlx5_dv.c | 316
-rw-r--r--  src/spdk/lib/rdma/rdma_verbs.c | 167
-rw-r--r--  src/spdk/lib/rdma/spdk_rdma.map | 14
-rw-r--r--  src/spdk/lib/reduce/Makefile | 45
-rw-r--r--  src/spdk/lib/reduce/reduce.c | 1625
-rw-r--r--  src/spdk/lib/reduce/spdk_reduce.map | 16
-rw-r--r--  src/spdk/lib/rocksdb/env_spdk.cc | 798
-rw-r--r--  src/spdk/lib/rocksdb/spdk.rocksdb.mk | 70
-rw-r--r--  src/spdk/lib/rpc/Makefile | 45
-rw-r--r--  src/spdk/lib/rpc/rpc.c | 392
-rw-r--r--  src/spdk/lib/rpc/spdk_rpc.map | 16
-rw-r--r--  src/spdk/lib/rte_vhost/Makefile | 50
-rw-r--r--  src/spdk/lib/rte_vhost/fd_man.c | 300
-rw-r--r--  src/spdk/lib/rte_vhost/fd_man.h | 69
-rw-r--r--  src/spdk/lib/rte_vhost/rte_vhost.h | 635
-rw-r--r--  src/spdk/lib/rte_vhost/socket.c | 841
-rw-r--r--  src/spdk/lib/rte_vhost/vhost.c | 565
-rw-r--r--  src/spdk/lib/rte_vhost/vhost.h | 330
-rw-r--r--  src/spdk/lib/rte_vhost/vhost_user.c | 1426
-rw-r--r--  src/spdk/lib/rte_vhost/vhost_user.h | 171
-rw-r--r--  src/spdk/lib/scsi/Makefile | 45
-rw-r--r--  src/spdk/lib/scsi/dev.c | 436
-rw-r--r--  src/spdk/lib/scsi/lun.c | 623
-rw-r--r--  src/spdk/lib/scsi/port.c | 134
-rw-r--r--  src/spdk/lib/scsi/scsi.c | 110
-rw-r--r--  src/spdk/lib/scsi/scsi_bdev.c | 2067
-rw-r--r--  src/spdk/lib/scsi/scsi_internal.h | 214
-rw-r--r--  src/spdk/lib/scsi/scsi_pr.c | 1067
-rw-r--r--  src/spdk/lib/scsi/scsi_rpc.c | 77
-rw-r--r--  src/spdk/lib/scsi/spdk_scsi.map | 49
-rw-r--r--  src/spdk/lib/scsi/task.c | 300
-rw-r--r--  src/spdk/lib/sock/Makefile | 46
-rw-r--r--  src/spdk/lib/sock/net_framework.c | 107
-rw-r--r--  src/spdk/lib/sock/sock.c | 809
-rw-r--r--  src/spdk/lib/sock/sock_rpc.c | 161
-rw-r--r--  src/spdk/lib/sock/spdk_sock.map | 47
-rw-r--r--  src/spdk/lib/thread/Makefile | 45
-rw-r--r--  src/spdk/lib/thread/spdk_thread.map | 55
-rw-r--r--  src/spdk/lib/thread/thread.c | 1636
-rw-r--r--  src/spdk/lib/trace/Makefile | 45
-rw-r--r--  src/spdk/lib/trace/spdk_trace.map | 29
-rw-r--r--  src/spdk/lib/trace/trace.c | 201
-rw-r--r--  src/spdk/lib/trace/trace_flags.c | 323
-rw-r--r--  src/spdk/lib/trace/trace_rpc.c | 170
-rw-r--r--  src/spdk/lib/ut_mock/Makefile | 45
-rw-r--r--  src/spdk/lib/ut_mock/mock.c | 71
-rw-r--r--  src/spdk/lib/util/Makefile | 47
-rw-r--r--  src/spdk/lib/util/base64.c | 262
-rw-r--r--  src/spdk/lib/util/base64_neon.c | 225
-rw-r--r--  src/spdk/lib/util/bit_array.c | 363
-rw-r--r--  src/spdk/lib/util/cpuset.c | 336
-rw-r--r--  src/spdk/lib/util/crc16.c | 668
-rw-r--r--  src/spdk/lib/util/crc32.c | 95
-rw-r--r--  src/spdk/lib/util/crc32_ieee.c | 49
-rw-r--r--  src/spdk/lib/util/crc32c.c | 133
-rw-r--r--  src/spdk/lib/util/dif.c | 1999
-rw-r--r--  src/spdk/lib/util/fd.c | 103
-rw-r--r--  src/spdk/lib/util/file.c | 71
-rw-r--r--  src/spdk/lib/util/iov.c | 111
-rw-r--r--  src/spdk/lib/util/math.c | 69
-rw-r--r--  src/spdk/lib/util/pipe.c | 246
-rw-r--r--  src/spdk/lib/util/spdk_util.map | 128
-rw-r--r--  src/spdk/lib/util/strerror_tls.c | 43
-rw-r--r--  src/spdk/lib/util/string.c | 476
-rw-r--r--  src/spdk/lib/util/util_internal.h | 77
-rw-r--r--  src/spdk/lib/util/uuid.c | 73
-rw-r--r--  src/spdk/lib/vhost/Makefile | 54
-rw-r--r--  src/spdk/lib/vhost/rte_vhost_compat.c | 402
-rw-r--r--  src/spdk/lib/vhost/spdk_vhost.map | 27
-rw-r--r--  src/spdk/lib/vhost/vhost.c | 1634
-rw-r--r--  src/spdk/lib/vhost/vhost_blk.c | 1354
-rw-r--r--  src/spdk/lib/vhost/vhost_internal.h | 496
-rw-r--r--  src/spdk/lib/vhost/vhost_nvme.c | 1500
-rw-r--r--  src/spdk/lib/vhost/vhost_rpc.c | 652
-rw-r--r--  src/spdk/lib/vhost/vhost_scsi.c | 1536
-rw-r--r--  src/spdk/lib/virtio/Makefile | 46
-rw-r--r--  src/spdk/lib/virtio/spdk_virtio.map | 33
-rw-r--r--  src/spdk/lib/virtio/vhost_user.c | 489
-rw-r--r--  src/spdk/lib/virtio/vhost_user.h | 69
-rw-r--r--  src/spdk/lib/virtio/virtio.c | 717
-rw-r--r--  src/spdk/lib/virtio/virtio_pci.c | 599
-rw-r--r--  src/spdk/lib/virtio/virtio_user.c | 628
-rw-r--r--  src/spdk/lib/vmd/Makefile | 45
-rw-r--r--  src/spdk/lib/vmd/led.c | 166
-rw-r--r--  src/spdk/lib/vmd/spdk_vmd.map | 13
-rw-r--r--  src/spdk/lib/vmd/vmd.c | 1376
-rw-r--r--  src/spdk/lib/vmd/vmd.h | 201
-rw-r--r--  src/spdk/lib/vmd/vmd_spec.h | 473
268 files changed, 152758 insertions, 0 deletions
diff --git a/src/spdk/lib/Makefile b/src/spdk/lib/Makefile
new file mode 100644
index 000000000..4c0c383eb
--- /dev/null
+++ b/src/spdk/lib/Makefile
@@ -0,0 +1,65 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+include $(SPDK_ROOT_DIR)/mk/spdk.lib_deps.mk
+
+DIRS-y += bdev blob blobfs conf accel event json jsonrpc \
+ log log_rpc lvol net rpc sock thread trace util nvme vmd nvmf scsi \
+ ioat ut_mock iscsi notify
+ifeq ($(OS),Linux)
+DIRS-y += nbd ftl
+endif
+
+DIRS-$(CONFIG_OCF) += env_ocf
+DIRS-$(CONFIG_IDXD) += idxd
+DIRS-$(CONFIG_VHOST) += vhost
+DIRS-$(CONFIG_VIRTIO) += virtio
+DIRS-$(CONFIG_REDUCE) += reduce
+DIRS-$(CONFIG_VHOST_INTERNAL_LIB) += rte_vhost
+DIRS-$(CONFIG_RDMA) += rdma
+
+# If CONFIG_ENV is pointing at a directory in lib, build it.
+# Out-of-tree env implementations must be built separately by the user.
+ENV_NAME := $(notdir $(CONFIG_ENV))
+ifeq ($(abspath $(CONFIG_ENV)),$(SPDK_ROOT_DIR)/lib/$(ENV_NAME))
+DIRS-y += $(ENV_NAME)
+endif
+
+.PHONY: all clean $(DIRS-y)
+
+all: $(DIRS-y)
+clean: $(DIRS-y)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk
diff --git a/src/spdk/lib/accel/Makefile b/src/spdk/lib/accel/Makefile
new file mode 100644
index 000000000..0d41104de
--- /dev/null
+++ b/src/spdk/lib/accel/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+SO_SUFFIX := $(SO_VER).$(SO_MINOR)
+
+LIBNAME = accel
+C_SRCS = accel_engine.c
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_accel.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/accel/accel_engine.c b/src/spdk/lib/accel/accel_engine.c
new file mode 100644
index 000000000..03a405439
--- /dev/null
+++ b/src/spdk/lib/accel/accel_engine.c
@@ -0,0 +1,1044 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/accel_engine.h"
+
+#include "spdk/env.h"
+#include "spdk/log.h"
+#include "spdk/thread.h"
+#include "spdk/json.h"
+#include "spdk/crc32.h"
+
+/* Accelerator Engine Framework: The following provides a top level
+ * generic API for the accelerator functions defined here. Modules,
+ * such as the one in /module/accel/ioat, supply the implementation,
+ * with the exception of the pure software implementation contained
+ * later in this file.
+ */
+
+#define ALIGN_4K 0x1000
+#define SPDK_ACCEL_NUM_TASKS 0x4000
+
+static struct spdk_mempool *g_accel_task_pool;
+
+/* Largest context size for all accel modules */
+static size_t g_max_accel_module_size = 0;
+
+static struct spdk_accel_engine *g_hw_accel_engine = NULL;
+static struct spdk_accel_engine *g_sw_accel_engine = NULL;
+static struct spdk_accel_module_if *g_accel_engine_module = NULL;
+static spdk_accel_fini_cb g_fini_cb_fn = NULL;
+static void *g_fini_cb_arg = NULL;
+
+/* Global list of registered accelerator modules */
+static TAILQ_HEAD(, spdk_accel_module_if) spdk_accel_module_list =
+ TAILQ_HEAD_INITIALIZER(spdk_accel_module_list);
+
+struct accel_io_channel {
+ struct spdk_accel_engine *engine;
+ struct spdk_io_channel *ch;
+};
+
+/* Forward declarations of software implementations used when an
+ * engine has not implemented the capability.
+ */
+static int sw_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg);
+static int sw_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg);
+static int sw_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg);
+static int sw_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg);
+static int sw_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src,
+ uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn,
+ void *cb_arg);
+
+/* Registration of hw modules (currently supports only 1 at a time) */
+void
+spdk_accel_hw_engine_register(struct spdk_accel_engine *accel_engine)
+{
+ if (g_hw_accel_engine == NULL) {
+ g_hw_accel_engine = accel_engine;
+ } else {
+ SPDK_NOTICELOG("Hardware offload engine already enabled\n");
+ }
+}
+
+/* Registration of sw modules (currently supports only 1) */
+static void
+accel_sw_register(struct spdk_accel_engine *accel_engine)
+{
+ assert(g_sw_accel_engine == NULL);
+ g_sw_accel_engine = accel_engine;
+}
+
+static void
+accel_sw_unregister(void)
+{
+ g_sw_accel_engine = NULL;
+}
+
+/* Common completion routine, called only by the accel framework */
+static void
+_accel_engine_done(void *ref, int status)
+{
+ struct spdk_accel_task *req = (struct spdk_accel_task *)ref;
+
+ req->cb(req->cb_arg, status);
+ spdk_mempool_put(g_accel_task_pool, req);
+}
+
+uint64_t
+spdk_accel_get_capabilities(struct spdk_io_channel *ch)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+
+ /* All engines are required to implement this API. */
+ return accel_ch->engine->get_capabilities();
+}
+
+/* Accel framework public API for copy function */
+int
+spdk_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ /* If the engine does not support it, fall back to the sw implementation. */
+ if (accel_ch->engine->copy) {
+ return accel_ch->engine->copy(accel_ch->ch, dst, src, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ } else {
+ return sw_accel_submit_copy(accel_ch->ch, dst, src, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ }
+}
+
+/* Accel framework public API for dual cast copy function */
+int
+spdk_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2, void *src,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) {
+ SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n");
+ return -EINVAL;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ /* If the engine does not support it, fall back to the sw implementation. */
+ if (accel_ch->engine->dualcast) {
+ return accel_ch->engine->dualcast(accel_ch->ch, dst1, dst2, src, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ } else {
+ return sw_accel_submit_dualcast(accel_ch->ch, dst1, dst2, src, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ }
+}
+
+/* Accel framework public API for batch_create function. All engines are
+ * required to implement this API.
+ */
+struct spdk_accel_batch *
+spdk_accel_batch_create(struct spdk_io_channel *ch)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+
+ return accel_ch->engine->batch_create(accel_ch->ch);
+}
+
+/* Accel framework public API for batch_submit function. All engines are
+ * required to implement this API.
+ */
+int
+spdk_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ return accel_ch->engine->batch_submit(accel_ch->ch, batch, _accel_engine_done,
+ accel_req->offload_ctx);
+}
+
+/* Accel framework public API for getting max batch. All engines are
+ * required to implement this API.
+ */
+uint32_t
+spdk_accel_batch_get_max(struct spdk_io_channel *ch)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+
+ return accel_ch->engine->batch_get_max();
+}
+
+/* Accel framework public API for cancelling a batch sequence when an app is
+ * unable to complete it.
+ */
+int
+spdk_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+
+ return accel_ch->engine->batch_cancel(accel_ch->ch, batch);
+}
+
+/* Accel framework public API for batch prep_copy function. All engines are
+ * required to implement this API.
+ */
+int
+spdk_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst,
+ void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ return accel_ch->engine->batch_prep_copy(accel_ch->ch, batch, dst, src, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+}
+
+/* Accel framework public API for batch prep_dualcast function. All engines are
+ * required to implement this API.
+ */
+int
+spdk_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ void *dst1, void *dst2, void *src, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) {
+ SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n");
+ return -EINVAL;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ return accel_ch->engine->batch_prep_dualcast(accel_ch->ch, batch, dst1, dst2, src,
+ nbytes, _accel_engine_done, accel_req->offload_ctx);
+}
+
+/* Accel framework public API for batch prep_compare function. All engines are
+ * required to implement this API.
+ */
+int
+spdk_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ void *src1, void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn,
+ void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ return accel_ch->engine->batch_prep_compare(accel_ch->ch, batch, src1, src2, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+}
+
+/* Accel framework public API for batch prep_fill function. All engines are
+ * required to implement this API.
+ */
+int
+spdk_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst,
+ uint8_t fill, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ return accel_ch->engine->batch_prep_fill(accel_ch->ch, batch, dst, fill, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+}
+
+/* Accel framework public API for batch prep_crc32c function. All engines are
+ * required to implement this API.
+ */
+int
+spdk_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ return accel_ch->engine->batch_prep_crc32c(accel_ch->ch, batch, dst, src, seed, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+}
+
+/* Accel framework public API for compare function */
+int
+spdk_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ /* If the engine does not support it, fall back to the sw implementation. */
+ if (accel_ch->engine->compare) {
+ return accel_ch->engine->compare(accel_ch->ch, src1, src2, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ } else {
+ return sw_accel_submit_compare(accel_ch->ch, src1, src2, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ }
+}
+
+/* Accel framework public API for fill function */
+int
+spdk_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill, uint64_t nbytes,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ /* If the engine does not support it, fall back to the sw implementation. */
+ if (accel_ch->engine->fill) {
+ return accel_ch->engine->fill(accel_ch->ch, dst, fill, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ } else {
+ return sw_accel_submit_fill(accel_ch->ch, dst, fill, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ }
+}
+
+/* Accel framework public API for CRC-32C function */
+int
+spdk_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src, uint32_t seed,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct accel_io_channel *accel_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req = spdk_mempool_get(g_accel_task_pool);
+
+ if (accel_req == NULL) {
+ SPDK_ERRLOG("Unable to get an accel task.\n");
+ return -ENOMEM;
+ }
+
+ accel_req->cb = cb_fn;
+ accel_req->cb_arg = cb_arg;
+
+ /* If the engine does not support it, fall back to the sw implementation. */
+ if (accel_ch->engine->crc32c) {
+ return accel_ch->engine->crc32c(accel_ch->ch, dst, src, seed, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ } else {
+ return sw_accel_submit_crc32c(accel_ch->ch, dst, src, seed, nbytes,
+ _accel_engine_done, accel_req->offload_ctx);
+ }
+}
+
+/* Helper function called when accel modules register with the framework. */
+void spdk_accel_module_list_add(struct spdk_accel_module_if *accel_module)
+{
+ TAILQ_INSERT_TAIL(&spdk_accel_module_list, accel_module, tailq);
+ if (accel_module->get_ctx_size && accel_module->get_ctx_size() > g_max_accel_module_size) {
+ g_max_accel_module_size = accel_module->get_ctx_size();
+ }
+}
+
+/* Framework level channel create callback. */
+static int
+accel_engine_create_cb(void *io_device, void *ctx_buf)
+{
+ struct accel_io_channel *accel_ch = ctx_buf;
+
+ if (g_hw_accel_engine != NULL) {
+ accel_ch->ch = g_hw_accel_engine->get_io_channel();
+ if (accel_ch->ch != NULL) {
+ accel_ch->engine = g_hw_accel_engine;
+ return 0;
+ }
+ }
+
+ /* No hw engine enabled, use sw. */
+ accel_ch->ch = g_sw_accel_engine->get_io_channel();
+ assert(accel_ch->ch != NULL);
+ accel_ch->engine = g_sw_accel_engine;
+ return 0;
+}
+
+/* Framework level channel destroy callback. */
+static void
+accel_engine_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct accel_io_channel *accel_ch = ctx_buf;
+
+ spdk_put_io_channel(accel_ch->ch);
+}
+
+struct spdk_io_channel *
+spdk_accel_engine_get_io_channel(void)
+{
+ return spdk_get_io_channel(&spdk_accel_module_list);
+}
+
+static void
+accel_engine_module_initialize(void)
+{
+ struct spdk_accel_module_if *accel_engine_module;
+ char task_pool_name[30];
+
+ TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) {
+ accel_engine_module->module_init();
+ }
+
+ snprintf(task_pool_name, sizeof(task_pool_name), "accel_task_pool");
+ g_accel_task_pool = spdk_mempool_create(task_pool_name,
+ SPDK_ACCEL_NUM_TASKS,
+ g_max_accel_module_size,
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ assert(g_accel_task_pool);
+
+}
+
+int
+spdk_accel_engine_initialize(void)
+{
+ SPDK_NOTICELOG("Accel engine initialized to use software engine.\n");
+ accel_engine_module_initialize();
+
+ /*
+ * We need a unique identifier for the accel engine framework, so use the
+ * spdk_accel_module_list address for this purpose.
+ */
+ spdk_io_device_register(&spdk_accel_module_list, accel_engine_create_cb, accel_engine_destroy_cb,
+ sizeof(struct accel_io_channel), "accel_module");
+
+ return 0;
+}
+
+static void
+accel_engine_module_finish_cb(void)
+{
+ spdk_accel_fini_cb cb_fn = g_fini_cb_fn;
+
+ cb_fn(g_fini_cb_arg);
+ g_fini_cb_fn = NULL;
+ g_fini_cb_arg = NULL;
+}
+
+void
+spdk_accel_write_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_accel_module_if *accel_engine_module;
+
+ /*
+ * The accel engine itself has no config; there may be some in
+ * the modules, though.
+ */
+ spdk_json_write_array_begin(w);
+ TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) {
+ if (accel_engine_module->write_config_json) {
+ accel_engine_module->write_config_json(w);
+ }
+ }
+ spdk_json_write_array_end(w);
+}
+
+void
+spdk_accel_engine_module_finish(void)
+{
+ if (!g_accel_engine_module) {
+ g_accel_engine_module = TAILQ_FIRST(&spdk_accel_module_list);
+ } else {
+ g_accel_engine_module = TAILQ_NEXT(g_accel_engine_module, tailq);
+ }
+
+ if (!g_accel_engine_module) {
+ accel_engine_module_finish_cb();
+ return;
+ }
+
+ if (g_accel_engine_module->module_fini) {
+ spdk_thread_send_msg(spdk_get_thread(), g_accel_engine_module->module_fini, NULL);
+ } else {
+ spdk_accel_engine_module_finish();
+ }
+}
+
+void
+spdk_accel_engine_finish(spdk_accel_fini_cb cb_fn, void *cb_arg)
+{
+ assert(cb_fn != NULL);
+
+ g_fini_cb_fn = cb_fn;
+ g_fini_cb_arg = cb_arg;
+
+ spdk_io_device_unregister(&spdk_accel_module_list, NULL);
+ spdk_accel_engine_module_finish();
+ spdk_mempool_free(g_accel_task_pool);
+}
+
+void
+spdk_accel_engine_config_text(FILE *fp)
+{
+ struct spdk_accel_module_if *accel_engine_module;
+
+ TAILQ_FOREACH(accel_engine_module, &spdk_accel_module_list, tailq) {
+ if (accel_engine_module->config_text) {
+ accel_engine_module->config_text(fp);
+ }
+ }
+}
+
+/*
+ * The SW Accelerator module is "built in" here (rest of file)
+ */
+
+#define SW_ACCEL_BATCH_SIZE 2048
+
+enum sw_accel_opcode {
+ SW_ACCEL_OPCODE_MEMMOVE = 0,
+ SW_ACCEL_OPCODE_MEMFILL = 1,
+ SW_ACCEL_OPCODE_COMPARE = 2,
+ SW_ACCEL_OPCODE_CRC32C = 3,
+ SW_ACCEL_OPCODE_DUALCAST = 4,
+};
+
+struct sw_accel_op {
+ struct sw_accel_io_channel *sw_ch;
+ void *cb_arg;
+ spdk_accel_completion_cb cb_fn;
+ void *src;
+ union {
+ void *dst;
+ void *src2;
+ };
+ void *dst2;
+ uint32_t seed;
+ uint64_t fill_pattern;
+ enum sw_accel_opcode op_code;
+ uint64_t nbytes;
+ TAILQ_ENTRY(sw_accel_op) link;
+};
+
+/* The sw accel engine only supports one outstanding batch at a time. */
+struct sw_accel_io_channel {
+ TAILQ_HEAD(, sw_accel_op) op_pool;
+ TAILQ_HEAD(, sw_accel_op) batch;
+};
+
+static uint64_t
+sw_accel_get_capabilities(void)
+{
+ return ACCEL_COPY | ACCEL_FILL | ACCEL_CRC32C | ACCEL_COMPARE |
+ ACCEL_DUALCAST | ACCEL_BATCH;
+}
+
+static uint32_t
+sw_accel_batch_get_max(void)
+{
+ return SW_ACCEL_BATCH_SIZE;
+}
+
+/* The sw engine plug-in does not have a public API; it is only callable
+ * from the accel fw and thus does not need its own struct definition
+ * of a batch. It simply casts the address of the single supported batch
+ * to a struct spdk_accel_batch pointer.
+ */
+static struct spdk_accel_batch *
+sw_accel_batch_start(struct spdk_io_channel *ch)
+{
+ struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch);
+
+ if (!TAILQ_EMPTY(&sw_ch->batch)) {
+ SPDK_ERRLOG("SW accel engine only supports one batch at a time.\n");
+ return NULL;
+ }
+
+ return (struct spdk_accel_batch *)&sw_ch->batch;
+}
+
+static struct sw_accel_op *
+_prep_op(struct sw_accel_io_channel *sw_ch, struct spdk_accel_batch *batch,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct sw_accel_op *op;
+
+ if ((struct spdk_accel_batch *)&sw_ch->batch != batch) {
+ SPDK_ERRLOG("Invalid batch\n");
+ return NULL;
+ }
+
+ if (!TAILQ_EMPTY(&sw_ch->op_pool)) {
+ op = TAILQ_FIRST(&sw_ch->op_pool);
+ TAILQ_REMOVE(&sw_ch->op_pool, op, link);
+ } else {
+ SPDK_ERRLOG("Ran out of operations for batch\n");
+ return NULL;
+ }
+
+ op->cb_arg = cb_arg;
+ op->cb_fn = cb_fn;
+ op->sw_ch = sw_ch;
+
+ return op;
+}
+
+static int
+sw_accel_batch_prep_copy(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ void *dst, void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct sw_accel_op *op;
+ struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch);
+
+ op = _prep_op(sw_ch, batch, cb_fn, cb_arg);
+ if (op == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ op->src = src;
+ op->dst = dst;
+ op->nbytes = nbytes;
+ op->op_code = SW_ACCEL_OPCODE_MEMMOVE;
+ TAILQ_INSERT_TAIL(&sw_ch->batch, op, link);
+
+ return 0;
+}
+
+static int
+sw_accel_batch_prep_dualcast(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst1,
+ void *dst2,
+ void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct sw_accel_op *op;
+ struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch);
+
+ op = _prep_op(sw_ch, batch, cb_fn, cb_arg);
+ if (op == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ op->src = src;
+ op->dst = dst1;
+ op->dst2 = dst2;
+ op->nbytes = nbytes;
+ op->op_code = SW_ACCEL_OPCODE_DUALCAST;
+ TAILQ_INSERT_TAIL(&sw_ch->batch, op, link);
+
+ return 0;
+}
+
+static int
+sw_accel_batch_prep_compare(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *src1,
+ void *src2, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct sw_accel_op *op;
+ struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch);
+
+ op = _prep_op(sw_ch, batch, cb_fn, cb_arg);
+ if (op == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ op->src = src1;
+ op->src2 = src2;
+ op->nbytes = nbytes;
+ op->op_code = SW_ACCEL_OPCODE_COMPARE;
+ TAILQ_INSERT_TAIL(&sw_ch->batch, op, link);
+
+ return 0;
+}
+
+static int
+sw_accel_batch_prep_fill(struct spdk_io_channel *ch, struct spdk_accel_batch *batch, void *dst,
+ uint8_t fill,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct sw_accel_op *op;
+ struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch);
+
+ op = _prep_op(sw_ch, batch, cb_fn, cb_arg);
+ if (op == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ op->dst = dst;
+ op->fill_pattern = fill;
+ op->nbytes = nbytes;
+ op->op_code = SW_ACCEL_OPCODE_MEMFILL;
+ TAILQ_INSERT_TAIL(&sw_ch->batch, op, link);
+
+ return 0;
+}
+
+static int
+sw_accel_batch_prep_crc32c(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ uint32_t *dst,
+ void *src, uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct sw_accel_op *op;
+ struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch);
+
+ op = _prep_op(sw_ch, batch, cb_fn, cb_arg);
+ if (op == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ op->dst = (void *)dst;
+ op->src = src;
+ op->seed = seed;
+ op->nbytes = nbytes;
+ op->op_code = SW_ACCEL_OPCODE_CRC32C;
+ TAILQ_INSERT_TAIL(&sw_ch->batch, op, link);
+
+ return 0;
+}
+
+
+static int
+sw_accel_batch_cancel(struct spdk_io_channel *ch, struct spdk_accel_batch *batch)
+{
+ struct sw_accel_op *op;
+ struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch);
+
+ if ((struct spdk_accel_batch *)&sw_ch->batch != batch) {
+ SPDK_ERRLOG("Invalid batch\n");
+ return -EINVAL;
+ }
+
+ /* Cancel the batch items by moving them back to the op_pool. */
+ while ((op = TAILQ_FIRST(&sw_ch->batch))) {
+ TAILQ_REMOVE(&sw_ch->batch, op, link);
+ TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link);
+ }
+
+ return 0;
+}
+
+static int
+sw_accel_batch_submit(struct spdk_io_channel *ch, struct spdk_accel_batch *batch,
+ spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct sw_accel_op *op;
+ struct sw_accel_io_channel *sw_ch = spdk_io_channel_get_ctx(ch);
+ struct spdk_accel_task *accel_req;
+ int batch_status = 0, cmd_status = 0;
+
+ if ((struct spdk_accel_batch *)&sw_ch->batch != batch) {
+ SPDK_ERRLOG("Invalid batch\n");
+ return -EINVAL;
+ }
+
+ /* Complete the batch items. */
+ while ((op = TAILQ_FIRST(&sw_ch->batch))) {
+ TAILQ_REMOVE(&sw_ch->batch, op, link);
+ accel_req = (struct spdk_accel_task *)((uintptr_t)op->cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+
+ switch (op->op_code) {
+ case SW_ACCEL_OPCODE_MEMMOVE:
+ memcpy(op->dst, op->src, op->nbytes);
+ break;
+ case SW_ACCEL_OPCODE_DUALCAST:
+ memcpy(op->dst, op->src, op->nbytes);
+ memcpy(op->dst2, op->src, op->nbytes);
+ break;
+ case SW_ACCEL_OPCODE_COMPARE:
+ cmd_status = memcmp(op->src, op->src2, op->nbytes);
+ break;
+ case SW_ACCEL_OPCODE_MEMFILL:
+ memset(op->dst, op->fill_pattern, op->nbytes);
+ break;
+ case SW_ACCEL_OPCODE_CRC32C:
+ *(uint32_t *)op->dst = spdk_crc32c_update(op->src, op->nbytes, ~op->seed);
+ break;
+ default:
+ assert(false);
+ break;
+ }
+
+ batch_status |= cmd_status;
+ op->cb_fn(accel_req, cmd_status);
+ TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link);
+ }
+
+ /* Now complete the batch request itself. */
+ accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+ cb_fn(accel_req, batch_status);
+
+ return 0;
+}
+
+static int
+sw_accel_submit_copy(struct spdk_io_channel *ch, void *dst, void *src,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct spdk_accel_task *accel_req;
+
+ memcpy(dst, src, (size_t)nbytes);
+
+ accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+ cb_fn(accel_req, 0);
+ return 0;
+}
+
+static int
+sw_accel_submit_dualcast(struct spdk_io_channel *ch, void *dst1, void *dst2,
+ void *src, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct spdk_accel_task *accel_req;
+
+ memcpy(dst1, src, (size_t)nbytes);
+ memcpy(dst2, src, (size_t)nbytes);
+
+ accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+ cb_fn(accel_req, 0);
+ return 0;
+}
+
+static int
+sw_accel_submit_compare(struct spdk_io_channel *ch, void *src1, void *src2,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct spdk_accel_task *accel_req;
+ int result;
+
+ result = memcmp(src1, src2, (size_t)nbytes);
+
+ accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+ cb_fn(accel_req, result);
+
+ return 0;
+}
+
+static int
+sw_accel_submit_fill(struct spdk_io_channel *ch, void *dst, uint8_t fill,
+ uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct spdk_accel_task *accel_req;
+
+ memset(dst, fill, nbytes);
+ accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+ cb_fn(accel_req, 0);
+
+ return 0;
+}
+
+static int
+sw_accel_submit_crc32c(struct spdk_io_channel *ch, uint32_t *dst, void *src,
+ uint32_t seed, uint64_t nbytes, spdk_accel_completion_cb cb_fn, void *cb_arg)
+{
+ struct spdk_accel_task *accel_req;
+
+ *dst = spdk_crc32c_update(src, nbytes, ~seed);
+ accel_req = (struct spdk_accel_task *)((uintptr_t)cb_arg -
+ offsetof(struct spdk_accel_task, offload_ctx));
+ cb_fn(accel_req, 0);
+
+ return 0;
+}
+
+static struct spdk_io_channel *sw_accel_get_io_channel(void);
+
+static struct spdk_accel_engine sw_accel_engine = {
+ .get_capabilities = sw_accel_get_capabilities,
+ .copy = sw_accel_submit_copy,
+ .dualcast = sw_accel_submit_dualcast,
+ .batch_get_max = sw_accel_batch_get_max,
+ .batch_create = sw_accel_batch_start,
+ .batch_cancel = sw_accel_batch_cancel,
+ .batch_prep_copy = sw_accel_batch_prep_copy,
+ .batch_prep_dualcast = sw_accel_batch_prep_dualcast,
+ .batch_prep_compare = sw_accel_batch_prep_compare,
+ .batch_prep_fill = sw_accel_batch_prep_fill,
+ .batch_prep_crc32c = sw_accel_batch_prep_crc32c,
+ .batch_submit = sw_accel_batch_submit,
+ .compare = sw_accel_submit_compare,
+ .fill = sw_accel_submit_fill,
+ .crc32c = sw_accel_submit_crc32c,
+ .get_io_channel = sw_accel_get_io_channel,
+};
+
+static int
+sw_accel_create_cb(void *io_device, void *ctx_buf)
+{
+ struct sw_accel_io_channel *sw_ch = ctx_buf;
+ struct sw_accel_op *op;
+ int i;
+
+ TAILQ_INIT(&sw_ch->batch);
+
+ TAILQ_INIT(&sw_ch->op_pool);
+ for (i = 0 ; i < SW_ACCEL_BATCH_SIZE ; i++) {
+ op = calloc(1, sizeof(struct sw_accel_op));
+ if (op == NULL) {
+ SPDK_ERRLOG("Failed to allocate operation for batch.\n");
+ while ((op = TAILQ_FIRST(&sw_ch->op_pool))) {
+ TAILQ_REMOVE(&sw_ch->op_pool, op, link);
+ free(op);
+ }
+ return -ENOMEM;
+ }
+ TAILQ_INSERT_TAIL(&sw_ch->op_pool, op, link);
+ }
+
+ return 0;
+}
+
+static void
+sw_accel_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct sw_accel_io_channel *sw_ch = ctx_buf;
+ struct sw_accel_op *op;
+
+ while ((op = TAILQ_FIRST(&sw_ch->op_pool))) {
+ TAILQ_REMOVE(&sw_ch->op_pool, op, link);
+ free(op);
+ }
+}
+
+static struct spdk_io_channel *sw_accel_get_io_channel(void)
+{
+ return spdk_get_io_channel(&sw_accel_engine);
+}
+
+static size_t
+sw_accel_engine_get_ctx_size(void)
+{
+ return sizeof(struct spdk_accel_task);
+}
+
+static int
+sw_accel_engine_init(void)
+{
+ accel_sw_register(&sw_accel_engine);
+ spdk_io_device_register(&sw_accel_engine, sw_accel_create_cb, sw_accel_destroy_cb,
+ sizeof(struct sw_accel_io_channel), "sw_accel_engine");
+
+ return 0;
+}
+
+static void
+sw_accel_engine_fini(void *ctxt)
+{
+ spdk_io_device_unregister(&sw_accel_engine, NULL);
+ accel_sw_unregister();
+
+ spdk_accel_engine_module_finish();
+}
+
+SPDK_ACCEL_MODULE_REGISTER(sw_accel_engine_init, sw_accel_engine_fini,
+ NULL, NULL, sw_accel_engine_get_ctx_size)
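For orientation, a minimal caller-side sketch of the single-shot API added above follows. It is not part of the diff: the function and variable names are invented, spdk/accel_engine.h is assumed to be the public header that declares these prototypes, and the callback convention (user callback invoked with its cb_arg and an int status) is inferred from _accel_engine_done() in the file above.

/* Hypothetical caller sketch -- not part of the diff. */
#include "spdk/stdinc.h"
#include "spdk/accel_engine.h"	/* assumed public header for the spdk_accel_* prototypes */
#include "spdk/log.h"

static void
copy_done(void *cb_arg, int status)
{
	/* _accel_engine_done() calls the user callback as cb(cb_arg, status). */
	if (status != 0) {
		SPDK_ERRLOG("accel copy failed: %d\n", status);
	}
	*(bool *)cb_arg = true;		/* let the submitter observe completion */
}

/* 'ch' is the per-thread channel from spdk_accel_engine_get_io_channel();
 * a real application obtains it once per thread and keeps it. */
static int
queue_copy(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, bool *done)
{
	*done = false;
	/* Routed to the registered hw engine if it implements copy();
	 * otherwise the framework falls back to sw_accel_submit_copy(). */
	return spdk_accel_submit_copy(ch, dst, src, nbytes, copy_done, done);
}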
diff --git a/src/spdk/lib/accel/spdk_accel.map b/src/spdk/lib/accel/spdk_accel.map
new file mode 100644
index 000000000..bfccf0a90
--- /dev/null
+++ b/src/spdk/lib/accel/spdk_accel.map
@@ -0,0 +1,33 @@
+{
+ global:
+
+ # public functions
+ spdk_accel_engine_initialize;
+ spdk_accel_engine_finish;
+ spdk_accel_engine_config_text;
+ spdk_accel_engine_module_finish;
+ spdk_accel_engine_get_io_channel;
+ spdk_accel_get_capabilities;
+ spdk_accel_batch_get_max;
+ spdk_accel_batch_create;
+ spdk_accel_batch_prep_copy;
+ spdk_accel_batch_prep_dualcast;
+ spdk_accel_batch_prep_compare;
+ spdk_accel_batch_prep_fill;
+ spdk_accel_batch_prep_crc32c;
+ spdk_accel_batch_submit;
+ spdk_accel_batch_cancel;
+ spdk_accel_submit_copy;
+ spdk_accel_submit_dualcast;
+ spdk_accel_submit_compare;
+ spdk_accel_submit_fill;
+ spdk_accel_submit_crc32c;
+ spdk_accel_write_config_json;
+
+ # functions needed by modules
+ spdk_accel_hw_engine_register;
+ spdk_accel_module_list_add;
+
+
+ local: *;
+};
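The batch entry points exported above chain together as in the following hypothetical sketch. It is not part of the diff: run_small_batch and its arguments are invented, the header location is assumed as in the earlier sketch, and error handling beyond batch cancellation is omitted.

/* Hypothetical batch usage sketch -- not part of the diff. */
#include "spdk/stdinc.h"
#include "spdk/accel_engine.h"	/* assumed public header, as in the earlier sketch */

static int
run_small_batch(struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes,
		spdk_accel_completion_cb op_done, spdk_accel_completion_cb batch_done, void *ctx)
{
	struct spdk_accel_batch *batch;
	int rc;

	/* Every engine must implement batch_create(); the built-in sw engine
	 * supports only one outstanding batch per channel. */
	batch = spdk_accel_batch_create(ch);
	if (batch == NULL) {
		return -EBUSY;
	}

	/* Each prepared command carries its own completion callback. */
	rc = spdk_accel_batch_prep_fill(ch, batch, dst, 0, nbytes, op_done, ctx);
	if (rc == 0) {
		rc = spdk_accel_batch_prep_copy(ch, batch, dst, src, nbytes, op_done, ctx);
	}

	if (rc != 0) {
		/* Could not build the full sequence: return the prepared commands. */
		spdk_accel_batch_cancel(ch, batch);
		return rc;
	}

	/* batch_done is invoked for the batch as a whole once it has been processed. */
	return spdk_accel_batch_submit(ch, batch, batch_done, ctx);
}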
diff --git a/src/spdk/lib/bdev/Makefile b/src/spdk/lib/bdev/Makefile
new file mode 100644
index 000000000..ca0bf992a
--- /dev/null
+++ b/src/spdk/lib/bdev/Makefile
@@ -0,0 +1,50 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+ifeq ($(CONFIG_VTUNE),y)
+CFLAGS += -I$(CONFIG_VTUNE_DIR)/include -I$(CONFIG_VTUNE_DIR)/sdk/src/ittnotify
+endif
+
+C_SRCS = bdev.c bdev_rpc.c bdev_zone.c part.c scsi_nvme.c
+C_SRCS-$(CONFIG_VTUNE) += vtune.c
+LIBNAME = bdev
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_bdev.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
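The bdev.c diff that follows introduces per-timeslice QoS accounting (struct spdk_bdev_qos_limit): each 1 ms timeslice receives a quota derived from the per-second limit, and the byte counter is allowed to go negative when an I/O overshoots it, with the debt repaid out of later timeslices. The stand-alone toy below illustrates that accounting with invented numbers; the refill rule (per-second limit scaled to one timeslice) is an assumption based on the constants and comments in the diff, not a copy of the SPDK code.

/* Stand-alone illustration of the timeslice accounting described in bdev.c
 * below; invented helper, not SPDK code. */
#include <stdint.h>
#include <stdio.h>

#define TIMESLICE_USEC	1000		/* mirrors SPDK_BDEV_QOS_TIMESLICE_IN_USEC */
#define USEC_PER_SEC	1000000ULL

int
main(void)
{
	uint64_t limit_bytes_per_sec = 10 * 1024 * 1024;	/* 10 MiB/s cap */
	/* Assumed refill rule: the per-second limit scaled down to one timeslice. */
	int64_t per_timeslice = (int64_t)(limit_bytes_per_sec * TIMESLICE_USEC / USEC_PER_SEC);
	int64_t remaining = per_timeslice;
	int slices = 0;

	/* A 64 KiB write arrives while some quota is left: it is allowed to run,
	 * driving the counter negative ("allowed to run negative" in the diff). */
	remaining -= 64 * 1024;
	printf("after 64 KiB I/O: %lld bytes remaining\n", (long long)remaining);

	/* Subsequent timeslices refill the quota, but the debt is repaid first,
	 * so no further I/O is issued until the counter turns positive again. */
	while (remaining <= 0) {
		remaining += per_timeslice;
		slices++;
	}
	printf("quota positive again after %d more timeslice(s)\n", slices);
	return 0;
}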
diff --git a/src/spdk/lib/bdev/bdev.c b/src/spdk/lib/bdev/bdev.c
new file mode 100644
index 000000000..af8c05aaa
--- /dev/null
+++ b/src/spdk/lib/bdev/bdev.c
@@ -0,0 +1,6763 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/conf.h"
+
+#include "spdk/config.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/likely.h"
+#include "spdk/queue.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/scsi_spec.h"
+#include "spdk/notify.h"
+#include "spdk/util.h"
+#include "spdk/trace.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+#include "spdk/string.h"
+
+#include "bdev_internal.h"
+
+#ifdef SPDK_CONFIG_VTUNE
+#include "ittnotify.h"
+#include "ittnotify_types.h"
+int __itt_init_ittlib(const char *, __itt_group_id);
+#endif
+
+#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
+#define SPDK_BDEV_IO_CACHE_SIZE 256
+#define SPDK_BDEV_AUTO_EXAMINE true
+#define BUF_SMALL_POOL_SIZE 8191
+#define BUF_LARGE_POOL_SIZE 1023
+#define NOMEM_THRESHOLD_COUNT 8
+#define ZERO_BUFFER_SIZE 0x100000
+
+#define OWNER_BDEV 0x2
+
+#define OBJECT_BDEV_IO 0x2
+
+#define TRACE_GROUP_BDEV 0x3
+#define TRACE_BDEV_IO_START SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0)
+#define TRACE_BDEV_IO_DONE SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1)
+
+#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
+#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
+#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
+#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
+#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
+#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
+#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000
+
+#define SPDK_BDEV_POOL_ALIGNMENT 512
+
+static const char *qos_conf_type[] = {"Limit_IOPS",
+ "Limit_BPS", "Limit_Read_BPS", "Limit_Write_BPS"
+ };
+static const char *qos_rpc_type[] = {"rw_ios_per_sec",
+ "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
+ };
+
+TAILQ_HEAD(spdk_bdev_list, spdk_bdev);
+
+struct spdk_bdev_mgr {
+ struct spdk_mempool *bdev_io_pool;
+
+ struct spdk_mempool *buf_small_pool;
+ struct spdk_mempool *buf_large_pool;
+
+ void *zero_buffer;
+
+ TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;
+
+ struct spdk_bdev_list bdevs;
+
+ bool init_complete;
+ bool module_init_complete;
+
+ pthread_mutex_t mutex;
+
+#ifdef SPDK_CONFIG_VTUNE
+ __itt_domain *domain;
+#endif
+};
+
+static struct spdk_bdev_mgr g_bdev_mgr = {
+ .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
+ .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
+ .init_complete = false,
+ .module_init_complete = false,
+ .mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+typedef void (*lock_range_cb)(void *ctx, int status);
+
+struct lba_range {
+ uint64_t offset;
+ uint64_t length;
+ void *locked_ctx;
+ struct spdk_bdev_channel *owner_ch;
+ TAILQ_ENTRY(lba_range) tailq;
+};
+
+static struct spdk_bdev_opts g_bdev_opts = {
+ .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
+ .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
+ .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
+};
+
+static spdk_bdev_init_cb g_init_cb_fn = NULL;
+static void *g_init_cb_arg = NULL;
+
+static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
+static void *g_fini_cb_arg = NULL;
+static struct spdk_thread *g_fini_thread = NULL;
+
+struct spdk_bdev_qos_limit {
+ /** IOs or bytes allowed per second (i.e., 1s). */
+ uint64_t limit;
+
+ /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
+ * For remaining bytes, allowed to run negative if an I/O is submitted when
+ * some bytes are remaining, but the I/O is bigger than that amount. The
+ * excess will be deducted from the next timeslice.
+ */
+ int64_t remaining_this_timeslice;
+
+ /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
+ uint32_t min_per_timeslice;
+
+ /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
+ uint32_t max_per_timeslice;
+
+ /** Function to check whether to queue the IO. */
+ bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
+
+ /** Function to update for the submitted IO. */
+ void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
+};
+
+struct spdk_bdev_qos {
+ /** Rate limits, one entry per rate limit type. */
+ struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
+
+ /** The channel that all I/O are funneled through. */
+ struct spdk_bdev_channel *ch;
+
+ /** The thread on which the poller is running. */
+ struct spdk_thread *thread;
+
+ /** Queue of I/O waiting to be issued. */
+ bdev_io_tailq_t queued;
+
+ /** Size of a timeslice in tsc ticks. */
+ uint64_t timeslice_size;
+
+ /** Timestamp of start of last timeslice. */
+ uint64_t last_timeslice;
+
+ /** Poller that processes queued I/O commands each time slice. */
+ struct spdk_poller *poller;
+};
+
+struct spdk_bdev_mgmt_channel {
+ bdev_io_stailq_t need_buf_small;
+ bdev_io_stailq_t need_buf_large;
+
+ /*
+ * Each thread keeps a cache of bdev_io - this allows
+ * bdev threads which are *not* DPDK threads to still
+ * benefit from a per-thread bdev_io cache. Without
+ * this, non-DPDK threads fetching from the mempool
+ * incur a cmpxchg on get and put.
+ */
+ bdev_io_stailq_t per_thread_cache;
+ uint32_t per_thread_cache_count;
+ uint32_t bdev_io_cache_size;
+
+ TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
+ TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
+};
+
+/*
+ * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
+ * queue their I/O awaiting retry here, which makes it possible to retry sending
+ * I/O to one bdev after I/O from another bdev completes.
+ */
+struct spdk_bdev_shared_resource {
+ /* The bdev management channel */
+ struct spdk_bdev_mgmt_channel *mgmt_ch;
+
+ /*
+ * Count of I/O submitted to bdev module and waiting for completion.
+ * Incremented before submit_request() is called on an spdk_bdev_io.
+ */
+ uint64_t io_outstanding;
+
+ /*
+ * Queue of IO awaiting retry because of a previous NOMEM status returned
+ * on this channel.
+ */
+ bdev_io_tailq_t nomem_io;
+
+ /*
+ * Threshold which io_outstanding must drop to before retrying nomem_io.
+ */
+ uint64_t nomem_threshold;
+
+ /* I/O channel allocated by a bdev module */
+ struct spdk_io_channel *shared_ch;
+
+ /* Refcount of bdev channels using this resource */
+ uint32_t ref;
+
+ TAILQ_ENTRY(spdk_bdev_shared_resource) link;
+};
+
+#define BDEV_CH_RESET_IN_PROGRESS (1 << 0)
+#define BDEV_CH_QOS_ENABLED (1 << 1)
+
+struct spdk_bdev_channel {
+ struct spdk_bdev *bdev;
+
+ /* The channel for the underlying device */
+ struct spdk_io_channel *channel;
+
+ /* Per io_device per thread data */
+ struct spdk_bdev_shared_resource *shared_resource;
+
+ struct spdk_bdev_io_stat stat;
+
+ /*
+ * Count of I/O submitted to the underlying dev module through this channel
+ * and waiting for completion.
+ */
+ uint64_t io_outstanding;
+
+ /*
+ * List of all submitted I/Os, including those generated via splitting.
+ */
+ bdev_io_tailq_t io_submitted;
+
+ /*
+ * List of spdk_bdev_io that are currently queued because they write to a locked
+ * LBA range.
+ */
+ bdev_io_tailq_t io_locked;
+
+ uint32_t flags;
+
+ struct spdk_histogram_data *histogram;
+
+#ifdef SPDK_CONFIG_VTUNE
+ uint64_t start_tsc;
+ uint64_t interval_tsc;
+ __itt_string_handle *handle;
+ struct spdk_bdev_io_stat prev_stat;
+#endif
+
+ bdev_io_tailq_t queued_resets;
+
+ lba_range_tailq_t locked_ranges;
+};
+
+struct media_event_entry {
+ struct spdk_bdev_media_event event;
+ TAILQ_ENTRY(media_event_entry) tailq;
+};
+
+#define MEDIA_EVENT_POOL_SIZE 64
+
+struct spdk_bdev_desc {
+ struct spdk_bdev *bdev;
+ struct spdk_thread *thread;
+ struct {
+ bool open_with_ext;
+ union {
+ spdk_bdev_remove_cb_t remove_fn;
+ spdk_bdev_event_cb_t event_fn;
+ };
+ void *ctx;
+ } callback;
+ bool closed;
+ bool write;
+ pthread_mutex_t mutex;
+ uint32_t refs;
+ TAILQ_HEAD(, media_event_entry) pending_media_events;
+ TAILQ_HEAD(, media_event_entry) free_media_events;
+ struct media_event_entry *media_events_buffer;
+ TAILQ_ENTRY(spdk_bdev_desc) link;
+
+ uint64_t timeout_in_sec;
+ spdk_bdev_io_timeout_cb cb_fn;
+ void *cb_arg;
+ struct spdk_poller *io_timeout_poller;
+};
+
+struct spdk_bdev_iostat_ctx {
+ struct spdk_bdev_io_stat *stat;
+ spdk_bdev_get_device_stat_cb cb;
+ void *cb_arg;
+};
+
+struct set_qos_limit_ctx {
+ void (*cb_fn)(void *cb_arg, int status);
+ void *cb_arg;
+ struct spdk_bdev *bdev;
+};
+
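+/*
+ * A bdev is registered as an io_device using the address one byte past the
+ * spdk_bdev pointer, presumably so that the io_device handle can never collide
+ * with the bdev pointer itself being registered as an io_device elsewhere.
+ */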
+#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1)
+#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1))
+
+static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+static void bdev_write_zero_buffer_next(void *_bdev_io);
+
+static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i);
+static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status);
+
+static int
+bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
+ uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg);
+static int
+bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg);
+
+static int
+bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
+ uint64_t offset, uint64_t length,
+ lock_range_cb cb_fn, void *cb_arg);
+
+static int
+bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
+ uint64_t offset, uint64_t length,
+ lock_range_cb cb_fn, void *cb_arg);
+
+static inline void bdev_io_complete(void *ctx);
+
+static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
+static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort);
+
+void
+spdk_bdev_get_opts(struct spdk_bdev_opts *opts)
+{
+ *opts = g_bdev_opts;
+}
+
+int
+spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
+{
+ uint32_t min_pool_size;
+
+ /*
+ * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
+ * initialization. A second mgmt_ch will be created on the same thread when the application starts
+ * but before the deferred put_io_channel event is executed for the first mgmt_ch.
+ */
+ min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
+ if (opts->bdev_io_pool_size < min_pool_size) {
+ SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
+ " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
+ spdk_thread_get_count());
+ SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
+ return -1;
+ }
+
+ g_bdev_opts = *opts;
+ return 0;
+}
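+
+/*
+ * Illustrative usage sketch (the values below are hypothetical): callers
+ * typically read the current defaults, adjust them, and write them back before
+ * the bdev subsystem is initialized:
+ *
+ *   struct spdk_bdev_opts opts;
+ *
+ *   spdk_bdev_get_opts(&opts);
+ *   opts.bdev_io_pool_size = 65536;
+ *   if (spdk_bdev_set_opts(&opts) != 0) {
+ *       ... the pool is too small for the cache size and thread count ...
+ *   }
+ */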
+
+struct spdk_bdev_examine_item {
+ char *name;
+ TAILQ_ENTRY(spdk_bdev_examine_item) link;
+};
+
+TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);
+
+struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
+ g_bdev_examine_allowlist);
+
+static inline bool
+bdev_examine_allowlist_check(const char *name)
+{
+ struct spdk_bdev_examine_item *item;
+ TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
+ if (strcmp(name, item->name) == 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline bool
+bdev_in_examine_allowlist(struct spdk_bdev *bdev)
+{
+ struct spdk_bdev_alias *tmp;
+ if (bdev_examine_allowlist_check(bdev->name)) {
+ return true;
+ }
+ TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
+ if (bdev_examine_allowlist_check(tmp->alias)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline bool
+bdev_ok_to_examine(struct spdk_bdev *bdev)
+{
+ if (g_bdev_opts.bdev_auto_examine) {
+ return true;
+ } else {
+ return bdev_in_examine_allowlist(bdev);
+ }
+}
+
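+/*
+ * Offer a bdev to each registered module. examine_config() is synchronous and
+ * is expected to call spdk_bdev_module_examine_done() before returning
+ * (action_in_progress is used to detect callbacks that do not). If the bdev has
+ * been claimed, only the claiming module's examine_disk() is invoked;
+ * examine_disk() may complete asynchronously via spdk_bdev_module_examine_done().
+ */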
+static void
+bdev_examine(struct spdk_bdev *bdev)
+{
+ struct spdk_bdev_module *module;
+ uint32_t action;
+
+ TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (module->examine_config && bdev_ok_to_examine(bdev)) {
+ action = module->internal.action_in_progress;
+ module->internal.action_in_progress++;
+ module->examine_config(bdev);
+ if (action != module->internal.action_in_progress) {
+ SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n",
+ module->name);
+ }
+ }
+ }
+
+ if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) {
+ if (bdev->internal.claim_module->examine_disk) {
+ bdev->internal.claim_module->internal.action_in_progress++;
+ bdev->internal.claim_module->examine_disk(bdev);
+ }
+ return;
+ }
+
+ TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (module->examine_disk && bdev_ok_to_examine(bdev)) {
+ module->internal.action_in_progress++;
+ module->examine_disk(bdev);
+ }
+ }
+}
+
+struct spdk_bdev *
+spdk_bdev_first(void)
+{
+ struct spdk_bdev *bdev;
+
+ bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
+ if (bdev) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
+ }
+
+ return bdev;
+}
+
+struct spdk_bdev *
+spdk_bdev_next(struct spdk_bdev *prev)
+{
+ struct spdk_bdev *bdev;
+
+ bdev = TAILQ_NEXT(prev, internal.link);
+ if (bdev) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
+ }
+
+ return bdev;
+}
+
+static struct spdk_bdev *
+_bdev_next_leaf(struct spdk_bdev *bdev)
+{
+ while (bdev != NULL) {
+ if (bdev->internal.claim_module == NULL) {
+ return bdev;
+ } else {
+ bdev = TAILQ_NEXT(bdev, internal.link);
+ }
+ }
+
+ return bdev;
+}
+
+struct spdk_bdev *
+spdk_bdev_first_leaf(void)
+{
+ struct spdk_bdev *bdev;
+
+ bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));
+
+ if (bdev) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name);
+ }
+
+ return bdev;
+}
+
+struct spdk_bdev *
+spdk_bdev_next_leaf(struct spdk_bdev *prev)
+{
+ struct spdk_bdev *bdev;
+
+ bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));
+
+ if (bdev) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name);
+ }
+
+ return bdev;
+}
+
+struct spdk_bdev *
+spdk_bdev_get_by_name(const char *bdev_name)
+{
+ struct spdk_bdev_alias *tmp;
+ struct spdk_bdev *bdev = spdk_bdev_first();
+
+ while (bdev != NULL) {
+ if (strcmp(bdev_name, bdev->name) == 0) {
+ return bdev;
+ }
+
+ TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
+ if (strcmp(bdev_name, tmp->alias) == 0) {
+ return bdev;
+ }
+ }
+
+ bdev = spdk_bdev_next(bdev);
+ }
+
+ return NULL;
+}
+
+void
+spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
+{
+ struct iovec *iovs;
+
+ if (bdev_io->u.bdev.iovs == NULL) {
+ bdev_io->u.bdev.iovs = &bdev_io->iov;
+ bdev_io->u.bdev.iovcnt = 1;
+ }
+
+ iovs = bdev_io->u.bdev.iovs;
+
+ assert(iovs != NULL);
+ assert(bdev_io->u.bdev.iovcnt >= 1);
+
+ iovs[0].iov_base = buf;
+ iovs[0].iov_len = len;
+}
+
+void
+spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
+{
+ assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
+ bdev_io->u.bdev.md_buf = md_buf;
+}
+
+static bool
+_is_buf_allocated(const struct iovec *iovs)
+{
+ if (iovs == NULL) {
+ return false;
+ }
+
+ return iovs[0].iov_base != NULL;
+}
+
+static bool
+_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
+{
+ int i;
+ uintptr_t iov_base;
+
+ if (spdk_likely(alignment == 1)) {
+ return true;
+ }
+
+ for (i = 0; i < iovcnt; i++) {
+ iov_base = (uintptr_t)iovs[i].iov_base;
+ if ((iov_base & (alignment - 1)) != 0) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void
+_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
+{
+ int i;
+ size_t len;
+
+ for (i = 0; i < iovcnt; i++) {
+ len = spdk_min(iovs[i].iov_len, buf_len);
+ memcpy(buf, iovs[i].iov_base, len);
+ buf += len;
+ buf_len -= len;
+ }
+}
+
+static void
+_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
+{
+ int i;
+ size_t len;
+
+ for (i = 0; i < iovcnt; i++) {
+ len = spdk_min(iovs[i].iov_len, buf_len);
+ memcpy(iovs[i].iov_base, buf, len);
+ buf += len;
+ buf_len -= len;
+ }
+}
+
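+/*
+ * When the caller's data buffers do not satisfy the bdev's alignment
+ * requirement, the I/O is staged through an internal, properly aligned bounce
+ * buffer: writes are copied into the bounce buffer before submission, and reads
+ * are copied back to the original iovecs in _bdev_io_unset_bounce_buf() on
+ * completion.
+ */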
+static void
+_bdev_io_set_bounce_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
+{
+ /* save original iovec */
+ bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
+ bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
+ /* set bounce iov */
+ bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
+ bdev_io->u.bdev.iovcnt = 1;
+ /* set bounce buffer for this operation */
+ bdev_io->u.bdev.iovs[0].iov_base = buf;
+ bdev_io->u.bdev.iovs[0].iov_len = len;
+ /* if this is write path, copy data from original buffer to bounce buffer */
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+ _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
+ }
+}
+
+static void
+_bdev_io_set_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
+{
+ /* save original md_buf */
+ bdev_io->internal.orig_md_buf = bdev_io->u.bdev.md_buf;
+ /* set bounce md_buf */
+ bdev_io->u.bdev.md_buf = md_buf;
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+ memcpy(md_buf, bdev_io->internal.orig_md_buf, len);
+ }
+}
+
+static void
+bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, void *buf, bool status)
+{
+ struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
+
+ if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
+ bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
+ bdev_io->internal.get_aux_buf_cb = NULL;
+ } else {
+ assert(bdev_io->internal.get_buf_cb != NULL);
+ bdev_io->internal.buf = buf;
+ bdev_io->internal.get_buf_cb(ch, bdev_io, status);
+ bdev_io->internal.get_buf_cb = NULL;
+ }
+}
+
+static void
+_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ bool buf_allocated;
+ uint64_t md_len, alignment;
+ void *aligned_buf;
+
+ if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
+ bdev_io_get_buf_complete(bdev_io, buf, true);
+ return;
+ }
+
+ alignment = spdk_bdev_get_buf_align(bdev);
+ buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
+ aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));
+
+ if (buf_allocated) {
+ _bdev_io_set_bounce_buf(bdev_io, aligned_buf, len);
+ } else {
+ spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
+ }
+
+ if (spdk_bdev_is_md_separate(bdev)) {
+ aligned_buf = (char *)aligned_buf + len;
+ md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;
+
+ assert(((uintptr_t)aligned_buf & (alignment - 1)) == 0);
+
+ if (bdev_io->u.bdev.md_buf != NULL) {
+ _bdev_io_set_bounce_md_buf(bdev_io, aligned_buf, md_len);
+ } else {
+ spdk_bdev_io_set_md_buf(bdev_io, aligned_buf, md_len);
+ }
+ }
+ bdev_io_get_buf_complete(bdev_io, buf, true);
+}
+
+static void
+_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_mempool *pool;
+ struct spdk_bdev_io *tmp;
+ bdev_io_stailq_t *stailq;
+ struct spdk_bdev_mgmt_channel *ch;
+ uint64_t md_len, alignment;
+
+ md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
+ alignment = spdk_bdev_get_buf_align(bdev);
+ ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
+
+ if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
+ SPDK_BDEV_POOL_ALIGNMENT) {
+ pool = g_bdev_mgr.buf_small_pool;
+ stailq = &ch->need_buf_small;
+ } else {
+ pool = g_bdev_mgr.buf_large_pool;
+ stailq = &ch->need_buf_large;
+ }
+
+ if (STAILQ_EMPTY(stailq)) {
+ spdk_mempool_put(pool, buf);
+ } else {
+ tmp = STAILQ_FIRST(stailq);
+ STAILQ_REMOVE_HEAD(stailq, internal.buf_link);
+ _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len);
+ }
+}
+
+static void
+bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
+{
+ assert(bdev_io->internal.buf != NULL);
+ _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
+ bdev_io->internal.buf = NULL;
+}
+
+void
+spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
+{
+ uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+
+ assert(buf != NULL);
+ _bdev_io_put_buf(bdev_io, buf, len);
+}
+
+static void
+_bdev_io_unset_bounce_buf(struct spdk_bdev_io *bdev_io)
+{
+ if (spdk_likely(bdev_io->internal.orig_iovcnt == 0)) {
+ assert(bdev_io->internal.orig_md_buf == NULL);
+ return;
+ }
+
+ /* if this is read path, copy data from bounce buffer to original buffer */
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
+ bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
+ _copy_buf_to_iovs(bdev_io->internal.orig_iovs,
+ bdev_io->internal.orig_iovcnt,
+ bdev_io->internal.bounce_iov.iov_base,
+ bdev_io->internal.bounce_iov.iov_len);
+ }
+ /* set original buffer for this io */
+ bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
+ bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
+ /* disable bouncing buffer for this io */
+ bdev_io->internal.orig_iovcnt = 0;
+ bdev_io->internal.orig_iovs = NULL;
+
+ /* do the same for metadata buffer */
+ if (spdk_unlikely(bdev_io->internal.orig_md_buf != NULL)) {
+ assert(spdk_bdev_is_md_separate(bdev_io->bdev));
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
+ bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
+ memcpy(bdev_io->internal.orig_md_buf, bdev_io->u.bdev.md_buf,
+ bdev_io->u.bdev.num_blocks * spdk_bdev_get_md_size(bdev_io->bdev));
+ }
+
+ bdev_io->u.bdev.md_buf = bdev_io->internal.orig_md_buf;
+ bdev_io->internal.orig_md_buf = NULL;
+ }
+
+ /* We want to free the bounce buffer here since we know we're done with it (as opposed
+ * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
+ */
+ bdev_io_put_buf(bdev_io);
+}
+
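+/*
+ * Allocate a data buffer for the I/O. Requests that fit in a small-pool element
+ * (data + metadata + alignment padding) are served from buf_small_pool, larger
+ * ones from buf_large_pool. If the chosen pool is empty, the I/O is parked on
+ * the management channel's need_buf_small/need_buf_large queue and resumed in
+ * _bdev_io_put_buf() when another I/O returns a buffer.
+ */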
+static void
+bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_mempool *pool;
+ bdev_io_stailq_t *stailq;
+ struct spdk_bdev_mgmt_channel *mgmt_ch;
+ uint64_t alignment, md_len;
+ void *buf;
+
+ alignment = spdk_bdev_get_buf_align(bdev);
+ md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
+
+ if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
+ SPDK_BDEV_POOL_ALIGNMENT) {
+ SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n",
+ len + alignment);
+ bdev_io_get_buf_complete(bdev_io, NULL, false);
+ return;
+ }
+
+ mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
+
+ bdev_io->internal.buf_len = len;
+
+ if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
+ SPDK_BDEV_POOL_ALIGNMENT) {
+ pool = g_bdev_mgr.buf_small_pool;
+ stailq = &mgmt_ch->need_buf_small;
+ } else {
+ pool = g_bdev_mgr.buf_large_pool;
+ stailq = &mgmt_ch->need_buf_large;
+ }
+
+ buf = spdk_mempool_get(pool);
+ if (!buf) {
+ STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link);
+ } else {
+ _bdev_io_set_buf(bdev_io, buf, len);
+ }
+}
+
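+/*
+ * Illustrative usage sketch from a bdev module's read path (my_get_buf_cb is a
+ * hypothetical callback name):
+ *
+ *   static void
+ *   my_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+ *   {
+ *       if (!success) {
+ *           spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ *           return;
+ *       }
+ *       ... issue the read into bdev_io->u.bdev.iovs ...
+ *   }
+ *
+ *   spdk_bdev_io_get_buf(bdev_io, my_get_buf_cb,
+ *                        bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ */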
+void
+spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ uint64_t alignment;
+
+ assert(cb != NULL);
+ bdev_io->internal.get_buf_cb = cb;
+
+ alignment = spdk_bdev_get_buf_align(bdev);
+
+ if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
+ _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
+ /* Buffer already present and aligned */
+ cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
+ return;
+ }
+
+ bdev_io_get_buf(bdev_io, len);
+}
+
+void
+spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
+{
+ uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+
+ assert(cb != NULL);
+ assert(bdev_io->internal.get_aux_buf_cb == NULL);
+ bdev_io->internal.get_aux_buf_cb = cb;
+ bdev_io_get_buf(bdev_io, len);
+}
+
+static int
+bdev_module_get_max_ctx_size(void)
+{
+ struct spdk_bdev_module *bdev_module;
+ int max_bdev_module_size = 0;
+
+ TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
+ max_bdev_module_size = bdev_module->get_ctx_size();
+ }
+ }
+
+ return max_bdev_module_size;
+}
+
+void
+spdk_bdev_config_text(FILE *fp)
+{
+ struct spdk_bdev_module *bdev_module;
+
+ TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (bdev_module->config_text) {
+ bdev_module->config_text(fp);
+ }
+ }
+}
+
+static void
+bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ int i;
+ struct spdk_bdev_qos *qos = bdev->internal.qos;
+ uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
+
+ if (!qos) {
+ return;
+ }
+
+ spdk_bdev_get_qos_rate_limits(bdev, limits);
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "name", bdev->name);
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (limits[i] > 0) {
+ spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
+ }
+ }
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+void
+spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_bdev_module *bdev_module;
+ struct spdk_bdev *bdev;
+
+ assert(w != NULL);
+
+ spdk_json_write_array_begin(w);
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "bdev_set_options");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
+ spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
+ spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+
+ TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (bdev_module->config_json) {
+ bdev_module->config_json(w);
+ }
+ }
+
+ pthread_mutex_lock(&g_bdev_mgr.mutex);
+
+ TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
+ if (bdev->fn_table->write_config_json) {
+ bdev->fn_table->write_config_json(bdev, w);
+ }
+
+ bdev_qos_config_json(bdev, w);
+ }
+
+ pthread_mutex_unlock(&g_bdev_mgr.mutex);
+
+ spdk_json_write_array_end(w);
+}
+
+static int
+bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
+{
+ struct spdk_bdev_mgmt_channel *ch = ctx_buf;
+ struct spdk_bdev_io *bdev_io;
+ uint32_t i;
+
+ STAILQ_INIT(&ch->need_buf_small);
+ STAILQ_INIT(&ch->need_buf_large);
+
+ STAILQ_INIT(&ch->per_thread_cache);
+ ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;
+
+ /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
+ ch->per_thread_cache_count = 0;
+ for (i = 0; i < ch->bdev_io_cache_size; i++) {
+ bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
+ assert(bdev_io != NULL);
+ ch->per_thread_cache_count++;
+ STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
+ }
+
+ TAILQ_INIT(&ch->shared_resources);
+ TAILQ_INIT(&ch->io_wait_queue);
+
+ return 0;
+}
+
+static void
+bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
+{
+ struct spdk_bdev_mgmt_channel *ch = ctx_buf;
+ struct spdk_bdev_io *bdev_io;
+
+ if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
+ SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
+ }
+
+ if (!TAILQ_EMPTY(&ch->shared_resources)) {
+ SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
+ }
+
+ while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
+ bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
+ STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
+ ch->per_thread_cache_count--;
+ spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
+ }
+
+ assert(ch->per_thread_cache_count == 0);
+}
+
+static void
+bdev_init_complete(int rc)
+{
+ spdk_bdev_init_cb cb_fn = g_init_cb_fn;
+ void *cb_arg = g_init_cb_arg;
+ struct spdk_bdev_module *m;
+
+ g_bdev_mgr.init_complete = true;
+ g_init_cb_fn = NULL;
+ g_init_cb_arg = NULL;
+
+ /*
+ * For modules that need to know when subsystem init is complete,
+ * inform them now.
+ */
+ if (rc == 0) {
+ TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (m->init_complete) {
+ m->init_complete();
+ }
+ }
+ }
+
+ cb_fn(cb_arg, rc);
+}
+
+static void
+bdev_module_action_complete(void)
+{
+ struct spdk_bdev_module *m;
+
+ /*
+ * Don't finish bdev subsystem initialization if
+ * module pre-initialization is still in progress, or
+ * if the subsystem has already been initialized.
+ */
+ if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
+ return;
+ }
+
+ /*
+ * Check all bdev modules for inits/examinations in progress. If any
+ * exist, return immediately since we cannot finish bdev subsystem
+ * initialization until all are completed.
+ */
+ TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (m->internal.action_in_progress > 0) {
+ return;
+ }
+ }
+
+ /*
+ * Modules already finished initialization - now that all
+ * the bdev modules have finished their asynchronous I/O
+ * processing, the entire bdev layer can be marked as complete.
+ */
+ bdev_init_complete(0);
+}
+
+static void
+bdev_module_action_done(struct spdk_bdev_module *module)
+{
+ assert(module->internal.action_in_progress > 0);
+ module->internal.action_in_progress--;
+ bdev_module_action_complete();
+}
+
+void
+spdk_bdev_module_init_done(struct spdk_bdev_module *module)
+{
+ bdev_module_action_done(module);
+}
+
+void
+spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
+{
+ bdev_module_action_done(module);
+}
+
+/** The last initialized bdev module */
+static struct spdk_bdev_module *g_resume_bdev_module = NULL;
+
+static void
+bdev_init_failed(void *cb_arg)
+{
+ struct spdk_bdev_module *module = cb_arg;
+
+ module->internal.action_in_progress--;
+ bdev_init_complete(-1);
+}
+
+static int
+bdev_modules_init(void)
+{
+ struct spdk_bdev_module *module;
+ int rc = 0;
+
+ TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ g_resume_bdev_module = module;
+ if (module->async_init) {
+ module->internal.action_in_progress = 1;
+ }
+ rc = module->module_init();
+ if (rc != 0) {
+ /* Bump action_in_progress to prevent other modules from completing modules_init.
+ * Send a message to defer application shutdown until resources are cleaned up. */
+ module->internal.action_in_progress = 1;
+ spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
+ return rc;
+ }
+ }
+
+ g_resume_bdev_module = NULL;
+ return 0;
+}
+
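+/*
+ * Subsystem initialization. An illustrative legacy configuration snippet that
+ * the section parsing below would consume (values are examples only):
+ *
+ *   [Bdev]
+ *     BdevIoPoolSize 65536
+ *     BdevIoCacheSize 256
+ *
+ * Keys that yield a negative value from spdk_conf_section_get_intval() (e.g.,
+ * when absent) leave the compiled-in defaults untouched.
+ */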
+void
+spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
+{
+ struct spdk_conf_section *sp;
+ struct spdk_bdev_opts bdev_opts;
+ int32_t bdev_io_pool_size, bdev_io_cache_size;
+ int cache_size;
+ int rc = 0;
+ char mempool_name[32];
+
+ assert(cb_fn != NULL);
+
+ sp = spdk_conf_find_section(NULL, "Bdev");
+ if (sp != NULL) {
+ spdk_bdev_get_opts(&bdev_opts);
+
+ bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize");
+ if (bdev_io_pool_size >= 0) {
+ bdev_opts.bdev_io_pool_size = bdev_io_pool_size;
+ }
+
+ bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize");
+ if (bdev_io_cache_size >= 0) {
+ bdev_opts.bdev_io_cache_size = bdev_io_cache_size;
+ }
+
+ if (spdk_bdev_set_opts(&bdev_opts)) {
+ bdev_init_complete(-1);
+ return;
+ }
+
+ assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0);
+ }
+
+ g_init_cb_fn = cb_fn;
+ g_init_cb_arg = cb_arg;
+
+ spdk_notify_type_register("bdev_register");
+ spdk_notify_type_register("bdev_unregister");
+
+ snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());
+
+ g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
+ g_bdev_opts.bdev_io_pool_size,
+ sizeof(struct spdk_bdev_io) +
+ bdev_module_get_max_ctx_size(),
+ 0,
+ SPDK_ENV_SOCKET_ID_ANY);
+
+ if (g_bdev_mgr.bdev_io_pool == NULL) {
+ SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
+ bdev_init_complete(-1);
+ return;
+ }
+
+ /**
+ * Ensure no more than half of the total buffers end up in local caches, by
+ * using spdk_env_get_core_count() to determine how many local caches we need
+ * to account for.
+ */
+ cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count());
+ snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid());
+
+ g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name,
+ BUF_SMALL_POOL_SIZE,
+ SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) +
+ SPDK_BDEV_POOL_ALIGNMENT,
+ cache_size,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (!g_bdev_mgr.buf_small_pool) {
+ SPDK_ERRLOG("create rbuf small pool failed\n");
+ bdev_init_complete(-1);
+ return;
+ }
+
+ cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count());
+ snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid());
+
+ g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name,
+ BUF_LARGE_POOL_SIZE,
+ SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) +
+ SPDK_BDEV_POOL_ALIGNMENT,
+ cache_size,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (!g_bdev_mgr.buf_large_pool) {
+ SPDK_ERRLOG("create rbuf large pool failed\n");
+ bdev_init_complete(-1);
+ return;
+ }
+
+ g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
+ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!g_bdev_mgr.zero_buffer) {
+ SPDK_ERRLOG("create bdev zero buffer failed\n");
+ bdev_init_complete(-1);
+ return;
+ }
+
+#ifdef SPDK_CONFIG_VTUNE
+ g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
+#endif
+
+ spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
+ bdev_mgmt_channel_destroy,
+ sizeof(struct spdk_bdev_mgmt_channel),
+ "bdev_mgr");
+
+ rc = bdev_modules_init();
+ g_bdev_mgr.module_init_complete = true;
+ if (rc != 0) {
+ SPDK_ERRLOG("bdev modules init failed\n");
+ return;
+ }
+
+ bdev_module_action_complete();
+}
+
+static void
+bdev_mgr_unregister_cb(void *io_device)
+{
+ spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;
+
+ if (g_bdev_mgr.bdev_io_pool) {
+ if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
+ SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
+ spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
+ g_bdev_opts.bdev_io_pool_size);
+ }
+
+ spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
+ }
+
+ if (g_bdev_mgr.buf_small_pool) {
+ if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) {
+ SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n",
+ spdk_mempool_count(g_bdev_mgr.buf_small_pool),
+ BUF_SMALL_POOL_SIZE);
+ assert(false);
+ }
+
+ spdk_mempool_free(g_bdev_mgr.buf_small_pool);
+ }
+
+ if (g_bdev_mgr.buf_large_pool) {
+ if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) {
+ SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n",
+ spdk_mempool_count(g_bdev_mgr.buf_large_pool),
+ BUF_LARGE_POOL_SIZE);
+ assert(false);
+ }
+
+ spdk_mempool_free(g_bdev_mgr.buf_large_pool);
+ }
+
+ spdk_free(g_bdev_mgr.zero_buffer);
+
+ cb_fn(g_fini_cb_arg);
+ g_fini_cb_fn = NULL;
+ g_fini_cb_arg = NULL;
+ g_bdev_mgr.init_complete = false;
+ g_bdev_mgr.module_init_complete = false;
+ pthread_mutex_destroy(&g_bdev_mgr.mutex);
+}
+
+static void
+bdev_module_finish_iter(void *arg)
+{
+ struct spdk_bdev_module *bdev_module;
+
+ /* FIXME: Handling initialization failures is broken now,
+ * so we won't even try cleaning up after successfully
+ * initialized modules. If module_init_complete is false,
+ * just call bdev_mgr_unregister_cb directly.
+ */
+ if (!g_bdev_mgr.module_init_complete) {
+ bdev_mgr_unregister_cb(NULL);
+ return;
+ }
+
+ /* Start iterating from the last touched module */
+ if (!g_resume_bdev_module) {
+ bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
+ } else {
+ bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
+ internal.tailq);
+ }
+
+ while (bdev_module) {
+ if (bdev_module->async_fini) {
+ /* Save our place so we can resume later. We must
+ * save the variable here, before calling module_fini()
+ * below, because in some cases the module may immediately
+ * call spdk_bdev_module_finish_done() and re-enter
+ * this function to continue iterating. */
+ g_resume_bdev_module = bdev_module;
+ }
+
+ if (bdev_module->module_fini) {
+ bdev_module->module_fini();
+ }
+
+ if (bdev_module->async_fini) {
+ return;
+ }
+
+ bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
+ internal.tailq);
+ }
+
+ g_resume_bdev_module = NULL;
+ spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
+}
+
+void
+spdk_bdev_module_finish_done(void)
+{
+ if (spdk_get_thread() != g_fini_thread) {
+ spdk_thread_send_msg(g_fini_thread, bdev_module_finish_iter, NULL);
+ } else {
+ bdev_module_finish_iter(NULL);
+ }
+}
+
+static void
+bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
+{
+ struct spdk_bdev *bdev = cb_arg;
+
+ if (bdeverrno && bdev) {
+ SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
+ bdev->name);
+
+ /*
+ * Since the call to spdk_bdev_unregister() failed, we have no way to free this
+ * bdev; try to recover by manually removing this bdev from the list and
+ * continuing with the next bdev in the list.
+ */
+ TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
+ }
+
+ if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n");
+ /*
+ * Bdev module finish needs to be deferred as we might be in the middle of some context
+ * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
+ * after returning.
+ */
+ spdk_thread_send_msg(spdk_get_thread(), bdev_module_finish_iter, NULL);
+ return;
+ }
+
+ /*
+ * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
+ * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
+ * to detect clean shutdown as opposed to run-time hot removal of the underlying
+ * base bdevs.
+ *
+ * Also, walk the list in the reverse order.
+ */
+ for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
+ bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
+ if (bdev->internal.claim_module != NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Skipping claimed bdev '%s'(<-'%s').\n",
+ bdev->name, bdev->internal.claim_module->name);
+ continue;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name);
+ spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
+ return;
+ }
+
+ /*
+ * If any bdev fails to unclaim its underlying bdev properly, we may face a
+ * bdev list consisting only of claimed bdevs (if claims are managed
+ * correctly, this would mean there's a loop in the claims graph, which is
+ * clearly impossible). Warn and unregister the last bdev on the list in that case.
+ */
+ for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
+ bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
+ SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
+ spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
+ return;
+ }
+}
+
+void
+spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg)
+{
+ struct spdk_bdev_module *m;
+
+ assert(cb_fn != NULL);
+
+ g_fini_thread = spdk_get_thread();
+
+ g_fini_cb_fn = cb_fn;
+ g_fini_cb_arg = cb_arg;
+
+ TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (m->fini_start) {
+ m->fini_start();
+ }
+ }
+
+ bdev_finish_unregister_bdevs_iter(NULL, 0);
+}
+
+struct spdk_bdev_io *
+bdev_channel_get_io(struct spdk_bdev_channel *channel)
+{
+ struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch;
+ struct spdk_bdev_io *bdev_io;
+
+ if (ch->per_thread_cache_count > 0) {
+ bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
+ STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
+ ch->per_thread_cache_count--;
+ } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) {
+ /*
+ * Don't try to look for bdev_ios in the global pool if there are
+ * waiters on bdev_ios - we don't want this caller to jump the line.
+ */
+ bdev_io = NULL;
+ } else {
+ bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
+ }
+
+ return bdev_io;
+}
+
+void
+spdk_bdev_free_io(struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev_mgmt_channel *ch;
+
+ assert(bdev_io != NULL);
+ assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING);
+
+ ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
+
+ if (bdev_io->internal.buf != NULL) {
+ bdev_io_put_buf(bdev_io);
+ }
+
+ if (ch->per_thread_cache_count < ch->bdev_io_cache_size) {
+ ch->per_thread_cache_count++;
+ STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
+ while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) {
+ struct spdk_bdev_io_wait_entry *entry;
+
+ entry = TAILQ_FIRST(&ch->io_wait_queue);
+ TAILQ_REMOVE(&ch->io_wait_queue, entry, link);
+ entry->cb_fn(entry->cb_arg);
+ }
+ } else {
+ /* We should never have a full cache with entries on the io wait queue. */
+ assert(TAILQ_EMPTY(&ch->io_wait_queue));
+ spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
+ }
+}
+
+static bool
+bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
+{
+ assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
+
+ switch (limit) {
+ case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
+ return true;
+ case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
+ case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
+ case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
+ return false;
+ case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
+ default:
+ return false;
+ }
+}
+
+static bool
+bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_NVME_IO:
+ case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return true;
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ if (bdev_io->u.bdev.zcopy.start) {
+ return true;
+ } else {
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
+static bool
+bdev_is_read_io(struct spdk_bdev_io *bdev_io)
+{
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_NVME_IO:
+ case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
+ /* Bit 1 (0x2) set for read operation */
+ if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
+ return true;
+ } else {
+ return false;
+ }
+ case SPDK_BDEV_IO_TYPE_READ:
+ return true;
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ /* Populate to read from disk */
+ if (bdev_io->u.bdev.zcopy.populate) {
+ return true;
+ } else {
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
+static uint64_t
+bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_NVME_IO:
+ case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
+ return bdev_io->u.nvme_passthru.nbytes;
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ return bdev_io->u.bdev.num_blocks * bdev->blocklen;
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ /* Track the data in the start phase only */
+ if (bdev_io->u.bdev.zcopy.start) {
+ return bdev_io->u.bdev.num_blocks * bdev->blocklen;
+ } else {
+ return 0;
+ }
+ default:
+ return 0;
+ }
+}
+
+static bool
+bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
+{
+ if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+static bool
+bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
+{
+ if (bdev_is_read_io(io) == false) {
+ return false;
+ }
+
+ return bdev_qos_rw_queue_io(limit, io);
+}
+
+static bool
+bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
+{
+ if (bdev_is_read_io(io) == true) {
+ return false;
+ }
+
+ return bdev_qos_rw_queue_io(limit, io);
+}
+
+static void
+bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
+{
+ limit->remaining_this_timeslice--;
+}
+
+static void
+bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
+{
+ limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io);
+}
+
+static void
+bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
+{
+ if (bdev_is_read_io(io) == false) {
+ return;
+ }
+
+ return bdev_qos_rw_bps_update_quota(limit, io);
+}
+
+static void
+bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
+{
+ if (bdev_is_read_io(io) == true) {
+ return;
+ }
+
+ return bdev_qos_rw_bps_update_quota(limit, io);
+}
+
+static void
+bdev_qos_set_ops(struct spdk_bdev_qos *qos)
+{
+ int i;
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
+ qos->rate_limits[i].queue_io = NULL;
+ qos->rate_limits[i].update_quota = NULL;
+ continue;
+ }
+
+ switch (i) {
+ case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
+ qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
+ qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota;
+ break;
+ case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
+ qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io;
+ qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota;
+ break;
+ case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
+ qos->rate_limits[i].queue_io = bdev_qos_r_queue_io;
+ qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota;
+ break;
+ case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
+ qos->rate_limits[i].queue_io = bdev_qos_w_queue_io;
+ qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota;
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void
+_bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch,
+ struct spdk_bdev_io *bdev_io,
+ enum spdk_bdev_io_status status)
+{
+ struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
+
+ bdev_io->internal.in_submit_request = true;
+ bdev_ch->io_outstanding++;
+ shared_resource->io_outstanding++;
+ spdk_bdev_io_complete(bdev_io, status);
+ bdev_io->internal.in_submit_request = false;
+}
+
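+/*
+ * Hand an I/O to the bdev module. ABORTs first try to cancel the target I/O if
+ * it is still sitting on an internal queue (nomem_io or a buffer-wait queue).
+ * If earlier I/O are already queued on nomem_io, the new I/O is appended there
+ * instead of being submitted, so that ordering is preserved while the module
+ * recovers from ENOMEM.
+ */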
+static inline void
+bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_io_channel *ch = bdev_ch->channel;
+ struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
+
+ if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) {
+ struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch;
+ struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
+
+ if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) ||
+ bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) ||
+ bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) {
+ _bdev_io_complete_in_submit(bdev_ch, bdev_io,
+ SPDK_BDEV_IO_STATUS_SUCCESS);
+ return;
+ }
+ }
+
+ if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) {
+ bdev_ch->io_outstanding++;
+ shared_resource->io_outstanding++;
+ bdev_io->internal.in_submit_request = true;
+ bdev->fn_table->submit_request(ch, bdev_io);
+ bdev_io->internal.in_submit_request = false;
+ } else {
+ TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
+ }
+}
+
+static int
+bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos)
+{
+ struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL;
+ int i, submitted_ios = 0;
+
+ TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) {
+ if (bdev_qos_io_to_limit(bdev_io) == true) {
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (!qos->rate_limits[i].queue_io) {
+ continue;
+ }
+
+ if (qos->rate_limits[i].queue_io(&qos->rate_limits[i],
+ bdev_io) == true) {
+ return submitted_ios;
+ }
+ }
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (!qos->rate_limits[i].update_quota) {
+ continue;
+ }
+
+ qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io);
+ }
+ }
+
+ TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
+ bdev_io_do_submit(ch, bdev_io);
+ submitted_ios++;
+ }
+
+ return submitted_ios;
+}
+
+static void
+bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn)
+{
+ int rc;
+
+ bdev_io->internal.waitq_entry.bdev = bdev_io->bdev;
+ bdev_io->internal.waitq_entry.cb_fn = cb_fn;
+ bdev_io->internal.waitq_entry.cb_arg = bdev_io;
+ rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch),
+ &bdev_io->internal.waitq_entry);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc);
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
+ }
+}
+
+static bool
+bdev_io_type_can_split(uint8_t type)
+{
+ assert(type != SPDK_BDEV_IO_TYPE_INVALID);
+ assert(type < SPDK_BDEV_NUM_IO_TYPES);
+
+ /* Only split READ and WRITE I/O. Theoretically other types of I/O like
+ * UNMAP could be split, but these types of I/O are typically much larger
+ * in size (sometimes the size of the entire block device), and the bdev
+ * module can more efficiently split these types of I/O. Plus those types
+ * of I/O do not have a payload, which makes the splitting process simpler.
+ */
+ if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
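+/*
+ * An I/O is split only if it crosses an optimal_io_boundary stripe. For
+ * example, with optimal_io_boundary = 8 blocks, a 4-block I/O at offset 6
+ * touches stripes 0 and 1 and is split, while the same I/O at offset 8 stays
+ * within stripe 1 and is not.
+ */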
+static bool
+bdev_io_should_split(struct spdk_bdev_io *bdev_io)
+{
+ uint64_t start_stripe, end_stripe;
+ uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary;
+
+ if (io_boundary == 0) {
+ return false;
+ }
+
+ if (!bdev_io_type_can_split(bdev_io->type)) {
+ return false;
+ }
+
+ start_stripe = bdev_io->u.bdev.offset_blocks;
+ end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1;
+ /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
+ if (spdk_likely(spdk_u32_is_pow2(io_boundary))) {
+ start_stripe >>= spdk_u32log2(io_boundary);
+ end_stripe >>= spdk_u32log2(io_boundary);
+ } else {
+ start_stripe /= io_boundary;
+ end_stripe /= io_boundary;
+ }
+ return (start_stripe != end_stripe);
+}
+
+static uint32_t
+_to_next_boundary(uint64_t offset, uint32_t boundary)
+{
+ return (boundary - (offset % boundary));
+}
+
+static void
+bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+
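+/*
+ * Carve the parent I/O into child I/Os that each end on an optimal_io_boundary
+ * stripe boundary, packing at most BDEV_IO_NUM_CHILD_IOV iovec entries per
+ * child. Children are issued via bdev_readv/writev_blocks_with_md() and the
+ * number in flight is tracked in split_outstanding; bdev_io_split_done()
+ * either continues the split or completes the parent.
+ */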
+static void
+_bdev_io_split(void *_bdev_io)
+{
+ struct spdk_bdev_io *bdev_io = _bdev_io;
+ uint64_t current_offset, remaining;
+ uint32_t blocklen, to_next_boundary, to_next_boundary_bytes, to_last_block_bytes;
+ struct iovec *parent_iov, *iov;
+ uint64_t parent_iov_offset, iov_len;
+ uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt;
+ void *md_buf = NULL;
+ int rc;
+
+ remaining = bdev_io->u.bdev.split_remaining_num_blocks;
+ current_offset = bdev_io->u.bdev.split_current_offset_blocks;
+ blocklen = bdev_io->bdev->blocklen;
+ parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen;
+ parent_iovcnt = bdev_io->u.bdev.iovcnt;
+
+ for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) {
+ parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
+ if (parent_iov_offset < parent_iov->iov_len) {
+ break;
+ }
+ parent_iov_offset -= parent_iov->iov_len;
+ }
+
+ child_iovcnt = 0;
+ while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
+ to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary);
+ to_next_boundary = spdk_min(remaining, to_next_boundary);
+ to_next_boundary_bytes = to_next_boundary * blocklen;
+ iov = &bdev_io->child_iov[child_iovcnt];
+ iovcnt = 0;
+
+ if (bdev_io->u.bdev.md_buf) {
+ assert((parent_iov_offset % blocklen) > 0);
+ md_buf = (char *)bdev_io->u.bdev.md_buf + (parent_iov_offset / blocklen) *
+ spdk_bdev_get_md_size(bdev_io->bdev);
+ }
+
+ while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt &&
+ child_iovcnt < BDEV_IO_NUM_CHILD_IOV) {
+ parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos];
+ iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset);
+ to_next_boundary_bytes -= iov_len;
+
+ bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
+ bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
+
+ if (iov_len < parent_iov->iov_len - parent_iov_offset) {
+ parent_iov_offset += iov_len;
+ } else {
+ parent_iovpos++;
+ parent_iov_offset = 0;
+ }
+ child_iovcnt++;
+ iovcnt++;
+ }
+
+ if (to_next_boundary_bytes > 0) {
+ /* We had to stop this child I/O early because we ran out of
+ * child_iov space. Ensure the iovs are aligned to the block
+ * size and then adjust to_next_boundary before starting the
+ * child I/O.
+ */
+ assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV);
+ to_last_block_bytes = to_next_boundary_bytes % blocklen;
+ if (to_last_block_bytes != 0) {
+ uint32_t child_iovpos = child_iovcnt - 1;
+ /* don't decrease child_iovcnt so the loop will naturally end */
+
+ to_last_block_bytes = blocklen - to_last_block_bytes;
+ to_next_boundary_bytes += to_last_block_bytes;
+ while (to_last_block_bytes > 0 && iovcnt > 0) {
+ iov_len = spdk_min(to_last_block_bytes,
+ bdev_io->child_iov[child_iovpos].iov_len);
+ bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
+ if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
+ child_iovpos--;
+ if (--iovcnt == 0) {
+ return;
+ }
+ }
+ to_last_block_bytes -= iov_len;
+ }
+
+ assert(to_last_block_bytes == 0);
+ }
+ to_next_boundary -= to_next_boundary_bytes / blocklen;
+ }
+
+ bdev_io->u.bdev.split_outstanding++;
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+ rc = bdev_readv_blocks_with_md(bdev_io->internal.desc,
+ spdk_io_channel_from_ctx(bdev_io->internal.ch),
+ iov, iovcnt, md_buf, current_offset,
+ to_next_boundary,
+ bdev_io_split_done, bdev_io);
+ } else {
+ rc = bdev_writev_blocks_with_md(bdev_io->internal.desc,
+ spdk_io_channel_from_ctx(bdev_io->internal.ch),
+ iov, iovcnt, md_buf, current_offset,
+ to_next_boundary,
+ bdev_io_split_done, bdev_io);
+ }
+
+ if (rc == 0) {
+ current_offset += to_next_boundary;
+ remaining -= to_next_boundary;
+ bdev_io->u.bdev.split_current_offset_blocks = current_offset;
+ bdev_io->u.bdev.split_remaining_num_blocks = remaining;
+ } else {
+ bdev_io->u.bdev.split_outstanding--;
+ if (rc == -ENOMEM) {
+ if (bdev_io->u.bdev.split_outstanding == 0) {
+ /* No I/O is outstanding. Hence we should wait here. */
+ bdev_queue_io_wait_with_cb(bdev_io, _bdev_io_split);
+ }
+ } else {
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ if (bdev_io->u.bdev.split_outstanding == 0) {
+ spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0,
+ (uintptr_t)bdev_io, 0);
+ TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
+ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
+ }
+ }
+
+ return;
+ }
+ }
+}
+
+static void
+bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *parent_io = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (!success) {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ /* If any child I/O failed, stop further splitting process. */
+ parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
+ parent_io->u.bdev.split_remaining_num_blocks = 0;
+ }
+ parent_io->u.bdev.split_outstanding--;
+ if (parent_io->u.bdev.split_outstanding != 0) {
+ return;
+ }
+
+ /*
+ * Parent I/O finishes when all blocks are consumed.
+ */
+ if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
+ assert(parent_io->internal.cb != bdev_io_split_done);
+ spdk_trace_record_tsc(spdk_get_ticks(), TRACE_BDEV_IO_DONE, 0, 0,
+ (uintptr_t)parent_io, 0);
+ TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link);
+ parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
+ parent_io->internal.caller_ctx);
+ return;
+ }
+
+ /*
+ * Continue with the splitting process. This function will complete the parent I/O if the
+ * splitting is done.
+ */
+ _bdev_io_split(parent_io);
+}
+
+static void
+bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success);
+
+static void
+bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ assert(bdev_io_type_can_split(bdev_io->type));
+
+ bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks;
+ bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks;
+ bdev_io->u.bdev.split_outstanding = 0;
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ if (_is_buf_allocated(bdev_io->u.bdev.iovs)) {
+ _bdev_io_split(bdev_io);
+ } else {
+ assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
+ spdk_bdev_io_get_buf(bdev_io, bdev_io_split_get_buf_cb,
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
+ }
+}
+
+static void
+bdev_io_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ if (!success) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ return;
+ }
+
+ bdev_io_split(ch, bdev_io);
+}
+
+/* Explicitly mark this function inline: it is also used as a function pointer (via
+ * spdk_thread_send_msg()), and without the hint some compilers won't inline the
+ * direct calls.
+ */
+static inline void
+_bdev_io_submit(void *ctx)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
+ uint64_t tsc;
+
+ tsc = spdk_get_ticks();
+ bdev_io->internal.submit_tsc = tsc;
+ spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type);
+
+ if (spdk_likely(bdev_ch->flags == 0)) {
+ bdev_io_do_submit(bdev_ch, bdev_io);
+ return;
+ }
+
+ if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) {
+ _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
+ } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) {
+ if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) &&
+ bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) {
+ _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
+ } else {
+ TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link);
+ bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
+ }
+ } else {
+ SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags);
+ _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
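+/*
+ * Two ranges overlap iff each one starts before the other ends. For example,
+ * {offset 0, length 100} and {offset 100, length 10} do not overlap, while
+ * {offset 0, length 101} and {offset 100, length 10} do.
+ */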
+bool
+bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2);
+
+bool
+bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2)
+{
+ if (range1->length == 0 || range2->length == 0) {
+ return false;
+ }
+
+ if (range1->offset + range1->length <= range2->offset) {
+ return false;
+ }
+
+ if (range2->offset + range2->length <= range1->offset) {
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range)
+{
+ struct spdk_bdev_channel *ch = bdev_io->internal.ch;
+ struct lba_range r;
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_NVME_IO:
+ case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
+ /* Don't try to decode the NVMe command - just assume worst-case and that
+ * it overlaps a locked range.
+ */
+ return true;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ r.offset = bdev_io->u.bdev.offset_blocks;
+ r.length = bdev_io->u.bdev.num_blocks;
+ if (!bdev_lba_range_overlapped(range, &r)) {
+ /* This I/O doesn't overlap the specified LBA range. */
+ return false;
+ } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) {
+ /* This I/O overlaps, but the I/O is on the same channel that locked this
+ * range, and the caller_ctx is the same as the locked_ctx. This means
+ * that this I/O is associated with the lock, and is allowed to execute.
+ */
+ return false;
+ } else {
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+void
+bdev_io_submit(struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io);
+ struct spdk_bdev_channel *ch = bdev_io->internal.ch;
+
+ assert(thread != NULL);
+ assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
+
+ if (!TAILQ_EMPTY(&ch->locked_ranges)) {
+ struct lba_range *range;
+
+ TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
+ if (bdev_io_range_is_locked(bdev_io, range)) {
+ TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
+ return;
+ }
+ }
+ }
+
+ TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link);
+
+ if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bdev_io)) {
+ bdev_io->internal.submit_tsc = spdk_get_ticks();
+ spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0,
+ (uintptr_t)bdev_io, bdev_io->type);
+ bdev_io_split(NULL, bdev_io);
+ return;
+ }
+
+ if (ch->flags & BDEV_CH_QOS_ENABLED) {
+ if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) {
+ _bdev_io_submit(bdev_io);
+ } else {
+ bdev_io->internal.io_submit_ch = ch;
+ bdev_io->internal.ch = bdev->internal.qos->ch;
+ spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io);
+ }
+ } else {
+ _bdev_io_submit(bdev_io);
+ }
+}
+
+static void
+bdev_io_submit_reset(struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
+ struct spdk_io_channel *ch = bdev_ch->channel;
+
+ assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
+
+ bdev_io->internal.in_submit_request = true;
+ bdev->fn_table->submit_request(ch, bdev_io);
+ bdev_io->internal.in_submit_request = false;
+}
+
+void
+bdev_io_init(struct spdk_bdev_io *bdev_io,
+ struct spdk_bdev *bdev, void *cb_arg,
+ spdk_bdev_io_completion_cb cb)
+{
+ bdev_io->bdev = bdev;
+ bdev_io->internal.caller_ctx = cb_arg;
+ bdev_io->internal.cb = cb;
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
+ bdev_io->internal.in_submit_request = false;
+ bdev_io->internal.buf = NULL;
+ bdev_io->internal.io_submit_ch = NULL;
+ bdev_io->internal.orig_iovs = NULL;
+ bdev_io->internal.orig_iovcnt = 0;
+ bdev_io->internal.orig_md_buf = NULL;
+ bdev_io->internal.error.nvme.cdw0 = 0;
+ bdev_io->num_retries = 0;
+ bdev_io->internal.get_buf_cb = NULL;
+ bdev_io->internal.get_aux_buf_cb = NULL;
+}
+
+static bool
+bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
+{
+ return bdev->fn_table->io_type_supported(bdev->ctxt, io_type);
+}
+
+bool
+spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type)
+{
+ bool supported;
+
+ supported = bdev_io_type_supported(bdev, io_type);
+
+ if (!supported) {
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ /* The bdev layer will emulate write zeroes as long as write is supported. */
+ supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
+ break;
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ /* Zero copy can be emulated with regular read and write */
+ supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ) &&
+ bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ return supported;
+}
+
+int
+spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
+{
+ if (bdev->fn_table->dump_info_json) {
+ return bdev->fn_table->dump_info_json(bdev->ctxt, w);
+ }
+
+ return 0;
+}
+
+static void
+bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
+{
+ uint32_t max_per_timeslice = 0;
+ int i;
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
+ qos->rate_limits[i].max_per_timeslice = 0;
+ continue;
+ }
+
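+		/* Scale the per-second limit down to a single timeslice, but never below
+		 * the per-timeslice minimum so that small limits still make progress
+		 * every timeslice.
+		 */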
+ max_per_timeslice = qos->rate_limits[i].limit *
+ SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
+
+ qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
+ qos->rate_limits[i].min_per_timeslice);
+
+ qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice;
+ }
+
+ bdev_qos_set_ops(qos);
+}
+
+static int
+bdev_channel_poll_qos(void *arg)
+{
+ struct spdk_bdev_qos *qos = arg;
+ uint64_t now = spdk_get_ticks();
+ int i;
+
+ if (now < (qos->last_timeslice + qos->timeslice_size)) {
+ /* We received our callback earlier than expected - return
+ * immediately and wait to do accounting until at least one
+ * timeslice has actually expired. This should never happen
+ * with a well-behaved timer implementation.
+ */
+ return SPDK_POLLER_IDLE;
+ }
+
+ /* Reset for next round of rate limiting */
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ /* We may have allowed the IOs or bytes to slightly overrun in the last
+ * timeslice. remaining_this_timeslice is signed, so if it's negative
+ * here, we'll account for the overrun so that the next timeslice will
+ * be appropriately reduced.
+ */
+ if (qos->rate_limits[i].remaining_this_timeslice > 0) {
+ qos->rate_limits[i].remaining_this_timeslice = 0;
+ }
+ }
+
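+	/* More than one timeslice may have elapsed since the poller last ran; credit one
+	 * quota per elapsed timeslice so the configured rate holds on average.
+	 */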
+ while (now >= (qos->last_timeslice + qos->timeslice_size)) {
+ qos->last_timeslice += qos->timeslice_size;
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ qos->rate_limits[i].remaining_this_timeslice +=
+ qos->rate_limits[i].max_per_timeslice;
+ }
+ }
+
+ return bdev_qos_io_submit(qos->ch, qos);
+}
+
+static void
+bdev_channel_destroy_resource(struct spdk_bdev_channel *ch)
+{
+ struct spdk_bdev_shared_resource *shared_resource;
+ struct lba_range *range;
+
+ while (!TAILQ_EMPTY(&ch->locked_ranges)) {
+ range = TAILQ_FIRST(&ch->locked_ranges);
+ TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
+ free(range);
+ }
+
+ spdk_put_io_channel(ch->channel);
+
+ shared_resource = ch->shared_resource;
+
+ assert(TAILQ_EMPTY(&ch->io_locked));
+ assert(TAILQ_EMPTY(&ch->io_submitted));
+ assert(ch->io_outstanding == 0);
+ assert(shared_resource->ref > 0);
+ shared_resource->ref--;
+ if (shared_resource->ref == 0) {
+ assert(shared_resource->io_outstanding == 0);
+ TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link);
+ spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch));
+ free(shared_resource);
+ }
+}
+
+/* Caller must hold bdev->internal.mutex. */
+static void
+bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch)
+{
+ struct spdk_bdev_qos *qos = bdev->internal.qos;
+ int i;
+
+	/* Rate limiting is enabled on this bdev */
+ if (qos) {
+ if (qos->ch == NULL) {
+ struct spdk_io_channel *io_ch;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch,
+ bdev->name, spdk_get_thread());
+
+ /* No qos channel has been selected, so set one up */
+
+ /* Take another reference to ch */
+ io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev));
+ assert(io_ch != NULL);
+ qos->ch = ch;
+
+ qos->thread = spdk_io_channel_get_thread(io_ch);
+
+ TAILQ_INIT(&qos->queued);
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (bdev_qos_is_iops_rate_limit(i) == true) {
+ qos->rate_limits[i].min_per_timeslice =
+ SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE;
+ } else {
+ qos->rate_limits[i].min_per_timeslice =
+ SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE;
+ }
+
+ if (qos->rate_limits[i].limit == 0) {
+ qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
+ }
+ }
+ bdev_qos_update_max_quota_per_timeslice(qos);
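+			/* Convert the timeslice length from microseconds to ticks. */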
+ qos->timeslice_size =
+ SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
+ qos->last_timeslice = spdk_get_ticks();
+ qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos,
+ qos,
+ SPDK_BDEV_QOS_TIMESLICE_IN_USEC);
+ }
+
+ ch->flags |= BDEV_CH_QOS_ENABLED;
+ }
+}
+
+struct poll_timeout_ctx {
+ struct spdk_bdev_desc *desc;
+ uint64_t timeout_in_sec;
+ spdk_bdev_io_timeout_cb cb_fn;
+ void *cb_arg;
+};
+
+static void
+bdev_desc_free(struct spdk_bdev_desc *desc)
+{
+ pthread_mutex_destroy(&desc->mutex);
+ free(desc->media_events_buffer);
+ free(desc);
+}
+
+static void
+bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_bdev_desc *desc = ctx->desc;
+
+ free(ctx);
+
+ pthread_mutex_lock(&desc->mutex);
+ desc->refs--;
+ if (desc->closed == true && desc->refs == 0) {
+ pthread_mutex_unlock(&desc->mutex);
+ bdev_desc_free(desc);
+ return;
+ }
+ pthread_mutex_unlock(&desc->mutex);
+}
+
+static void
+bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i)
+{
+ struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);
+ struct spdk_bdev_desc *desc = ctx->desc;
+ struct spdk_bdev_io *bdev_io;
+ uint64_t now;
+
+ pthread_mutex_lock(&desc->mutex);
+ if (desc->closed == true) {
+ pthread_mutex_unlock(&desc->mutex);
+ spdk_for_each_channel_continue(i, -1);
+ return;
+ }
+ pthread_mutex_unlock(&desc->mutex);
+
+ now = spdk_get_ticks();
+ TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
+		/* Exclude any I/O generated via splitting. */
+ if (bdev_io->internal.cb == bdev_io_split_done) {
+ continue;
+ }
+
+ /* Once we find an I/O that has not timed out, we can immediately
+ * exit the loop.
+ */
+ if (now < (bdev_io->internal.submit_tsc +
+ ctx->timeout_in_sec * spdk_get_ticks_hz())) {
+ goto end;
+ }
+
+ if (bdev_io->internal.desc == desc) {
+ ctx->cb_fn(ctx->cb_arg, bdev_io);
+ }
+ }
+
+end:
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static int
+bdev_poll_timeout_io(void *arg)
+{
+ struct spdk_bdev_desc *desc = arg;
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct poll_timeout_ctx *ctx;
+
+ ctx = calloc(1, sizeof(struct poll_timeout_ctx));
+ if (!ctx) {
+ SPDK_ERRLOG("failed to allocate memory\n");
+ return SPDK_POLLER_BUSY;
+ }
+ ctx->desc = desc;
+ ctx->cb_arg = desc->cb_arg;
+ ctx->cb_fn = desc->cb_fn;
+ ctx->timeout_in_sec = desc->timeout_in_sec;
+
+ /* Take a ref on the descriptor in case it gets closed while we are checking
+ * all of the channels.
+ */
+ pthread_mutex_lock(&desc->mutex);
+ desc->refs++;
+ pthread_mutex_unlock(&desc->mutex);
+
+ spdk_for_each_channel(__bdev_to_io_dev(bdev),
+ bdev_channel_poll_timeout_io,
+ ctx,
+ bdev_channel_poll_timeout_io_done);
+
+ return SPDK_POLLER_BUSY;
+}
+
+int
+spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec,
+ spdk_bdev_io_timeout_cb cb_fn, void *cb_arg)
+{
+ assert(desc->thread == spdk_get_thread());
+
+ spdk_poller_unregister(&desc->io_timeout_poller);
+
+ if (timeout_in_sec) {
+ assert(cb_fn != NULL);
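+		/* The poller period is specified in microseconds, so convert the
+		 * millisecond poll interval accordingly.
+		 */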
+ desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io,
+ desc,
+ SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC /
+ 1000);
+ if (desc->io_timeout_poller == NULL) {
+			SPDK_ERRLOG("cannot register the bdev I/O timeout poller\n");
+ return -1;
+ }
+ }
+
+ desc->cb_fn = cb_fn;
+ desc->cb_arg = cb_arg;
+ desc->timeout_in_sec = timeout_in_sec;
+
+ return 0;
+}
+
+static int
+bdev_channel_create(void *io_device, void *ctx_buf)
+{
+ struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
+ struct spdk_bdev_channel *ch = ctx_buf;
+ struct spdk_io_channel *mgmt_io_ch;
+ struct spdk_bdev_mgmt_channel *mgmt_ch;
+ struct spdk_bdev_shared_resource *shared_resource;
+ struct lba_range *range;
+
+ ch->bdev = bdev;
+ ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt);
+ if (!ch->channel) {
+ return -1;
+ }
+
+ assert(ch->histogram == NULL);
+ if (bdev->internal.histogram_enabled) {
+ ch->histogram = spdk_histogram_data_alloc();
+ if (ch->histogram == NULL) {
+ SPDK_ERRLOG("Could not allocate histogram\n");
+ }
+ }
+
+ mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr);
+ if (!mgmt_io_ch) {
+ spdk_put_io_channel(ch->channel);
+ return -1;
+ }
+
+ mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch);
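+	/* Bdev channels that sit on top of the same underlying module channel share one
+	 * resource object (nomem queue and outstanding I/O accounting). Reuse an existing
+	 * shared resource if this module channel is already tracked.
+	 */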
+ TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) {
+ if (shared_resource->shared_ch == ch->channel) {
+ spdk_put_io_channel(mgmt_io_ch);
+ shared_resource->ref++;
+ break;
+ }
+ }
+
+ if (shared_resource == NULL) {
+ shared_resource = calloc(1, sizeof(*shared_resource));
+ if (shared_resource == NULL) {
+ spdk_put_io_channel(ch->channel);
+ spdk_put_io_channel(mgmt_io_ch);
+ return -1;
+ }
+
+ shared_resource->mgmt_ch = mgmt_ch;
+ shared_resource->io_outstanding = 0;
+ TAILQ_INIT(&shared_resource->nomem_io);
+ shared_resource->nomem_threshold = 0;
+ shared_resource->shared_ch = ch->channel;
+ shared_resource->ref = 1;
+ TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link);
+ }
+
+ memset(&ch->stat, 0, sizeof(ch->stat));
+ ch->stat.ticks_rate = spdk_get_ticks_hz();
+ ch->io_outstanding = 0;
+ TAILQ_INIT(&ch->queued_resets);
+ TAILQ_INIT(&ch->locked_ranges);
+ ch->flags = 0;
+ ch->shared_resource = shared_resource;
+
+ TAILQ_INIT(&ch->io_submitted);
+ TAILQ_INIT(&ch->io_locked);
+
+#ifdef SPDK_CONFIG_VTUNE
+ {
+ char *name;
+ __itt_init_ittlib(NULL, 0);
+ name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch);
+ if (!name) {
+ bdev_channel_destroy_resource(ch);
+ return -1;
+ }
+ ch->handle = __itt_string_handle_create(name);
+ free(name);
+ ch->start_tsc = spdk_get_ticks();
+ ch->interval_tsc = spdk_get_ticks_hz() / 100;
+ memset(&ch->prev_stat, 0, sizeof(ch->prev_stat));
+ }
+#endif
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ bdev_enable_qos(bdev, ch);
+
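+	/* Copy any ranges that are already locked on the bdev into this new channel so
+	 * that I/O submitted on it is also held until the lock is released.
+	 */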
+ TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
+ struct lba_range *new_range;
+
+ new_range = calloc(1, sizeof(*new_range));
+ if (new_range == NULL) {
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ bdev_channel_destroy_resource(ch);
+ return -1;
+ }
+ new_range->length = range->length;
+ new_range->offset = range->offset;
+ new_range->locked_ctx = range->locked_ctx;
+ TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq);
+ }
+
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ return 0;
+}
+
+/*
+ * Abort I/O that are waiting on a data buffer. These types of I/O are
+ * linked using the spdk_bdev_io internal.buf_link STAILQ_ENTRY.
+ */
+static void
+bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch)
+{
+ bdev_io_stailq_t tmp;
+ struct spdk_bdev_io *bdev_io;
+
+ STAILQ_INIT(&tmp);
+
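+	/* Drain the queue, completing I/O that belongs to this channel as ABORTED and
+	 * collecting everything else on a temporary list that is swapped back at the end.
+	 */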
+ while (!STAILQ_EMPTY(queue)) {
+ bdev_io = STAILQ_FIRST(queue);
+ STAILQ_REMOVE_HEAD(queue, internal.buf_link);
+ if (bdev_io->internal.ch == ch) {
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
+ } else {
+ STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link);
+ }
+ }
+
+ STAILQ_SWAP(&tmp, queue, spdk_bdev_io);
+}
+
+/*
+ * Abort I/O that are queued waiting for submission. These types of I/O are
+ * linked using the spdk_bdev_io internal.link TAILQ_ENTRY.
+ */
+static void
+bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch)
+{
+ struct spdk_bdev_io *bdev_io, *tmp;
+
+ TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) {
+ if (bdev_io->internal.ch == ch) {
+ TAILQ_REMOVE(queue, bdev_io, internal.link);
+ /*
+ * spdk_bdev_io_complete() assumes that the completed I/O had
+ * been submitted to the bdev module. Since in this case it
+ * hadn't, bump io_outstanding to account for the decrement
+ * that spdk_bdev_io_complete() will do.
+ */
+ if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) {
+ ch->io_outstanding++;
+ ch->shared_resource->io_outstanding++;
+ }
+ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED);
+ }
+ }
+}
+
+static bool
+bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort)
+{
+ struct spdk_bdev_io *bdev_io;
+
+ TAILQ_FOREACH(bdev_io, queue, internal.link) {
+ if (bdev_io == bio_to_abort) {
+ TAILQ_REMOVE(queue, bio_to_abort, internal.link);
+ spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool
+bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort)
+{
+ struct spdk_bdev_io *bdev_io;
+
+ STAILQ_FOREACH(bdev_io, queue, internal.buf_link) {
+ if (bdev_io == bio_to_abort) {
+ STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link);
+ spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void
+bdev_qos_channel_destroy(void *cb_arg)
+{
+ struct spdk_bdev_qos *qos = cb_arg;
+
+ spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
+ spdk_poller_unregister(&qos->poller);
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos);
+
+ free(qos);
+}
+
+static int
+bdev_qos_destroy(struct spdk_bdev *bdev)
+{
+ int i;
+
+ /*
+ * Cleanly shutting down the QoS poller is tricky, because
+ * during the asynchronous operation the user could open
+ * a new descriptor and create a new channel, spawning
+ * a new QoS poller.
+ *
+ * The strategy is to create a new QoS structure here and swap it
+ * in. The shutdown path then continues to refer to the old one
+ * until it completes and then releases it.
+ */
+ struct spdk_bdev_qos *new_qos, *old_qos;
+
+ old_qos = bdev->internal.qos;
+
+ new_qos = calloc(1, sizeof(*new_qos));
+ if (!new_qos) {
+ SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n");
+ return -ENOMEM;
+ }
+
+ /* Copy the old QoS data into the newly allocated structure */
+ memcpy(new_qos, old_qos, sizeof(*new_qos));
+
+ /* Zero out the key parts of the QoS structure */
+ new_qos->ch = NULL;
+ new_qos->thread = NULL;
+ new_qos->poller = NULL;
+ TAILQ_INIT(&new_qos->queued);
+ /*
+ * The limit member of spdk_bdev_qos_limit structure is not zeroed.
+ * It will be used later for the new QoS structure.
+ */
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ new_qos->rate_limits[i].remaining_this_timeslice = 0;
+ new_qos->rate_limits[i].min_per_timeslice = 0;
+ new_qos->rate_limits[i].max_per_timeslice = 0;
+ }
+
+ bdev->internal.qos = new_qos;
+
+ if (old_qos->thread == NULL) {
+ free(old_qos);
+ } else {
+ spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos);
+ }
+
+ /* It is safe to continue with destroying the bdev even though the QoS channel hasn't
+ * been destroyed yet. The destruction path will end up waiting for the final
+ * channel to be put before it releases resources. */
+
+ return 0;
+}
+
+static void
+bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add)
+{
+ total->bytes_read += add->bytes_read;
+ total->num_read_ops += add->num_read_ops;
+ total->bytes_written += add->bytes_written;
+ total->num_write_ops += add->num_write_ops;
+ total->bytes_unmapped += add->bytes_unmapped;
+ total->num_unmap_ops += add->num_unmap_ops;
+ total->read_latency_ticks += add->read_latency_ticks;
+ total->write_latency_ticks += add->write_latency_ticks;
+ total->unmap_latency_ticks += add->unmap_latency_ticks;
+}
+
+static void
+bdev_channel_destroy(void *io_device, void *ctx_buf)
+{
+ struct spdk_bdev_channel *ch = ctx_buf;
+ struct spdk_bdev_mgmt_channel *mgmt_ch;
+ struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
+ spdk_get_thread());
+
+ /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
+ pthread_mutex_lock(&ch->bdev->internal.mutex);
+ bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat);
+ pthread_mutex_unlock(&ch->bdev->internal.mutex);
+
+ mgmt_ch = shared_resource->mgmt_ch;
+
+ bdev_abort_all_queued_io(&ch->queued_resets, ch);
+ bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
+ bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch);
+ bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch);
+
+ if (ch->histogram) {
+ spdk_histogram_data_free(ch->histogram);
+ }
+
+ bdev_channel_destroy_resource(ch);
+}
+
+int
+spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias)
+{
+ struct spdk_bdev_alias *tmp;
+
+ if (alias == NULL) {
+		SPDK_ERRLOG("NULL alias was passed\n");
+ return -EINVAL;
+ }
+
+ if (spdk_bdev_get_by_name(alias)) {
+ SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias);
+ return -EEXIST;
+ }
+
+ tmp = calloc(1, sizeof(*tmp));
+ if (tmp == NULL) {
+ SPDK_ERRLOG("Unable to allocate alias\n");
+ return -ENOMEM;
+ }
+
+ tmp->alias = strdup(alias);
+ if (tmp->alias == NULL) {
+ free(tmp);
+ SPDK_ERRLOG("Unable to allocate alias\n");
+ return -ENOMEM;
+ }
+
+ TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq);
+
+ return 0;
+}
+
+int
+spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias)
+{
+ struct spdk_bdev_alias *tmp;
+
+ TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
+ if (strcmp(alias, tmp->alias) == 0) {
+ TAILQ_REMOVE(&bdev->aliases, tmp, tailq);
+ free(tmp->alias);
+ free(tmp);
+ return 0;
+ }
+ }
+
+	SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exist\n", alias);
+
+ return -ENOENT;
+}
+
+void
+spdk_bdev_alias_del_all(struct spdk_bdev *bdev)
+{
+ struct spdk_bdev_alias *p, *tmp;
+
+ TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) {
+ TAILQ_REMOVE(&bdev->aliases, p, tailq);
+ free(p->alias);
+ free(p);
+ }
+}
+
+struct spdk_io_channel *
+spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc)
+{
+ return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc)));
+}
+
+const char *
+spdk_bdev_get_name(const struct spdk_bdev *bdev)
+{
+ return bdev->name;
+}
+
+const char *
+spdk_bdev_get_product_name(const struct spdk_bdev *bdev)
+{
+ return bdev->product_name;
+}
+
+const struct spdk_bdev_aliases_list *
+spdk_bdev_get_aliases(const struct spdk_bdev *bdev)
+{
+ return &bdev->aliases;
+}
+
+uint32_t
+spdk_bdev_get_block_size(const struct spdk_bdev *bdev)
+{
+ return bdev->blocklen;
+}
+
+uint32_t
+spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev)
+{
+ return bdev->write_unit_size;
+}
+
+uint64_t
+spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev)
+{
+ return bdev->blockcnt;
+}
+
+const char *
+spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type)
+{
+ return qos_rpc_type[type];
+}
+
+void
+spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
+{
+ int i;
+
+ memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ if (bdev->internal.qos) {
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (bdev->internal.qos->rate_limits[i].limit !=
+ SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
+ limits[i] = bdev->internal.qos->rate_limits[i].limit;
+ if (bdev_qos_is_iops_rate_limit(i) == false) {
+				/* Convert from bytes to megabytes, which is what the user sees. */
+ limits[i] = limits[i] / 1024 / 1024;
+ }
+ }
+ }
+ }
+ pthread_mutex_unlock(&bdev->internal.mutex);
+}
+
+size_t
+spdk_bdev_get_buf_align(const struct spdk_bdev *bdev)
+{
+ return 1 << bdev->required_alignment;
+}
+
+uint32_t
+spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev)
+{
+ return bdev->optimal_io_boundary;
+}
+
+bool
+spdk_bdev_has_write_cache(const struct spdk_bdev *bdev)
+{
+ return bdev->write_cache;
+}
+
+const struct spdk_uuid *
+spdk_bdev_get_uuid(const struct spdk_bdev *bdev)
+{
+ return &bdev->uuid;
+}
+
+uint16_t
+spdk_bdev_get_acwu(const struct spdk_bdev *bdev)
+{
+ return bdev->acwu;
+}
+
+uint32_t
+spdk_bdev_get_md_size(const struct spdk_bdev *bdev)
+{
+ return bdev->md_len;
+}
+
+bool
+spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev)
+{
+ return (bdev->md_len != 0) && bdev->md_interleave;
+}
+
+bool
+spdk_bdev_is_md_separate(const struct spdk_bdev *bdev)
+{
+ return (bdev->md_len != 0) && !bdev->md_interleave;
+}
+
+bool
+spdk_bdev_is_zoned(const struct spdk_bdev *bdev)
+{
+ return bdev->zoned;
+}
+
+uint32_t
+spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev)
+{
+ if (spdk_bdev_is_md_interleaved(bdev)) {
+ return bdev->blocklen - bdev->md_len;
+ } else {
+ return bdev->blocklen;
+ }
+}
+
+static uint32_t
+_bdev_get_block_size_with_md(const struct spdk_bdev *bdev)
+{
+ if (!spdk_bdev_is_md_interleaved(bdev)) {
+ return bdev->blocklen + bdev->md_len;
+ } else {
+ return bdev->blocklen;
+ }
+}
+
+enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev)
+{
+ if (bdev->md_len != 0) {
+ return bdev->dif_type;
+ } else {
+ return SPDK_DIF_DISABLE;
+ }
+}
+
+bool
+spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev)
+{
+ if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
+ return bdev->dif_is_head_of_md;
+ } else {
+ return false;
+ }
+}
+
+bool
+spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev,
+ enum spdk_dif_check_type check_type)
+{
+ if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) {
+ return false;
+ }
+
+ switch (check_type) {
+ case SPDK_DIF_CHECK_TYPE_REFTAG:
+ return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0;
+ case SPDK_DIF_CHECK_TYPE_APPTAG:
+ return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0;
+ case SPDK_DIF_CHECK_TYPE_GUARD:
+ return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0;
+ default:
+ return false;
+ }
+}
+
+uint64_t
+spdk_bdev_get_qd(const struct spdk_bdev *bdev)
+{
+ return bdev->internal.measured_queue_depth;
+}
+
+uint64_t
+spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev)
+{
+ return bdev->internal.period;
+}
+
+uint64_t
+spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev)
+{
+ return bdev->internal.weighted_io_time;
+}
+
+uint64_t
+spdk_bdev_get_io_time(const struct spdk_bdev *bdev)
+{
+ return bdev->internal.io_time;
+}
+
+static void
+_calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
+
+ bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth;
+
+ if (bdev->internal.measured_queue_depth) {
+ bdev->internal.io_time += bdev->internal.period;
+ bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth;
+ }
+}
+
+static void
+_calculate_measured_qd(struct spdk_io_channel_iter *i)
+{
+ struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch);
+
+ bdev->internal.temporary_queue_depth += ch->io_outstanding;
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static int
+bdev_calculate_measured_queue_depth(void *ctx)
+{
+ struct spdk_bdev *bdev = ctx;
+ bdev->internal.temporary_queue_depth = 0;
+ spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev,
+ _calculate_measured_qd_cpl);
+ return SPDK_POLLER_BUSY;
+}
+
+void
+spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period)
+{
+ bdev->internal.period = period;
+
+ if (bdev->internal.qd_poller != NULL) {
+ spdk_poller_unregister(&bdev->internal.qd_poller);
+ bdev->internal.measured_queue_depth = UINT64_MAX;
+ }
+
+ if (period != 0) {
+ bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev,
+ period);
+ }
+}
+
+static void
+_resize_notify(void *arg)
+{
+ struct spdk_bdev_desc *desc = arg;
+
+ pthread_mutex_lock(&desc->mutex);
+ desc->refs--;
+ if (!desc->closed) {
+ pthread_mutex_unlock(&desc->mutex);
+ desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE,
+ desc->bdev,
+ desc->callback.ctx);
+ return;
+ } else if (0 == desc->refs) {
+ /* This descriptor was closed after this resize_notify message was sent.
+ * spdk_bdev_close() could not free the descriptor since this message was
+ * in flight, so we free it now using bdev_desc_free().
+ */
+ pthread_mutex_unlock(&desc->mutex);
+ bdev_desc_free(desc);
+ return;
+ }
+ pthread_mutex_unlock(&desc->mutex);
+}
+
+int
+spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size)
+{
+ struct spdk_bdev_desc *desc;
+ int ret;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+
+	/* A bdev cannot be shrunk while it has open descriptors; growing is always allowed. */
+ if (!TAILQ_EMPTY(&bdev->internal.open_descs) &&
+ bdev->blockcnt > size) {
+ ret = -EBUSY;
+ } else {
+ bdev->blockcnt = size;
+ TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
+ pthread_mutex_lock(&desc->mutex);
+ if (desc->callback.open_with_ext && !desc->closed) {
+ desc->refs++;
+ spdk_thread_send_msg(desc->thread, _resize_notify, desc);
+ }
+ pthread_mutex_unlock(&desc->mutex);
+ }
+ ret = 0;
+ }
+
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ return ret;
+}
+
+/*
+ * Convert I/O offset and length from bytes to blocks.
+ *
+ * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size.
+ */
+static uint64_t
+bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks,
+ uint64_t num_bytes, uint64_t *num_blocks)
+{
+ uint32_t block_size = bdev->blocklen;
+ uint8_t shift_cnt;
+
+ /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
+ if (spdk_likely(spdk_u32_is_pow2(block_size))) {
+ shift_cnt = spdk_u32log2(block_size);
+ *offset_blocks = offset_bytes >> shift_cnt;
+ *num_blocks = num_bytes >> shift_cnt;
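+		/* OR the two remainders together: the result is non-zero exactly when the
+		 * offset or the length is not a multiple of the block size.
+		 */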
+ return (offset_bytes - (*offset_blocks << shift_cnt)) |
+ (num_bytes - (*num_blocks << shift_cnt));
+ } else {
+ *offset_blocks = offset_bytes / block_size;
+ *num_blocks = num_bytes / block_size;
+ return (offset_bytes % block_size) | (num_bytes % block_size);
+ }
+}
+
+static bool
+bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks)
+{
+ /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there
+ * has been an overflow and hence the offset has been wrapped around */
+ if (offset_blocks + num_blocks < offset_blocks) {
+ return false;
+ }
+
+ /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */
+ if (offset_blocks + num_blocks > bdev->blockcnt) {
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+_bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf)
+{
+ return _is_buf_allocated(iovs) == (md_buf != NULL);
+}
+
+static int
+bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf,
+ void *md_buf, int64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
+ bdev_io->u.bdev.iovs = &bdev_io->iov;
+ bdev_io->u.bdev.iovs[0].iov_base = buf;
+ bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
+ bdev_io->u.bdev.iovcnt = 1;
+ bdev_io->u.bdev.md_buf = md_buf;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, uint64_t offset, uint64_t nbytes,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ uint64_t offset_blocks, num_blocks;
+
+ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
+ nbytes, &num_blocks) != 0) {
+ return -EINVAL;
+ }
+
+ return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, void *md_buf, int64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct iovec iov = {
+ .iov_base = buf,
+ };
+
+ if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
+ return -EINVAL;
+ }
+
+ if (!_bdev_io_check_md_buf(&iov, md_buf)) {
+ return -EINVAL;
+ }
+
+ return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
+ cb, cb_arg);
+}
+
+int
+spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt,
+ uint64_t offset, uint64_t nbytes,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ uint64_t offset_blocks, num_blocks;
+
+ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
+ nbytes, &num_blocks) != 0) {
+ return -EINVAL;
+ }
+
+ return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
+}
+
+static int
+bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
+ uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
+ bdev_io->u.bdev.iovs = iov;
+ bdev_io->u.bdev.iovcnt = iovcnt;
+ bdev_io->u.bdev.md_buf = md_buf;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
+ num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
+ return -EINVAL;
+ }
+
+ if (!_bdev_io_check_md_buf(iov, md_buf)) {
+ return -EINVAL;
+ }
+
+ return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
+ num_blocks, cb, cb_arg);
+}
+
+static int
+bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ return -EBADF;
+ }
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
+ bdev_io->u.bdev.iovs = &bdev_io->iov;
+ bdev_io->u.bdev.iovs[0].iov_base = buf;
+ bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
+ bdev_io->u.bdev.iovcnt = 1;
+ bdev_io->u.bdev.md_buf = md_buf;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, uint64_t offset, uint64_t nbytes,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ uint64_t offset_blocks, num_blocks;
+
+ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
+ nbytes, &num_blocks) != 0) {
+ return -EINVAL;
+ }
+
+ return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
+ cb, cb_arg);
+}
+
+int
+spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct iovec iov = {
+ .iov_base = buf,
+ };
+
+ if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
+ return -EINVAL;
+ }
+
+ if (!_bdev_io_check_md_buf(&iov, md_buf)) {
+ return -EINVAL;
+ }
+
+ return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
+ cb, cb_arg);
+}
+
+static int
+bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ return -EBADF;
+ }
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
+ bdev_io->u.bdev.iovs = iov;
+ bdev_io->u.bdev.iovcnt = iovcnt;
+ bdev_io->u.bdev.md_buf = md_buf;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt,
+ uint64_t offset, uint64_t len,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ uint64_t offset_blocks, num_blocks;
+
+ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
+ len, &num_blocks) != 0) {
+ return -EINVAL;
+ }
+
+ return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
+ num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
+ return -EINVAL;
+ }
+
+ if (!_bdev_io_check_md_buf(iov, md_buf)) {
+ return -EINVAL;
+ }
+
+ return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
+ num_blocks, cb, cb_arg);
+}
+
+static void
+bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *parent_io = cb_arg;
+ uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base;
+ int i, rc = 0;
+
+ if (!success) {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
+ spdk_bdev_free_io(bdev_io);
+ return;
+ }
+
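+	/* Compare the data that was just read against the caller's buffers, walking the
+	 * caller's iovec one element at a time.
+	 */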
+ for (i = 0; i < parent_io->u.bdev.iovcnt; i++) {
+ rc = memcmp(read_buf,
+ parent_io->u.bdev.iovs[i].iov_base,
+ parent_io->u.bdev.iovs[i].iov_len);
+ if (rc) {
+ break;
+ }
+ read_buf += parent_io->u.bdev.iovs[i].iov_len;
+ }
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (rc == 0) {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
+ } else {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE;
+ parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
+ }
+}
+
+static void
+bdev_compare_do_read(void *_bdev_io)
+{
+ struct spdk_bdev_io *bdev_io = _bdev_io;
+ int rc;
+
+ rc = spdk_bdev_read_blocks(bdev_io->internal.desc,
+ spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL,
+ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
+ bdev_compare_do_read_done, bdev_io);
+
+ if (rc == -ENOMEM) {
+ bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read);
+ } else if (rc != 0) {
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
+ }
+}
+
+static int
+bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
+ bdev_io->u.bdev.iovs = iov;
+ bdev_io->u.bdev.iovcnt = iovcnt;
+ bdev_io->u.bdev.md_buf = md_buf;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
+ bdev_io_submit(bdev_io);
+ return 0;
+ }
+
+ bdev_compare_do_read(bdev_io);
+
+ return 0;
+}
+
+int
+spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
+ num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
+ return -EINVAL;
+ }
+
+ if (!_bdev_io_check_md_buf(iov, md_buf)) {
+ return -EINVAL;
+ }
+
+ return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
+ num_blocks, cb, cb_arg);
+}
+
+static int
+bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE;
+ bdev_io->u.bdev.iovs = &bdev_io->iov;
+ bdev_io->u.bdev.iovs[0].iov_base = buf;
+ bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
+ bdev_io->u.bdev.iovcnt = 1;
+ bdev_io->u.bdev.md_buf = md_buf;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) {
+ bdev_io_submit(bdev_io);
+ return 0;
+ }
+
+ bdev_compare_do_read(bdev_io);
+
+ return 0;
+}
+
+int
+spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks,
+ cb, cb_arg);
+}
+
+int
+spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct iovec iov = {
+ .iov_base = buf,
+ };
+
+ if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
+ return -EINVAL;
+ }
+
+ if (!_bdev_io_check_md_buf(&iov, md_buf)) {
+ return -EINVAL;
+ }
+
+ return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks,
+ cb, cb_arg);
+}
+
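+/*
+ * Emulated COMPARE AND WRITE: when the bdev module does not support the fused
+ * command natively, the LBA range is locked first, the compare is issued, the
+ * write is issued only if the compare succeeded, and the range is unlocked again
+ * before the parent I/O is completed.
+ */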
+static void
+bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+
+ if (unlock_status) {
+ SPDK_ERRLOG("LBA range unlock failed\n");
+ }
+
+	bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
+			     bdev_io->internal.caller_ctx);
+}
+
+static void
+bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status)
+{
+ bdev_io->internal.status = status;
+
+ bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch),
+ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
+ bdev_comparev_and_writev_blocks_unlocked, bdev_io);
+}
+
+static void
+bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *parent_io = cb_arg;
+
+ if (!success) {
+ SPDK_ERRLOG("Compare and write operation failed\n");
+ }
+
+ spdk_bdev_free_io(bdev_io);
+
+ bdev_comparev_and_writev_blocks_unlock(parent_io,
+ success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED);
+}
+
+static void
+bdev_compare_and_write_do_write(void *_bdev_io)
+{
+ struct spdk_bdev_io *bdev_io = _bdev_io;
+ int rc;
+
+ rc = spdk_bdev_writev_blocks(bdev_io->internal.desc,
+ spdk_io_channel_from_ctx(bdev_io->internal.ch),
+ bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt,
+ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
+ bdev_compare_and_write_do_write_done, bdev_io);
+
+ if (rc == -ENOMEM) {
+ bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write);
+ } else if (rc != 0) {
+ bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
+ }
+}
+
+static void
+bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *parent_io = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (!success) {
+ bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE);
+ return;
+ }
+
+ bdev_compare_and_write_do_write(parent_io);
+}
+
+static void
+bdev_compare_and_write_do_compare(void *_bdev_io)
+{
+ struct spdk_bdev_io *bdev_io = _bdev_io;
+ int rc;
+
+ rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc,
+ spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks,
+ bdev_compare_and_write_do_compare_done, bdev_io);
+
+ if (rc == -ENOMEM) {
+ bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare);
+ } else if (rc != 0) {
+ bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED);
+ }
+}
+
+static void
+bdev_comparev_and_writev_blocks_locked(void *ctx, int status)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+
+	if (status) {
+		/* The LBA range could not be locked, so fail the fused command without
+		 * issuing the compare; otherwise the completion callback would run a
+		 * second time once the compare/write sequence finishes.
+		 */
+		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED;
+		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
+		return;
+	}
+
+ bdev_compare_and_write_do_compare(bdev_io);
+}
+
+int
+spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *compare_iov, int compare_iovcnt,
+ struct iovec *write_iov, int write_iovcnt,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ return -EBADF;
+ }
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ if (num_blocks > bdev->acwu) {
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE;
+ bdev_io->u.bdev.iovs = compare_iov;
+ bdev_io->u.bdev.iovcnt = compare_iovcnt;
+ bdev_io->u.bdev.fused_iovs = write_iov;
+ bdev_io->u.bdev.fused_iovcnt = write_iovcnt;
+ bdev_io->u.bdev.md_buf = NULL;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) {
+ bdev_io_submit(bdev_io);
+ return 0;
+ }
+
+ return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks,
+ bdev_comparev_and_writev_blocks_locked, bdev_io);
+}
+
+static void
+bdev_zcopy_get_buf(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
+{
+ if (!success) {
+ /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
+ bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx);
+ return;
+ }
+
+ if (bdev_io->u.bdev.zcopy.populate) {
+ /* Read the real data into the buffer */
+ bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
+ bdev_io_submit(bdev_io);
+ return;
+ }
+
+ /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ bdev_io->internal.cb(bdev_io, success, bdev_io->internal.caller_ctx);
+}
+
+int
+spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ bool populate,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ return -EBADF;
+ }
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
+ return -ENOTSUP;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io->u.bdev.iovs = NULL;
+ bdev_io->u.bdev.iovcnt = 0;
+ bdev_io->u.bdev.md_buf = NULL;
+ bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0;
+ bdev_io->u.bdev.zcopy.commit = 0;
+ bdev_io->u.bdev.zcopy.start = 1;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
+ bdev_io_submit(bdev_io);
+ } else {
+ /* Emulate zcopy by allocating a buffer */
+ spdk_bdev_io_get_buf(bdev_io, bdev_zcopy_get_buf,
+ bdev_io->u.bdev.num_blocks * bdev->blocklen);
+ }
+
+ return 0;
+}
+
+int
+spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
+ /* This can happen if the zcopy was emulated in start */
+ if (bdev_io->u.bdev.zcopy.start != 1) {
+ return -EINVAL;
+ }
+ bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY;
+ }
+
+ if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) {
+ return -EINVAL;
+ }
+
+ bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0;
+ bdev_io->u.bdev.zcopy.start = 0;
+ bdev_io->internal.caller_ctx = cb_arg;
+ bdev_io->internal.cb = cb;
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
+
+ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) {
+ bdev_io_submit(bdev_io);
+ return 0;
+ }
+
+ if (!bdev_io->u.bdev.zcopy.commit) {
+ /* Don't use spdk_bdev_io_complete here - this bdev_io was never actually submitted. */
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
+ return 0;
+ }
+
+ bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
+ bdev_io_submit(bdev_io);
+
+ return 0;
+}
+
+int
+spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t offset, uint64_t len,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ uint64_t offset_blocks, num_blocks;
+
+ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
+ len, &num_blocks) != 0) {
+ return -EINVAL;
+ }
+
+ return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ return -EBADF;
+ }
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
+ !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
+ return -ENOTSUP;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+ bdev_io_submit(bdev_io);
+ return 0;
+ }
+
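+	/* The module does not support WRITE ZEROES, so emulate it by issuing writes of a
+	 * zero buffer, at most ZERO_BUFFER_SIZE bytes at a time, until the whole range
+	 * has been covered.
+	 */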
+ assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE));
+ assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE);
+ bdev_io->u.bdev.split_remaining_num_blocks = num_blocks;
+ bdev_io->u.bdev.split_current_offset_blocks = offset_blocks;
+ bdev_write_zero_buffer_next(bdev_io);
+
+ return 0;
+}
+
+int
+spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t offset, uint64_t nbytes,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ uint64_t offset_blocks, num_blocks;
+
+ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
+ nbytes, &num_blocks) != 0) {
+ return -EINVAL;
+ }
+
+ return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ return -EBADF;
+ }
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ if (num_blocks == 0) {
+		SPDK_ERRLOG("Can't unmap 0 blocks\n");
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP;
+
+ bdev_io->u.bdev.iovs = &bdev_io->iov;
+ bdev_io->u.bdev.iovs[0].iov_base = NULL;
+ bdev_io->u.bdev.iovs[0].iov_len = 0;
+ bdev_io->u.bdev.iovcnt = 1;
+
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t offset, uint64_t length,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ uint64_t offset_blocks, num_blocks;
+
+ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
+ length, &num_blocks) != 0) {
+ return -EINVAL;
+ }
+
+ return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
+}
+
+int
+spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t offset_blocks, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ return -EBADF;
+ }
+
+ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
+ return -EINVAL;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
+ bdev_io->u.bdev.iovs = NULL;
+ bdev_io->u.bdev.iovcnt = 0;
+ bdev_io->u.bdev.offset_blocks = offset_blocks;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+static void
+bdev_reset_dev(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_bdev_io *bdev_io;
+
+ bdev_io = TAILQ_FIRST(&ch->queued_resets);
+ TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
+ bdev_io_submit_reset(bdev_io);
+}
+
+static void
+bdev_reset_freeze_channel(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *ch;
+ struct spdk_bdev_channel *channel;
+ struct spdk_bdev_mgmt_channel *mgmt_channel;
+ struct spdk_bdev_shared_resource *shared_resource;
+ bdev_io_tailq_t tmp_queued;
+
+ TAILQ_INIT(&tmp_queued);
+
+ ch = spdk_io_channel_iter_get_channel(i);
+ channel = spdk_io_channel_get_ctx(ch);
+ shared_resource = channel->shared_resource;
+ mgmt_channel = shared_resource->mgmt_ch;
+
+ channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
+
+ if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
+ /* The QoS object is always valid and readable while
+ * the channel flag is set, so the lock here should not
+ * be necessary. We're not in the fast path though, so
+ * just take it anyway. */
+ pthread_mutex_lock(&channel->bdev->internal.mutex);
+ if (channel->bdev->internal.qos->ch == channel) {
+ TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
+ }
+ pthread_mutex_unlock(&channel->bdev->internal.mutex);
+ }
+
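+	/* Abort all I/O that has not yet reached the bdev module: retries queued due to
+	 * NOMEM, I/O waiting for a data buffer, and anything still sitting on the QoS
+	 * queue.
+	 */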
+ bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
+ bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel);
+ bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel);
+ bdev_abort_all_queued_io(&tmp_queued, channel);
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+bdev_start_reset(void *ctx)
+{
+ struct spdk_bdev_channel *ch = ctx;
+
+ spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel,
+ ch, bdev_reset_dev);
+}
+
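+ /* Start the reset at the head of this channel's queued_resets list, but
+ * only if no other reset is already in progress for the bdev. A channel
+ * reference is taken and held for the lifetime of the reset.
+ */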
+static void
+bdev_channel_start_reset(struct spdk_bdev_channel *ch)
+{
+ struct spdk_bdev *bdev = ch->bdev;
+
+ assert(!TAILQ_EMPTY(&ch->queued_resets));
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ if (bdev->internal.reset_in_progress == NULL) {
+ bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
+ /*
+ * Take a channel reference for the target bdev for the life of this
+ * reset. This guards against the channel getting destroyed while
+ * spdk_for_each_channel() calls related to this reset IO are in
+ * progress. We will release the reference when this reset is
+ * completed.
+ */
+ bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
+ bdev_start_reset(ch);
+ }
+ pthread_mutex_unlock(&bdev->internal.mutex);
+}
+
+int
+spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->internal.submit_tsc = spdk_get_ticks();
+ bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
+ bdev_io->u.reset.ch_ref = NULL;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io,
+ internal.ch_link);
+
+ bdev_channel_start_reset(channel);
+
+ return 0;
+}
+
+void
+spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
+ struct spdk_bdev_io_stat *stat)
+{
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ *stat = channel->stat;
+}
+
+static void
+bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status)
+{
+ void *io_device = spdk_io_channel_iter_get_io_device(i);
+ struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
+
+ bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat,
+ bdev_iostat_ctx->cb_arg, 0);
+ free(bdev_iostat_ctx);
+}
+
+static void
+bdev_get_each_channel_stat(struct spdk_io_channel_iter *i)
+{
+ struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat);
+ spdk_for_each_channel_continue(i, 0);
+}
+
+void
+spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
+ spdk_bdev_get_device_stat_cb cb, void *cb_arg)
+{
+ struct spdk_bdev_iostat_ctx *bdev_iostat_ctx;
+
+ assert(bdev != NULL);
+ assert(stat != NULL);
+ assert(cb != NULL);
+
+ bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx));
+ if (bdev_iostat_ctx == NULL) {
+ SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n");
+ cb(bdev, stat, cb_arg, -ENOMEM);
+ return;
+ }
+
+ bdev_iostat_ctx->stat = stat;
+ bdev_iostat_ctx->cb = cb;
+ bdev_iostat_ctx->cb_arg = cb_arg;
+
+ /* Start with the statistics from previously deleted channels. */
+ pthread_mutex_lock(&bdev->internal.mutex);
+ bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ /* Then iterate and add the statistics from each existing channel. */
+ spdk_for_each_channel(__bdev_to_io_dev(bdev),
+ bdev_get_each_channel_stat,
+ bdev_iostat_ctx,
+ bdev_get_device_stat_done);
+}
+
+int
+spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ return -EBADF;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN;
+ bdev_io->u.nvme_passthru.cmd = *cmd;
+ bdev_io->u.nvme_passthru.buf = buf;
+ bdev_io->u.nvme_passthru.nbytes = nbytes;
+ bdev_io->u.nvme_passthru.md_buf = NULL;
+ bdev_io->u.nvme_passthru.md_len = 0;
+
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ /*
+ * Do not try to parse the NVMe command - we could use bits in the opcode
+ * to determine whether the command is a read or a write, but for now just
+ * do not allow io_passthru with a read-only descriptor.
+ */
+ return -EBADF;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO;
+ bdev_io->u.nvme_passthru.cmd = *cmd;
+ bdev_io->u.nvme_passthru.buf = buf;
+ bdev_io->u.nvme_passthru.nbytes = nbytes;
+ bdev_io->u.nvme_passthru.md_buf = NULL;
+ bdev_io->u.nvme_passthru.md_len = 0;
+
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ if (!desc->write) {
+ /*
+ * Do not try to parse the NVMe command - we could use bits in the opcode
+ * to determine whether the command is a read or a write, but for now just
+ * do not allow io_passthru with a read-only descriptor.
+ */
+ return -EBADF;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD;
+ bdev_io->u.nvme_passthru.cmd = *cmd;
+ bdev_io->u.nvme_passthru.buf = buf;
+ bdev_io->u.nvme_passthru.nbytes = nbytes;
+ bdev_io->u.nvme_passthru.md_buf = md_buf;
+ bdev_io->u.nvme_passthru.md_len = md_len;
+
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+static void bdev_abort_retry(void *ctx);
+static void bdev_abort(struct spdk_bdev_io *parent_io);
+
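+ /* Completion callback for one abort submitted by _bdev_abort(). A failed
+ * abort only fails the parent if the target I/O is still on the submitted
+ * list; otherwise the target completed on its own and the failure is
+ * ignored. The parent completes (or retries on NOMEM) once all of its
+ * outstanding aborts have finished.
+ */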
+static void
+bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_channel *channel = bdev_io->internal.ch;
+ struct spdk_bdev_io *parent_io = cb_arg;
+ struct spdk_bdev_io *bio_to_abort, *tmp_io;
+
+ bio_to_abort = bdev_io->u.abort.bio_to_abort;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (!success) {
+ /* Check if the target I/O completed in the meantime. */
+ TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) {
+ if (tmp_io == bio_to_abort) {
+ break;
+ }
+ }
+
+ /* If the target I/O still exists, set the parent to failed. */
+ if (tmp_io != NULL) {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+ }
+
+ parent_io->u.bdev.split_outstanding--;
+ if (parent_io->u.bdev.split_outstanding == 0) {
+ if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
+ bdev_abort_retry(parent_io);
+ } else {
+ bdev_io_complete(parent_io);
+ }
+ }
+}
+
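+ /* Submit an abort for a single outstanding I/O. Aborting resets or other
+ * abort requests is not supported. If the target I/O was split, it cannot
+ * be aborted with one request, so a nested abort keyed on the original I/O
+ * is issued via bdev_abort() to abort its split children instead.
+ */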
+static int
+bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
+ struct spdk_bdev_io *bio_to_abort,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+
+ if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
+ bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
+ /* TODO: Abort reset or abort request. */
+ return -ENOTSUP;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (bdev_io == NULL) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) {
+ bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;
+
+ /* The parent abort request is not submitted directly. Add it to the
+ * submitted list here so that its execution can be tracked.
+ */
+ bdev_io->internal.submit_tsc = spdk_get_ticks();
+ TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
+
+ bdev_abort(bdev_io);
+
+ return 0;
+ }
+
+ bdev_io->u.abort.bio_to_abort = bio_to_abort;
+
+ /* Submit the abort request to the underlying bdev module. */
+ bdev_io_submit(bdev_io);
+
+ return 0;
+}
+
+static uint32_t
+_bdev_abort(struct spdk_bdev_io *parent_io)
+{
+ struct spdk_bdev_desc *desc = parent_io->internal.desc;
+ struct spdk_bdev_channel *channel = parent_io->internal.ch;
+ void *bio_cb_arg;
+ struct spdk_bdev_io *bio_to_abort;
+ uint32_t matched_ios;
+ int rc;
+
+ bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
+
+ /* matched_ios is returned and will be kept by the caller.
+ *
+ * This function is used for two cases: 1) the same cb_arg is used for
+ * multiple I/Os, and 2) a single large I/O is split into smaller ones.
+ * Incrementing split_outstanding directly here could confuse readers,
+ * especially in the first case.
+ *
+ * Completion of the I/O aborts is processed only after the stack unwinds,
+ * so deferring the update of split_outstanding until all aborts have been
+ * submitted works as expected.
+ */
+ matched_ios = 0;
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+
+ TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
+ if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
+ continue;
+ }
+
+ if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
+ /* Any I/O which was submitted after this abort command should be excluded. */
+ continue;
+ }
+
+ rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
+ if (rc != 0) {
+ if (rc == -ENOMEM) {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
+ } else {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ }
+ break;
+ }
+ matched_ios++;
+ }
+
+ return matched_ios;
+}
+
+static void
+bdev_abort_retry(void *ctx)
+{
+ struct spdk_bdev_io *parent_io = ctx;
+ uint32_t matched_ios;
+
+ matched_ios = _bdev_abort(parent_io);
+
+ if (matched_ios == 0) {
+ if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
+ bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
+ } else {
+ /* For a retry, the case where no target I/O was found is a success
+ * because it means the target I/Os completed in the meantime.
+ */
+ bdev_io_complete(parent_io);
+ }
+ return;
+ }
+
+ /* Use split_outstanding to manage the progress of aborting I/Os. */
+ parent_io->u.bdev.split_outstanding = matched_ios;
+}
+
+static void
+bdev_abort(struct spdk_bdev_io *parent_io)
+{
+ uint32_t matched_ios;
+
+ matched_ios = _bdev_abort(parent_io);
+
+ if (matched_ios == 0) {
+ if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
+ bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
+ } else {
+ /* For the initial submission, the case where no target I/O was found is a failure. */
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ bdev_io_complete(parent_io);
+ }
+ return;
+ }
+
+ /* Use split_outstanding to manage the progress of aborting I/Os. */
+ parent_io->u.bdev.split_outstanding = matched_ios;
+}
+
+int
+spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *bio_cb_arg,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+ struct spdk_bdev_io *bdev_io;
+
+ if (bio_cb_arg == NULL) {
+ return -EINVAL;
+ }
+
+ if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
+ return -ENOTSUP;
+ }
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (bdev_io == NULL) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->internal.submit_tsc = spdk_get_ticks();
+ bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;
+
+ /* The parent abort request is not submitted directly. Add it to the
+ * submitted list here so that its execution can be tracked.
+ */
+ TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);
+
+ bdev_abort(bdev_io);
+
+ return 0;
+}
+
+int
+spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
+ struct spdk_bdev_io_wait_entry *entry)
+{
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+ struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch;
+
+ if (bdev != entry->bdev) {
+ SPDK_ERRLOG("bdevs do not match\n");
+ return -EINVAL;
+ }
+
+ if (mgmt_ch->per_thread_cache_count > 0) {
+ SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n");
+ return -EINVAL;
+ }
+
+ TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link);
+ return 0;
+}
+
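+ /* Resubmit I/O that previously failed with NOMEM, once enough outstanding
+ * I/O has completed for io_outstanding to drop to the nomem threshold.
+ * Stops early if a resubmission hits NOMEM again.
+ */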
+static void
+bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
+{
+ struct spdk_bdev *bdev = bdev_ch->bdev;
+ struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
+ struct spdk_bdev_io *bdev_io;
+
+ if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
+ /*
+ * Allow some more I/O to complete before retrying the nomem_io queue.
+ * Some drivers (such as nvme) cannot immediately take a new I/O in
+ * the context of a completion, because the resources for the I/O are
+ * not released until control returns to the bdev poller. Also, we
+ * may require several small I/O to complete before a larger I/O
+ * (that requires splitting) can be submitted.
+ */
+ return;
+ }
+
+ while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
+ bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
+ TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
+ bdev_io->internal.ch->io_outstanding++;
+ shared_resource->io_outstanding++;
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
+ bdev_io->internal.error.nvme.cdw0 = 0;
+ bdev_io->num_retries++;
+ bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
+ if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
+ break;
+ }
+ }
+}
+
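+ /* Final step of I/O completion. Defers completion when it would recurse
+ * into the submit path or needs to run on the original submitting thread
+ * (e.g. QoS), then records trace data and per-channel statistics and
+ * invokes the user's completion callback.
+ */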
+static inline void
+bdev_io_complete(void *ctx)
+{
+ struct spdk_bdev_io *bdev_io = ctx;
+ struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
+ uint64_t tsc, tsc_diff;
+
+ if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) {
+ /*
+ * Send the completion to the thread that originally submitted the I/O,
+ * which may not be the current thread in the case of QoS.
+ */
+ if (bdev_io->internal.io_submit_ch) {
+ bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
+ bdev_io->internal.io_submit_ch = NULL;
+ }
+
+ /*
+ * Defer completion to avoid potential infinite recursion if the
+ * user's completion callback issues a new I/O.
+ */
+ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
+ bdev_io_complete, bdev_io);
+ return;
+ }
+
+ tsc = spdk_get_ticks();
+ tsc_diff = tsc - bdev_io->internal.submit_tsc;
+ spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0);
+
+ TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link);
+
+ if (bdev_io->internal.ch->histogram) {
+ spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff);
+ }
+
+ if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+ bdev_io->internal.ch->stat.num_read_ops++;
+ bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff;
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+ bdev_io->internal.ch->stat.num_write_ops++;
+ bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff;
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+ bdev_io->internal.ch->stat.num_unmap_ops++;
+ bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff;
+ break;
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ /* Track the data in the start phase only */
+ if (bdev_io->u.bdev.zcopy.start) {
+ if (bdev_io->u.bdev.zcopy.populate) {
+ bdev_io->internal.ch->stat.bytes_read +=
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+ bdev_io->internal.ch->stat.num_read_ops++;
+ bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff;
+ } else {
+ bdev_io->internal.ch->stat.bytes_written +=
+ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;
+ bdev_io->internal.ch->stat.num_write_ops++;
+ bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+#ifdef SPDK_CONFIG_VTUNE
+ uint64_t now_tsc = spdk_get_ticks();
+ if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) {
+ uint64_t data[5];
+
+ data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops;
+ data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read;
+ data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops;
+ data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written;
+ data[4] = bdev_io->bdev->fn_table->get_spin_time ?
+ bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0;
+
+ __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle,
+ __itt_metadata_u64, 5, data);
+
+ bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat;
+ bdev_io->internal.ch->start_tsc = now_tsc;
+ }
+#endif
+
+ assert(bdev_io->internal.cb != NULL);
+ assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io));
+
+ bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
+ bdev_io->internal.caller_ctx);
+}
+
+static void
+bdev_reset_complete(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
+
+ if (bdev_io->u.reset.ch_ref != NULL) {
+ spdk_put_io_channel(bdev_io->u.reset.ch_ref);
+ bdev_io->u.reset.ch_ref = NULL;
+ }
+
+ bdev_io_complete(bdev_io);
+}
+
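+ /* Called on each channel once a reset completes. Clears the
+ * reset-in-progress flag and completes any resets still queued on the
+ * channel with the same status as the finished reset.
+ */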
+static void
+bdev_unfreeze_channel(struct spdk_io_channel_iter *i)
+{
+ struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct spdk_bdev_io *queued_reset;
+
+ ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS;
+ while (!TAILQ_EMPTY(&ch->queued_resets)) {
+ queued_reset = TAILQ_FIRST(&ch->queued_resets);
+ TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link);
+ spdk_bdev_io_complete(queued_reset, bdev_io->internal.status);
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+void
+spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
+ struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
+
+ bdev_io->internal.status = status;
+
+ if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) {
+ bool unlock_channels = false;
+
+ if (status == SPDK_BDEV_IO_STATUS_NOMEM) {
+ SPDK_ERRLOG("NOMEM returned for reset\n");
+ }
+ pthread_mutex_lock(&bdev->internal.mutex);
+ if (bdev_io == bdev->internal.reset_in_progress) {
+ bdev->internal.reset_in_progress = NULL;
+ unlock_channels = true;
+ }
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ if (unlock_channels) {
+ spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel,
+ bdev_io, bdev_reset_complete);
+ return;
+ }
+ } else {
+ _bdev_io_unset_bounce_buf(bdev_io);
+
+ assert(bdev_ch->io_outstanding > 0);
+ assert(shared_resource->io_outstanding > 0);
+ bdev_ch->io_outstanding--;
+ shared_resource->io_outstanding--;
+
+ if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) {
+ TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
+ /*
+ * Wait for some of the outstanding I/O to complete before we
+ * retry any of the nomem_io. Normally we will wait for
+ * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
+ * depth channels we will instead wait for half to complete.
+ */
+ shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
+ (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
+ return;
+ }
+
+ if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
+ bdev_ch_retry_io(bdev_ch);
+ }
+ }
+
+ bdev_io_complete(bdev_io);
+}
+
+void
+spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc,
+ enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq)
+{
+ if (sc == SPDK_SCSI_STATUS_GOOD) {
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ } else {
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR;
+ bdev_io->internal.error.scsi.sc = sc;
+ bdev_io->internal.error.scsi.sk = sk;
+ bdev_io->internal.error.scsi.asc = asc;
+ bdev_io->internal.error.scsi.ascq = ascq;
+ }
+
+ spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
+}
+
+void
+spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io,
+ int *sc, int *sk, int *asc, int *ascq)
+{
+ assert(sc != NULL);
+ assert(sk != NULL);
+ assert(asc != NULL);
+ assert(ascq != NULL);
+
+ switch (bdev_io->internal.status) {
+ case SPDK_BDEV_IO_STATUS_SUCCESS:
+ *sc = SPDK_SCSI_STATUS_GOOD;
+ *sk = SPDK_SCSI_SENSE_NO_SENSE;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_BDEV_IO_STATUS_NVME_ERROR:
+ spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq);
+ break;
+ case SPDK_BDEV_IO_STATUS_SCSI_ERROR:
+ *sc = bdev_io->internal.error.scsi.sc;
+ *sk = bdev_io->internal.error.scsi.sk;
+ *asc = bdev_io->internal.error.scsi.asc;
+ *ascq = bdev_io->internal.error.scsi.ascq;
+ break;
+ default:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ }
+}
+
+void
+spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc)
+{
+ if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) {
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ } else {
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR;
+ }
+
+ bdev_io->internal.error.nvme.cdw0 = cdw0;
+ bdev_io->internal.error.nvme.sct = sct;
+ bdev_io->internal.error.nvme.sc = sc;
+
+ spdk_bdev_io_complete(bdev_io, bdev_io->internal.status);
+}
+
+void
+spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc)
+{
+ assert(sct != NULL);
+ assert(sc != NULL);
+ assert(cdw0 != NULL);
+
+ if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
+ *sct = bdev_io->internal.error.nvme.sct;
+ *sc = bdev_io->internal.error.nvme.sc;
+ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
+ *sct = SPDK_NVME_SCT_GENERIC;
+ *sc = SPDK_NVME_SC_SUCCESS;
+ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) {
+ *sct = SPDK_NVME_SCT_GENERIC;
+ *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
+ } else {
+ *sct = SPDK_NVME_SCT_GENERIC;
+ *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ }
+
+ *cdw0 = bdev_io->internal.error.nvme.cdw0;
+}
+
+void
+spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0,
+ int *first_sct, int *first_sc, int *second_sct, int *second_sc)
+{
+ assert(first_sct != NULL);
+ assert(first_sc != NULL);
+ assert(second_sct != NULL);
+ assert(second_sc != NULL);
+ assert(cdw0 != NULL);
+
+ if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) {
+ if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR &&
+ bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) {
+ *first_sct = bdev_io->internal.error.nvme.sct;
+ *first_sc = bdev_io->internal.error.nvme.sc;
+ *second_sct = SPDK_NVME_SCT_GENERIC;
+ *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
+ } else {
+ *first_sct = SPDK_NVME_SCT_GENERIC;
+ *first_sc = SPDK_NVME_SC_SUCCESS;
+ *second_sct = bdev_io->internal.error.nvme.sct;
+ *second_sc = bdev_io->internal.error.nvme.sc;
+ }
+ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
+ *first_sct = SPDK_NVME_SCT_GENERIC;
+ *first_sc = SPDK_NVME_SC_SUCCESS;
+ *second_sct = SPDK_NVME_SCT_GENERIC;
+ *second_sc = SPDK_NVME_SC_SUCCESS;
+ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) {
+ *first_sct = SPDK_NVME_SCT_GENERIC;
+ *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ *second_sct = SPDK_NVME_SCT_GENERIC;
+ *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
+ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) {
+ *first_sct = SPDK_NVME_SCT_MEDIA_ERROR;
+ *first_sc = SPDK_NVME_SC_COMPARE_FAILURE;
+ *second_sct = SPDK_NVME_SCT_GENERIC;
+ *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
+ } else {
+ *first_sct = SPDK_NVME_SCT_GENERIC;
+ *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ *second_sct = SPDK_NVME_SCT_GENERIC;
+ *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ }
+
+ *cdw0 = bdev_io->internal.error.nvme.cdw0;
+}
+
+struct spdk_thread *
+spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io)
+{
+ return spdk_io_channel_get_thread(bdev_io->internal.ch->channel);
+}
+
+struct spdk_io_channel *
+spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io)
+{
+ return bdev_io->internal.ch->channel;
+}
+
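+ /* Validate the configured rate limits (at least one must be defined and
+ * each defined limit must be a non-zero multiple of the per-type minimum)
+ * and store them in bdev->internal.qos, allocating the QoS object if it
+ * does not exist yet.
+ */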
+static void
+bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits)
+{
+ uint64_t min_qos_set;
+ int i;
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
+ break;
+ }
+ }
+
+ if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) {
+ SPDK_ERRLOG("Invalid rate limits set.\n");
+ return;
+ }
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
+ continue;
+ }
+
+ if (bdev_qos_is_iops_rate_limit(i) == true) {
+ min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
+ } else {
+ min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
+ }
+
+ if (limits[i] == 0 || limits[i] % min_qos_set) {
+ SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n",
+ limits[i], bdev->name, min_qos_set);
+ SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name);
+ return;
+ }
+ }
+
+ if (!bdev->internal.qos) {
+ bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
+ if (!bdev->internal.qos) {
+ SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
+ return;
+ }
+ }
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ bdev->internal.qos->rate_limits[i].limit = limits[i];
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n",
+ bdev->name, i, limits[i]);
+ }
+}
+
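+ /* Parse the legacy [QoS] section of the config file and, if any limit is
+ * specified for this bdev, apply it via bdev_qos_config_limit(). Byte rate
+ * limits are given in MiB/s and converted to bytes/s here.
+ */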
+static void
+bdev_qos_config(struct spdk_bdev *bdev)
+{
+ struct spdk_conf_section *sp = NULL;
+ const char *val = NULL;
+ int i = 0, j = 0;
+ uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {};
+ bool config_qos = false;
+
+ sp = spdk_conf_find_section(NULL, "QoS");
+ if (!sp) {
+ return;
+ }
+
+ while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) {
+ limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
+
+ i = 0;
+ while (true) {
+ val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0);
+ if (!val) {
+ break;
+ }
+
+ if (strcmp(bdev->name, val) != 0) {
+ i++;
+ continue;
+ }
+
+ val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1);
+ if (val) {
+ if (bdev_qos_is_iops_rate_limit(j) == true) {
+ limits[j] = strtoull(val, NULL, 10);
+ } else {
+ limits[j] = strtoull(val, NULL, 10) * 1024 * 1024;
+ }
+ config_qos = true;
+ }
+
+ break;
+ }
+
+ j++;
+ }
+
+ if (config_qos == true) {
+ bdev_qos_config_limit(bdev, limits);
+ }
+}
+
+static int
+bdev_init(struct spdk_bdev *bdev)
+{
+ char *bdev_name;
+
+ assert(bdev->module != NULL);
+
+ if (!bdev->name) {
+ SPDK_ERRLOG("Bdev name is NULL\n");
+ return -EINVAL;
+ }
+
+ if (!strlen(bdev->name)) {
+ SPDK_ERRLOG("Bdev name must not be an empty string\n");
+ return -EINVAL;
+ }
+
+ if (spdk_bdev_get_by_name(bdev->name)) {
+ SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name);
+ return -EEXIST;
+ }
+
+ /* Users often register their own I/O devices using the bdev name. In
+ * order to avoid conflicts, prepend "bdev_". */
+ bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name);
+ if (!bdev_name) {
+ SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n");
+ return -ENOMEM;
+ }
+
+ bdev->internal.status = SPDK_BDEV_STATUS_READY;
+ bdev->internal.measured_queue_depth = UINT64_MAX;
+ bdev->internal.claim_module = NULL;
+ bdev->internal.qd_poller = NULL;
+ bdev->internal.qos = NULL;
+
+ /* If the user didn't specify a uuid, generate one. */
+ if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) {
+ spdk_uuid_generate(&bdev->uuid);
+ }
+
+ if (spdk_bdev_get_buf_align(bdev) > 1) {
+ if (bdev->split_on_optimal_io_boundary) {
+ bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary,
+ SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen);
+ } else {
+ bdev->split_on_optimal_io_boundary = true;
+ bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen;
+ }
+ }
+
+ /* If the user didn't specify a write unit size, set it to one. */
+ if (bdev->write_unit_size == 0) {
+ bdev->write_unit_size = 1;
+ }
+
+ /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */
+ if (bdev->acwu == 0) {
+ bdev->acwu = 1;
+ }
+
+ TAILQ_INIT(&bdev->internal.open_descs);
+ TAILQ_INIT(&bdev->internal.locked_ranges);
+ TAILQ_INIT(&bdev->internal.pending_locked_ranges);
+
+ TAILQ_INIT(&bdev->aliases);
+
+ bdev->internal.reset_in_progress = NULL;
+
+ bdev_qos_config(bdev);
+
+ spdk_io_device_register(__bdev_to_io_dev(bdev),
+ bdev_channel_create, bdev_channel_destroy,
+ sizeof(struct spdk_bdev_channel),
+ bdev_name);
+
+ free(bdev_name);
+
+ pthread_mutex_init(&bdev->internal.mutex, NULL);
+ return 0;
+}
+
+static void
+bdev_destroy_cb(void *io_device)
+{
+ int rc;
+ struct spdk_bdev *bdev;
+ spdk_bdev_unregister_cb cb_fn;
+ void *cb_arg;
+
+ bdev = __bdev_from_io_dev(io_device);
+ cb_fn = bdev->internal.unregister_cb;
+ cb_arg = bdev->internal.unregister_ctx;
+
+ rc = bdev->fn_table->destruct(bdev->ctxt);
+ if (rc < 0) {
+ SPDK_ERRLOG("destruct failed\n");
+ }
+ if (rc <= 0 && cb_fn != NULL) {
+ cb_fn(cb_arg, rc);
+ }
+}
+
+static void
+bdev_fini(struct spdk_bdev *bdev)
+{
+ pthread_mutex_destroy(&bdev->internal.mutex);
+
+ free(bdev->internal.qos);
+
+ spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
+}
+
+static void
+bdev_start(struct spdk_bdev *bdev)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name);
+ TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link);
+
+ /* Examine configuration before initializing I/O */
+ bdev_examine(bdev);
+}
+
+int
+spdk_bdev_register(struct spdk_bdev *bdev)
+{
+ int rc = bdev_init(bdev);
+
+ if (rc == 0) {
+ bdev_start(bdev);
+ }
+
+ spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev));
+ return rc;
+}
+
+int
+spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count)
+{
+ SPDK_ERRLOG("This function is deprecated. Use spdk_bdev_register() instead.\n");
+ return spdk_bdev_register(vbdev);
+}
+
+void
+spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno)
+{
+ if (bdev->internal.unregister_cb != NULL) {
+ bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno);
+ }
+}
+
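+ /* Runs on the descriptor's thread. Delivers the hot-remove notification to
+ * the descriptor's callback unless the descriptor was already closed, in
+ * which case the last reference is dropped and the descriptor is freed
+ * here instead.
+ */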
+static void
+_remove_notify(void *arg)
+{
+ struct spdk_bdev_desc *desc = arg;
+
+ pthread_mutex_lock(&desc->mutex);
+ desc->refs--;
+
+ if (!desc->closed) {
+ pthread_mutex_unlock(&desc->mutex);
+ if (desc->callback.open_with_ext) {
+ desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx);
+ } else {
+ desc->callback.remove_fn(desc->callback.ctx);
+ }
+ return;
+ } else if (0 == desc->refs) {
+ /* This descriptor was closed after this remove_notify message was sent.
+ * spdk_bdev_close() could not free the descriptor since this message was
+ * in flight, so we free it now using bdev_desc_free().
+ */
+ pthread_mutex_unlock(&desc->mutex);
+ bdev_desc_free(desc);
+ return;
+ }
+ pthread_mutex_unlock(&desc->mutex);
+}
+
+/* Must be called while holding bdev->internal.mutex.
+ * returns: 0 - bdev removed and ready to be destructed.
+ * -EBUSY - bdev can't be destructed yet. */
+static int
+bdev_unregister_unsafe(struct spdk_bdev *bdev)
+{
+ struct spdk_bdev_desc *desc, *tmp;
+ int rc = 0;
+
+ /* Notify each descriptor about hotremoval */
+ TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
+ rc = -EBUSY;
+ pthread_mutex_lock(&desc->mutex);
+ /*
+ * Defer invocation of the event_cb to a separate message that will
+ * run later on its thread. This ensures this context unwinds and
+ * we don't recursively unregister this bdev again if the event_cb
+ * immediately closes its descriptor.
+ */
+ desc->refs++;
+ spdk_thread_send_msg(desc->thread, _remove_notify, desc);
+ pthread_mutex_unlock(&desc->mutex);
+ }
+
+ /* If there are no descriptors, proceed with removing the bdev. */
+ if (rc == 0) {
+ TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list done\n", bdev->name);
+ spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
+ }
+
+ return rc;
+}
+
+void
+spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
+{
+ struct spdk_thread *thread;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name);
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ /* The user called this from a non-SPDK thread. */
+ if (cb_fn != NULL) {
+ cb_fn(cb_arg, -ENOTSUP);
+ }
+ return;
+ }
+
+ pthread_mutex_lock(&g_bdev_mgr.mutex);
+ pthread_mutex_lock(&bdev->internal.mutex);
+ if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ pthread_mutex_unlock(&g_bdev_mgr.mutex);
+ if (cb_fn) {
+ cb_fn(cb_arg, -EBUSY);
+ }
+ return;
+ }
+
+ bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
+ bdev->internal.unregister_cb = cb_fn;
+ bdev->internal.unregister_ctx = cb_arg;
+
+ /* Call under lock. */
+ rc = bdev_unregister_unsafe(bdev);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ pthread_mutex_unlock(&g_bdev_mgr.mutex);
+
+ if (rc == 0) {
+ bdev_fini(bdev);
+ }
+}
+
+static void
+bdev_dummy_event_cb(void *remove_ctx)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev remove event received with no remove callback specified");
+}
+
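+ /* Enable QoS on all channels if rate limits were configured for this bdev
+ * but no QoS thread has been assigned to it yet.
+ */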
+static int
+bdev_start_qos(struct spdk_bdev *bdev)
+{
+ struct set_qos_limit_ctx *ctx;
+
+ /* Enable QoS */
+ if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
+ return -ENOMEM;
+ }
+ ctx->bdev = bdev;
+ spdk_for_each_channel(__bdev_to_io_dev(bdev),
+ bdev_enable_qos_msg, ctx,
+ bdev_enable_qos_done);
+ }
+
+ return 0;
+}
+
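+ /* Common open path for spdk_bdev_open() and spdk_bdev_open_ext(). Binds
+ * the descriptor to the calling SPDK thread, rejects opens of bdevs that
+ * are being removed and write opens of claimed bdevs, and starts QoS if it
+ * is configured.
+ */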
+static int
+bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc)
+{
+ struct spdk_thread *thread;
+ int rc = 0;
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n");
+ return -ENOTSUP;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
+ spdk_get_thread());
+
+ desc->bdev = bdev;
+ desc->thread = thread;
+ desc->write = write;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ return -ENODEV;
+ }
+
+ if (write && bdev->internal.claim_module) {
+ SPDK_ERRLOG("Could not open %s - %s module already claimed it\n",
+ bdev->name, bdev->internal.claim_module->name);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ return -EPERM;
+ }
+
+ rc = bdev_start_qos(bdev);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ return rc;
+ }
+
+ TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link);
+
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ return 0;
+}
+
+int
+spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb,
+ void *remove_ctx, struct spdk_bdev_desc **_desc)
+{
+ struct spdk_bdev_desc *desc;
+ int rc;
+
+ desc = calloc(1, sizeof(*desc));
+ if (desc == NULL) {
+ SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
+ return -ENOMEM;
+ }
+
+ if (remove_cb == NULL) {
+ remove_cb = bdev_dummy_event_cb;
+ }
+
+ TAILQ_INIT(&desc->pending_media_events);
+ TAILQ_INIT(&desc->free_media_events);
+
+ desc->callback.open_with_ext = false;
+ desc->callback.remove_fn = remove_cb;
+ desc->callback.ctx = remove_ctx;
+ pthread_mutex_init(&desc->mutex, NULL);
+
+ pthread_mutex_lock(&g_bdev_mgr.mutex);
+
+ rc = bdev_open(bdev, write, desc);
+ if (rc != 0) {
+ bdev_desc_free(desc);
+ desc = NULL;
+ }
+
+ *_desc = desc;
+
+ pthread_mutex_unlock(&g_bdev_mgr.mutex);
+
+ return rc;
+}
+
+int
+spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
+ void *event_ctx, struct spdk_bdev_desc **_desc)
+{
+ struct spdk_bdev_desc *desc;
+ struct spdk_bdev *bdev;
+ unsigned int event_id;
+ int rc;
+
+ if (event_cb == NULL) {
+ SPDK_ERRLOG("Missing event callback function\n");
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&g_bdev_mgr.mutex);
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+
+ if (bdev == NULL) {
+ SPDK_ERRLOG("Failed to find bdev with name: %s\n", bdev_name);
+ pthread_mutex_unlock(&g_bdev_mgr.mutex);
+ return -EINVAL;
+ }
+
+ desc = calloc(1, sizeof(*desc));
+ if (desc == NULL) {
+ SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n");
+ pthread_mutex_unlock(&g_bdev_mgr.mutex);
+ return -ENOMEM;
+ }
+
+ TAILQ_INIT(&desc->pending_media_events);
+ TAILQ_INIT(&desc->free_media_events);
+
+ desc->callback.open_with_ext = true;
+ desc->callback.event_fn = event_cb;
+ desc->callback.ctx = event_ctx;
+ pthread_mutex_init(&desc->mutex, NULL);
+
+ if (bdev->media_events) {
+ desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE,
+ sizeof(*desc->media_events_buffer));
+ if (desc->media_events_buffer == NULL) {
+ SPDK_ERRLOG("Failed to initialize media event pool\n");
+ bdev_desc_free(desc);
+ pthread_mutex_unlock(&g_bdev_mgr.mutex);
+ return -ENOMEM;
+ }
+
+ for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) {
+ TAILQ_INSERT_TAIL(&desc->free_media_events,
+ &desc->media_events_buffer[event_id], tailq);
+ }
+ }
+
+ rc = bdev_open(bdev, write, desc);
+ if (rc != 0) {
+ bdev_desc_free(desc);
+ desc = NULL;
+ }
+
+ *_desc = desc;
+
+ pthread_mutex_unlock(&g_bdev_mgr.mutex);
+
+ return rc;
+}
+
+void
+spdk_bdev_close(struct spdk_bdev_desc *desc)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name,
+ spdk_get_thread());
+
+ assert(desc->thread == spdk_get_thread());
+
+ spdk_poller_unregister(&desc->io_timeout_poller);
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ pthread_mutex_lock(&desc->mutex);
+
+ TAILQ_REMOVE(&bdev->internal.open_descs, desc, link);
+
+ desc->closed = true;
+
+ if (0 == desc->refs) {
+ pthread_mutex_unlock(&desc->mutex);
+ bdev_desc_free(desc);
+ } else {
+ pthread_mutex_unlock(&desc->mutex);
+ }
+
+ /* If no more descriptors, kill QoS channel */
+ if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) {
+ SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n",
+ bdev->name, spdk_get_thread());
+
+ if (bdev_qos_destroy(bdev)) {
+ /* There isn't anything we can do to recover here. Just let the
+ * old QoS poller keep running. The QoS handling won't change
+ * cores when the user allocates a new channel, but it won't break. */
+ SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n");
+ }
+ }
+
+ spdk_bdev_set_qd_sampling_period(bdev, 0);
+
+ if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) {
+ rc = bdev_unregister_unsafe(bdev);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ if (rc == 0) {
+ bdev_fini(bdev);
+ }
+ } else {
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ }
+}
+
+int
+spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_bdev_module *module)
+{
+ if (bdev->internal.claim_module != NULL) {
+ SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name,
+ bdev->internal.claim_module->name);
+ return -EPERM;
+ }
+
+ if (desc && !desc->write) {
+ desc->write = true;
+ }
+
+ bdev->internal.claim_module = module;
+ return 0;
+}
+
+void
+spdk_bdev_module_release_bdev(struct spdk_bdev *bdev)
+{
+ assert(bdev->internal.claim_module != NULL);
+ bdev->internal.claim_module = NULL;
+}
+
+struct spdk_bdev *
+spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
+{
+ assert(desc != NULL);
+ return desc->bdev;
+}
+
+void
+spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
+{
+ struct iovec *iovs;
+ int iovcnt;
+
+ if (bdev_io == NULL) {
+ return;
+ }
+
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ iovs = bdev_io->u.bdev.iovs;
+ iovcnt = bdev_io->u.bdev.iovcnt;
+ break;
+ default:
+ iovs = NULL;
+ iovcnt = 0;
+ break;
+ }
+
+ if (iovp) {
+ *iovp = iovs;
+ }
+ if (iovcntp) {
+ *iovcntp = iovcnt;
+ }
+}
+
+void *
+spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io)
+{
+ if (bdev_io == NULL) {
+ return NULL;
+ }
+
+ if (!spdk_bdev_is_md_separate(bdev_io->bdev)) {
+ return NULL;
+ }
+
+ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
+ bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
+ return bdev_io->u.bdev.md_buf;
+ }
+
+ return NULL;
+}
+
+void *
+spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io)
+{
+ if (bdev_io == NULL) {
+ assert(false);
+ return NULL;
+ }
+
+ return bdev_io->internal.caller_ctx;
+}
+
+void
+spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module)
+{
+ if (spdk_bdev_module_list_find(bdev_module->name)) {
+ SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name);
+ assert(false);
+ }
+
+ /*
+ * Modules with examine callbacks must be initialized first, so they are
+ * ready to handle examine callbacks from later modules that will
+ * register physical bdevs.
+ */
+ if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) {
+ TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
+ } else {
+ TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq);
+ }
+}
+
+struct spdk_bdev_module *
+spdk_bdev_module_list_find(const char *name)
+{
+ struct spdk_bdev_module *bdev_module;
+
+ TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
+ if (strcmp(name, bdev_module->name) == 0) {
+ break;
+ }
+ }
+
+ return bdev_module;
+}
+
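+ /* Write-zeroes emulation for bdevs without native support: repeatedly
+ * writes the shared zero buffer (at most ZERO_BUFFER_SIZE bytes per
+ * request) until split_remaining_num_blocks reaches zero, requeueing via
+ * the io_wait queue on ENOMEM.
+ */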
+static void
+bdev_write_zero_buffer_next(void *_bdev_io)
+{
+ struct spdk_bdev_io *bdev_io = _bdev_io;
+ uint64_t num_bytes, num_blocks;
+ void *md_buf = NULL;
+ int rc;
+
+ num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) *
+ bdev_io->u.bdev.split_remaining_num_blocks,
+ ZERO_BUFFER_SIZE);
+ num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev);
+
+ if (spdk_bdev_is_md_separate(bdev_io->bdev)) {
+ md_buf = (char *)g_bdev_mgr.zero_buffer +
+ spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks;
+ }
+
+ rc = bdev_write_blocks_with_md(bdev_io->internal.desc,
+ spdk_io_channel_from_ctx(bdev_io->internal.ch),
+ g_bdev_mgr.zero_buffer, md_buf,
+ bdev_io->u.bdev.split_current_offset_blocks, num_blocks,
+ bdev_write_zero_buffer_done, bdev_io);
+ if (rc == 0) {
+ bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks;
+ bdev_io->u.bdev.split_current_offset_blocks += num_blocks;
+ } else if (rc == -ENOMEM) {
+ bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next);
+ } else {
+ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
+ }
+}
+
+static void
+bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *parent_io = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (!success) {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
+ parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
+ return;
+ }
+
+ if (parent_io->u.bdev.split_remaining_num_blocks == 0) {
+ parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
+ parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx);
+ return;
+ }
+
+ bdev_write_zero_buffer_next(parent_io);
+}
+
+static void
+bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
+{
+ pthread_mutex_lock(&ctx->bdev->internal.mutex);
+ ctx->bdev->internal.qos_mod_in_progress = false;
+ pthread_mutex_unlock(&ctx->bdev->internal.mutex);
+
+ if (ctx->cb_fn) {
+ ctx->cb_fn(ctx->cb_arg, status);
+ }
+ free(ctx);
+}
+
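+ /* Runs on the QoS thread after every channel has cleared its QoS flag.
+ * Sends any I/O still held in the QoS queue back to its submitting thread
+ * for resubmission, releases the QoS channel and poller, and frees the QoS
+ * object.
+ */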
+static void
+bdev_disable_qos_done(void *cb_arg)
+{
+ struct set_qos_limit_ctx *ctx = cb_arg;
+ struct spdk_bdev *bdev = ctx->bdev;
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_qos *qos;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ qos = bdev->internal.qos;
+ bdev->internal.qos = NULL;
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ while (!TAILQ_EMPTY(&qos->queued)) {
+ /* Send queued I/O back to their original thread for resubmission. */
+ bdev_io = TAILQ_FIRST(&qos->queued);
+ TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);
+
+ if (bdev_io->internal.io_submit_ch) {
+ /*
+ * Channel was changed when sending it to the QoS thread - change it back
+ * before sending it back to the original thread.
+ */
+ bdev_io->internal.ch = bdev_io->internal.io_submit_ch;
+ bdev_io->internal.io_submit_ch = NULL;
+ }
+
+ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
+ _bdev_io_submit, bdev_io);
+ }
+
+ if (qos->thread != NULL) {
+ spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch));
+ spdk_poller_unregister(&qos->poller);
+ }
+
+ free(qos);
+
+ bdev_set_qos_limit_done(ctx, 0);
+}
+
+static void
+bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status)
+{
+ void *io_device = spdk_io_channel_iter_get_io_device(i);
+ struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
+ struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_thread *thread;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ thread = bdev->internal.qos->thread;
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ if (thread != NULL) {
+ spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx);
+ } else {
+ bdev_disable_qos_done(ctx);
+ }
+}
+
+static void
+bdev_disable_qos_msg(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
+
+ bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED;
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+bdev_update_qos_rate_limit_msg(void *cb_arg)
+{
+ struct set_qos_limit_ctx *ctx = cb_arg;
+ struct spdk_bdev *bdev = ctx->bdev;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ bdev_set_qos_limit_done(ctx, 0);
+}
+
+static void
+bdev_enable_qos_msg(struct spdk_io_channel_iter *i)
+{
+ void *io_device = spdk_io_channel_iter_get_io_device(i);
+ struct spdk_bdev *bdev = __bdev_from_io_dev(io_device);
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch);
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ bdev_enable_qos(bdev, bdev_ch);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ bdev_set_qos_limit_done(ctx, status);
+}
+
+static void
+bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits)
+{
+ int i;
+
+ assert(bdev->internal.qos != NULL);
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
+ bdev->internal.qos->rate_limits[i].limit = limits[i];
+
+ if (limits[i] == 0) {
+ bdev->internal.qos->rate_limits[i].limit =
+ SPDK_BDEV_QOS_LIMIT_NOT_DEFINED;
+ }
+ }
+ }
+}
+
+void
+spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits,
+ void (*cb_fn)(void *cb_arg, int status), void *cb_arg)
+{
+ struct set_qos_limit_ctx *ctx;
+ uint32_t limit_set_complement;
+ uint64_t min_limit_per_sec;
+ int i;
+ bool disable_rate_limit = true;
+
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
+ continue;
+ }
+
+ if (limits[i] > 0) {
+ disable_rate_limit = false;
+ }
+
+ if (bdev_qos_is_iops_rate_limit(i) == true) {
+ min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC;
+ } else {
+ /* Change from megabyte to byte rate limit */
+ limits[i] = limits[i] * 1024 * 1024;
+ min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC;
+ }
+
+ limit_set_complement = limits[i] % min_limit_per_sec;
+ if (limit_set_complement) {
+ SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n",
+ limits[i], min_limit_per_sec);
+ limits[i] += min_limit_per_sec - limit_set_complement;
+ SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]);
+ }
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ ctx->bdev = bdev;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ if (bdev->internal.qos_mod_in_progress) {
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ free(ctx);
+ cb_fn(cb_arg, -EAGAIN);
+ return;
+ }
+ bdev->internal.qos_mod_in_progress = true;
+
+ if (disable_rate_limit == true && bdev->internal.qos) {
+ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
+ if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED &&
+ (bdev->internal.qos->rate_limits[i].limit > 0 &&
+ bdev->internal.qos->rate_limits[i].limit !=
+ SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) {
+ disable_rate_limit = false;
+ break;
+ }
+ }
+ }
+
+ if (disable_rate_limit == false) {
+ if (bdev->internal.qos == NULL) {
+ bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos));
+ if (!bdev->internal.qos) {
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n");
+ bdev_set_qos_limit_done(ctx, -ENOMEM);
+ return;
+ }
+ }
+
+ if (bdev->internal.qos->thread == NULL) {
+ /* Enabling */
+ bdev_set_qos_rate_limits(bdev, limits);
+
+ spdk_for_each_channel(__bdev_to_io_dev(bdev),
+ bdev_enable_qos_msg, ctx,
+ bdev_enable_qos_done);
+ } else {
+ /* Updating */
+ bdev_set_qos_rate_limits(bdev, limits);
+
+ spdk_thread_send_msg(bdev->internal.qos->thread,
+ bdev_update_qos_rate_limit_msg, ctx);
+ }
+ } else {
+ if (bdev->internal.qos != NULL) {
+ bdev_set_qos_rate_limits(bdev, limits);
+
+ /* Disabling */
+ spdk_for_each_channel(__bdev_to_io_dev(bdev),
+ bdev_disable_qos_msg, ctx,
+ bdev_disable_qos_msg_done);
+ } else {
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ bdev_set_qos_limit_done(ctx, 0);
+ return;
+ }
+ }
+
+ pthread_mutex_unlock(&bdev->internal.mutex);
+}
+
+struct spdk_bdev_histogram_ctx {
+ spdk_bdev_histogram_status_cb cb_fn;
+ void *cb_arg;
+ struct spdk_bdev *bdev;
+ int status;
+};
+
+static void
+bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ pthread_mutex_lock(&ctx->bdev->internal.mutex);
+ ctx->bdev->internal.histogram_in_progress = false;
+ pthread_mutex_unlock(&ctx->bdev->internal.mutex);
+ ctx->cb_fn(ctx->cb_arg, ctx->status);
+ free(ctx);
+}
+
+static void
+bdev_histogram_disable_channel(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+
+ if (ch->histogram != NULL) {
+ spdk_histogram_data_free(ch->histogram);
+ ch->histogram = NULL;
+ }
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ if (status != 0) {
+ ctx->status = status;
+ ctx->bdev->internal.histogram_enabled = false;
+ spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx,
+ bdev_histogram_disable_channel_cb);
+ } else {
+ pthread_mutex_lock(&ctx->bdev->internal.mutex);
+ ctx->bdev->internal.histogram_in_progress = false;
+ pthread_mutex_unlock(&ctx->bdev->internal.mutex);
+ ctx->cb_fn(ctx->cb_arg, ctx->status);
+ free(ctx);
+ }
+}
+
+static void
+bdev_histogram_enable_channel(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ int status = 0;
+
+ if (ch->histogram == NULL) {
+ ch->histogram = spdk_histogram_data_alloc();
+ if (ch->histogram == NULL) {
+ status = -ENOMEM;
+ }
+ }
+
+ spdk_for_each_channel_continue(i, status);
+}
+
+void
+spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn,
+ void *cb_arg, bool enable)
+{
+ struct spdk_bdev_histogram_ctx *ctx;
+
+ ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx));
+ if (ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->bdev = bdev;
+ ctx->status = 0;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ if (bdev->internal.histogram_in_progress) {
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ free(ctx);
+ cb_fn(cb_arg, -EAGAIN);
+ return;
+ }
+
+ bdev->internal.histogram_in_progress = true;
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ bdev->internal.histogram_enabled = enable;
+
+ if (enable) {
+ /* Allocate histogram for each channel */
+ spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx,
+ bdev_histogram_enable_channel_cb);
+ } else {
+ spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx,
+ bdev_histogram_disable_channel_cb);
+ }
+}
+
+struct spdk_bdev_histogram_data_ctx {
+ spdk_bdev_histogram_data_cb cb_fn;
+ void *cb_arg;
+ struct spdk_bdev *bdev;
+ /** merged histogram data from all channels */
+ struct spdk_histogram_data *histogram;
+};
+
+static void
+bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
+ free(ctx);
+}
+
+static void
+bdev_histogram_get_channel(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ int status = 0;
+
+ if (ch->histogram == NULL) {
+ status = -EFAULT;
+ } else {
+ spdk_histogram_data_merge(ctx->histogram, ch->histogram);
+ }
+
+ spdk_for_each_channel_continue(i, status);
+}
+
+void
+spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
+ spdk_bdev_histogram_data_cb cb_fn,
+ void *cb_arg)
+{
+ struct spdk_bdev_histogram_data_ctx *ctx;
+
+ ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
+ if (ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM, NULL);
+ return;
+ }
+
+ ctx->bdev = bdev;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ ctx->histogram = histogram;
+
+ spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx,
+ bdev_histogram_get_channel_cb);
+}
+
+size_t
+spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
+ size_t max_events)
+{
+ struct media_event_entry *entry;
+ size_t num_events = 0;
+
+ for (; num_events < max_events; ++num_events) {
+ entry = TAILQ_FIRST(&desc->pending_media_events);
+ if (entry == NULL) {
+ break;
+ }
+
+ events[num_events] = entry->event;
+ TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
+ TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
+ }
+
+ return num_events;
+}
+
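+/*
+ * Queue media events on the first open descriptor that has write access and a
+ * media event buffer. Returns the number of events queued, or -ENODEV if no
+ * suitable descriptor exists.
+ */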
+int
+spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
+ size_t num_events)
+{
+ struct spdk_bdev_desc *desc;
+ struct media_event_entry *entry;
+ size_t event_id;
+ int rc = 0;
+
+ assert(bdev->media_events);
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
+ if (desc->write) {
+ break;
+ }
+ }
+
+ if (desc == NULL || desc->media_events_buffer == NULL) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ for (event_id = 0; event_id < num_events; ++event_id) {
+ entry = TAILQ_FIRST(&desc->free_media_events);
+ if (entry == NULL) {
+ break;
+ }
+
+ TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
+ TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
+ entry->event = events[event_id];
+ }
+
+ rc = event_id;
+out:
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ return rc;
+}
+
+void
+spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
+{
+ struct spdk_bdev_desc *desc;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
+ if (!TAILQ_EMPTY(&desc->pending_media_events)) {
+ desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev,
+ desc->callback.ctx);
+ }
+ }
+ pthread_mutex_unlock(&bdev->internal.mutex);
+}
+
+struct locked_lba_range_ctx {
+ struct lba_range range;
+ struct spdk_bdev *bdev;
+ struct lba_range *current_range;
+ struct lba_range *owner_range;
+ struct spdk_poller *poller;
+ lock_range_cb cb_fn;
+ void *cb_arg;
+};
+
+static void
+bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status)
+{
+ struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ ctx->cb_fn(ctx->cb_arg, -ENOMEM);
+ free(ctx);
+}
+
+static void
+bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i);
+
+static void
+bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
+{
+ struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_bdev *bdev = ctx->bdev;
+
+ if (status == -ENOMEM) {
+ /* One of the channels could not allocate a range object.
+ * So we have to go back and clean up any ranges that were
+ * allocated successfully before we return an error status to
+ * the caller. We can reuse the unlock function to do that
+ * cleanup.
+ */
+ spdk_for_each_channel(__bdev_to_io_dev(bdev),
+ bdev_unlock_lba_range_get_channel, ctx,
+ bdev_lock_error_cleanup_cb);
+ return;
+ }
+
+ /* All channels have locked this range and no I/O overlapping the range
+ * is outstanding. Set the owner_ch on the range object for the
+ * locking channel, so that this channel knows it is allowed
+ * to write to this range.
+ */
+ ctx->owner_range->owner_ch = ctx->range.owner_ch;
+ ctx->cb_fn(ctx->cb_arg, status);
+
+ /* Don't free the ctx here. Its range is in the bdev's global list of
+ * locked ranges still, and will be removed and freed when this range
+ * is later unlocked.
+ */
+}
+
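+/*
+ * Poller that waits for outstanding I/O overlapping the range being locked to
+ * complete on this channel. It re-registers itself every 100 microseconds
+ * until no conflicting I/O remains, then continues the for_each_channel walk.
+ */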
+static int
+bdev_lock_lba_range_check_io(void *_i)
+{
+ struct spdk_io_channel_iter *i = _i;
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct lba_range *range = ctx->current_range;
+ struct spdk_bdev_io *bdev_io;
+
+ spdk_poller_unregister(&ctx->poller);
+
+ /* The range is now in locked_ranges, so no new I/O can be submitted to this
+ * range. But we need to wait until any outstanding I/O overlapping with this
+ * range has completed.
+ */
+ TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
+ if (bdev_io_range_is_locked(bdev_io, range)) {
+ ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
+ return SPDK_POLLER_BUSY;
+ }
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct lba_range *range;
+
+ TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
+ if (range->length == ctx->range.length &&
+ range->offset == ctx->range.offset &&
+ range->locked_ctx == ctx->range.locked_ctx) {
+ /* This range already exists on this channel, so don't add
+ * it again. This can happen when a new channel is created
+ * while the for_each_channel operation is in progress.
+ * Do not check for outstanding I/O in that case, since the
+ * range was locked before any I/O could be submitted to the
+ * new channel.
+ */
+ spdk_for_each_channel_continue(i, 0);
+ return;
+ }
+ }
+
+ range = calloc(1, sizeof(*range));
+ if (range == NULL) {
+ spdk_for_each_channel_continue(i, -ENOMEM);
+ return;
+ }
+
+ range->length = ctx->range.length;
+ range->offset = ctx->range.offset;
+ range->locked_ctx = ctx->range.locked_ctx;
+ ctx->current_range = range;
+ if (ctx->range.owner_ch == ch) {
+ /* This is the range object for the channel that will hold
+ * the lock. Store it in the ctx object so that we can easily
+ * set its owner_ch after the lock is finally acquired.
+ */
+ ctx->owner_range = range;
+ }
+ TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
+ bdev_lock_lba_range_check_io(i);
+}
+
+static void
+bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
+{
+ assert(spdk_get_thread() == ctx->range.owner_ch->channel->thread);
+
+ /* We will add a copy of this range to each channel now. */
+ spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx,
+ bdev_lock_lba_range_cb);
+}
+
+static bool
+bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
+{
+ struct lba_range *r;
+
+ TAILQ_FOREACH(r, tailq, tailq) {
+ if (bdev_lba_range_overlapped(range, r)) {
+ return true;
+ }
+ }
+ return false;
+}
+
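+/*
+ * Lock an LBA range on behalf of the channel that will own it. If the range
+ * overlaps an already locked range, the request is placed on the bdev's
+ * pending list and started once the conflicting lock is released.
+ */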
+static int
+bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
+ uint64_t offset, uint64_t length,
+ lock_range_cb cb_fn, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct locked_lba_range_ctx *ctx;
+
+ if (cb_arg == NULL) {
+ SPDK_ERRLOG("cb_arg must not be NULL\n");
+ return -EINVAL;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ return -ENOMEM;
+ }
+
+ ctx->range.offset = offset;
+ ctx->range.length = length;
+ ctx->range.owner_ch = ch;
+ ctx->range.locked_ctx = cb_arg;
+ ctx->bdev = bdev;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
+ /* There is an active lock overlapping with this range.
+ * Put it on the pending list until this range no
+ * longer overlaps with another.
+ */
+ TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
+ } else {
+ TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
+ bdev_lock_lba_range_ctx(bdev, ctx);
+ }
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ return 0;
+}
+
+static void
+bdev_lock_lba_range_ctx_msg(void *_ctx)
+{
+ struct locked_lba_range_ctx *ctx = _ctx;
+
+ bdev_lock_lba_range_ctx(ctx->bdev, ctx);
+}
+
+static void
+bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
+{
+ struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct locked_lba_range_ctx *pending_ctx;
+ struct spdk_bdev_channel *ch = ctx->range.owner_ch;
+ struct spdk_bdev *bdev = ch->bdev;
+ struct lba_range *range, *tmp;
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ /* Check if there are any pending locked ranges that overlap with the range
+ * that was just unlocked. If there are, check that each one does not overlap
+ * with any other locked range before calling bdev_lock_lba_range_ctx, which
+ * will start its lock process.
+ */
+ TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
+ if (bdev_lba_range_overlapped(range, &ctx->range) &&
+ !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
+ TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
+ pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
+ TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
+ spdk_thread_send_msg(pending_ctx->range.owner_ch->channel->thread,
+ bdev_lock_lba_range_ctx_msg, pending_ctx);
+ }
+ }
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ ctx->cb_fn(ctx->cb_arg, status);
+ free(ctx);
+}
+
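+/*
+ * Per-channel step of the unlock path: drop this channel's copy of the range
+ * and resubmit any I/O that was queued while the range was locked.
+ */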
+static void
+bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ TAILQ_HEAD(, spdk_bdev_io) io_locked;
+ struct spdk_bdev_io *bdev_io;
+ struct lba_range *range;
+
+ TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
+ if (ctx->range.offset == range->offset &&
+ ctx->range.length == range->length &&
+ ctx->range.locked_ctx == range->locked_ctx) {
+ TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
+ free(range);
+ break;
+ }
+ }
+
+ /* Note: we should almost always be able to assert that the range specified
+ * was found. But there are some very rare corner cases where a new channel
+ * gets created simultaneously with a range unlock, where this function
+ * would execute on that new channel and wouldn't have the range.
+ * We also use this to clean up range allocations when a later allocation
+ * fails in the locking path.
+ * So we can't actually assert() here.
+ */
+
+ /* Swap the locked I/O into a temporary list, and then try to submit them again.
+ * We could hyper-optimize this to only resubmit locked I/O that overlaps
+ * with the range that was just unlocked, but this isn't a performance path so
+ * we go for simplicity here.
+ */
+ TAILQ_INIT(&io_locked);
+ TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
+ while (!TAILQ_EMPTY(&io_locked)) {
+ bdev_io = TAILQ_FIRST(&io_locked);
+ TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
+ bdev_io_submit(bdev_io);
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static int
+bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
+ uint64_t offset, uint64_t length,
+ lock_range_cb cb_fn, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct locked_lba_range_ctx *ctx;
+ struct lba_range *range;
+ bool range_found = false;
+
+ /* Let's make sure the specified channel actually has a lock on
+ * the specified range. Note that the range must match exactly.
+ */
+ TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
+ if (range->offset == offset && range->length == length &&
+ range->owner_ch == ch && range->locked_ctx == cb_arg) {
+ range_found = true;
+ break;
+ }
+ }
+
+ if (!range_found) {
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&bdev->internal.mutex);
+ /* We confirmed that this channel has locked the specified range. To
+ * start the unlock process, we find the range in the bdev's locked_ranges
+ * and remove it. This ensures new channels don't inherit the locked range.
+ * Then we will send a message to each channel (including the one specified
+ * here) to remove the range from its per-channel list.
+ */
+ TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
+ if (range->offset == offset && range->length == length &&
+ range->locked_ctx == cb_arg) {
+ break;
+ }
+ }
+ if (range == NULL) {
+ assert(false);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+ return -EINVAL;
+ }
+ TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
+ ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
+ pthread_mutex_unlock(&bdev->internal.mutex);
+
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
+ bdev_unlock_lba_range_cb);
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV)
+
+SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
+{
+ spdk_trace_register_owner(OWNER_BDEV, 'b');
+ spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
+ spdk_trace_register_description("BDEV_IO_START", TRACE_BDEV_IO_START, OWNER_BDEV,
+ OBJECT_BDEV_IO, 1, 0, "type: ");
+ spdk_trace_register_description("BDEV_IO_DONE", TRACE_BDEV_IO_DONE, OWNER_BDEV,
+ OBJECT_BDEV_IO, 0, 0, "");
+}
diff --git a/src/spdk/lib/bdev/bdev_internal.h b/src/spdk/lib/bdev/bdev_internal.h
new file mode 100644
index 000000000..d1fa6e65a
--- /dev/null
+++ b/src/spdk/lib/bdev/bdev_internal.h
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_INTERNAL_H
+#define SPDK_BDEV_INTERNAL_H
+
+#include "spdk/bdev.h"
+
+struct spdk_bdev;
+struct spdk_bdev_io;
+struct spdk_bdev_channel;
+
+struct spdk_bdev_io *bdev_channel_get_io(struct spdk_bdev_channel *channel);
+
+void bdev_io_init(struct spdk_bdev_io *bdev_io, struct spdk_bdev *bdev, void *cb_arg,
+ spdk_bdev_io_completion_cb cb);
+
+void bdev_io_submit(struct spdk_bdev_io *bdev_io);
+
+#endif /* SPDK_BDEV_INTERNAL_H */
diff --git a/src/spdk/lib/bdev/bdev_rpc.c b/src/spdk/lib/bdev/bdev_rpc.c
new file mode 100644
index 000000000..6ce7136c4
--- /dev/null
+++ b/src/spdk/lib/bdev/bdev_rpc.c
@@ -0,0 +1,98 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/bdev.h"
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+struct spdk_rpc_set_bdev_opts {
+ uint32_t bdev_io_pool_size;
+ uint32_t bdev_io_cache_size;
+ bool bdev_auto_examine;
+};
+
+static const struct spdk_json_object_decoder rpc_set_bdev_opts_decoders[] = {
+ {"bdev_io_pool_size", offsetof(struct spdk_rpc_set_bdev_opts, bdev_io_pool_size), spdk_json_decode_uint32, true},
+ {"bdev_io_cache_size", offsetof(struct spdk_rpc_set_bdev_opts, bdev_io_cache_size), spdk_json_decode_uint32, true},
+ {"bdev_auto_examine", offsetof(struct spdk_rpc_set_bdev_opts, bdev_auto_examine), spdk_json_decode_bool, true},
+};
+
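+/*
+ * Startup RPC handler for bdev_set_options: overlay any user-supplied values
+ * on the current global bdev options and apply them before the bdev layer is
+ * initialized.
+ */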
+static void
+rpc_bdev_set_options(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
+{
+ struct spdk_rpc_set_bdev_opts rpc_opts;
+ struct spdk_bdev_opts bdev_opts;
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ rpc_opts.bdev_io_pool_size = UINT32_MAX;
+ rpc_opts.bdev_io_cache_size = UINT32_MAX;
+ rpc_opts.bdev_auto_examine = true;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_set_bdev_opts_decoders,
+ SPDK_COUNTOF(rpc_set_bdev_opts_decoders), &rpc_opts)) {
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+ }
+
+ spdk_bdev_get_opts(&bdev_opts);
+ if (rpc_opts.bdev_io_pool_size != UINT32_MAX) {
+ bdev_opts.bdev_io_pool_size = rpc_opts.bdev_io_pool_size;
+ }
+ if (rpc_opts.bdev_io_cache_size != UINT32_MAX) {
+ bdev_opts.bdev_io_cache_size = rpc_opts.bdev_io_cache_size;
+ }
+ bdev_opts.bdev_auto_examine = rpc_opts.bdev_auto_examine;
+ rc = spdk_bdev_set_opts(&bdev_opts);
+
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Pool size %" PRIu32 " too small for cache size %" PRIu32,
+ bdev_opts.bdev_io_pool_size, bdev_opts.bdev_io_cache_size);
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("bdev_set_options", rpc_bdev_set_options, SPDK_RPC_STARTUP)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(bdev_set_options, set_bdev_options)
diff --git a/src/spdk/lib/bdev/bdev_zone.c b/src/spdk/lib/bdev/bdev_zone.c
new file mode 100644
index 000000000..3cf2ecb67
--- /dev/null
+++ b/src/spdk/lib/bdev/bdev_zone.c
@@ -0,0 +1,201 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev_zone.h"
+#include "spdk/bdev_module.h"
+
+#include "bdev_internal.h"
+
+uint64_t
+spdk_bdev_get_zone_size(const struct spdk_bdev *bdev)
+{
+ return bdev->zone_size;
+}
+
+uint32_t
+spdk_bdev_get_max_open_zones(const struct spdk_bdev *bdev)
+{
+ return bdev->max_open_zones;
+}
+
+uint32_t
+spdk_bdev_get_optimal_open_zones(const struct spdk_bdev *bdev)
+{
+ return bdev->optimal_open_zones;
+}
+
+int
+spdk_bdev_get_zone_info(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t zone_id, size_t num_zones, struct spdk_bdev_zone_info *info,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_GET_ZONE_INFO;
+ bdev_io->u.zone_mgmt.zone_id = zone_id;
+ bdev_io->u.zone_mgmt.num_zones = num_zones;
+ bdev_io->u.zone_mgmt.buf = info;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_zone_management(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ uint64_t zone_id, enum spdk_bdev_zone_action action,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT;
+ bdev_io->u.zone_mgmt.zone_action = action;
+ bdev_io->u.zone_mgmt.zone_id = zone_id;
+ bdev_io->u.zone_mgmt.num_zones = 1;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
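+/*
+ * Common implementation for zone append with and without metadata. The zone's
+ * start LBA is stored in offset_blocks; the location actually written is
+ * reported on completion and can be read via spdk_bdev_io_get_append_location().
+ */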
+static int
+zone_bdev_append_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, void *md_buf, uint64_t zone_id, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_APPEND;
+ bdev_io->u.bdev.iovs = &bdev_io->iov;
+ bdev_io->u.bdev.iovs[0].iov_base = buf;
+ bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
+ bdev_io->u.bdev.iovcnt = 1;
+ bdev_io->u.bdev.md_buf = md_buf;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = zone_id;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_zone_append(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, uint64_t start_lba, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return zone_bdev_append_with_md(desc, ch, buf, NULL, start_lba, num_blocks,
+ cb, cb_arg);
+}
+
+int
+spdk_bdev_zone_append_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ void *buf, void *md, uint64_t start_lba, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return zone_bdev_append_with_md(desc, ch, buf, md, start_lba, num_blocks,
+ cb, cb_arg);
+}
+
+int
+spdk_bdev_zone_appendv_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iov, int iovcnt, void *md_buf, uint64_t zone_id,
+ uint64_t num_blocks, spdk_bdev_io_completion_cb cb,
+ void *cb_arg)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
+
+ bdev_io = bdev_channel_get_io(channel);
+ if (!bdev_io) {
+ return -ENOMEM;
+ }
+
+ bdev_io->internal.ch = channel;
+ bdev_io->internal.desc = desc;
+ bdev_io->type = SPDK_BDEV_IO_TYPE_ZONE_APPEND;
+ bdev_io->u.bdev.iovs = iov;
+ bdev_io->u.bdev.iovcnt = iovcnt;
+ bdev_io->u.bdev.md_buf = md_buf;
+ bdev_io->u.bdev.num_blocks = num_blocks;
+ bdev_io->u.bdev.offset_blocks = zone_id;
+ bdev_io_init(bdev_io, bdev, cb_arg, cb);
+
+ bdev_io_submit(bdev_io);
+ return 0;
+}
+
+int
+spdk_bdev_zone_appendv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
+ struct iovec *iovs, int iovcnt, uint64_t zone_id, uint64_t num_blocks,
+ spdk_bdev_io_completion_cb cb, void *cb_arg)
+{
+ return spdk_bdev_zone_appendv_with_md(desc, ch, iovs, iovcnt, NULL, zone_id, num_blocks,
+ cb, cb_arg);
+}
+
+uint64_t
+spdk_bdev_io_get_append_location(struct spdk_bdev_io *bdev_io)
+{
+ return bdev_io->u.bdev.offset_blocks;
+}
diff --git a/src/spdk/lib/bdev/part.c b/src/spdk/lib/bdev/part.c
new file mode 100644
index 000000000..01a395591
--- /dev/null
+++ b/src/spdk/lib/bdev/part.c
@@ -0,0 +1,524 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Common code for partition-like virtual bdevs.
+ */
+
+#include "spdk/bdev.h"
+#include "spdk/likely.h"
+#include "spdk/log.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+
+#include "spdk/bdev_module.h"
+
+struct spdk_bdev_part_base {
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *desc;
+ uint32_t ref;
+ uint32_t channel_size;
+ spdk_bdev_part_base_free_fn base_free_fn;
+ void *ctx;
+ bool claimed;
+ struct spdk_bdev_module *module;
+ struct spdk_bdev_fn_table *fn_table;
+ struct bdev_part_tailq *tailq;
+ spdk_io_channel_create_cb ch_create_cb;
+ spdk_io_channel_destroy_cb ch_destroy_cb;
+ struct spdk_thread *thread;
+};
+
+struct spdk_bdev *
+spdk_bdev_part_base_get_bdev(struct spdk_bdev_part_base *part_base)
+{
+ return part_base->bdev;
+}
+
+struct spdk_bdev_desc *
+spdk_bdev_part_base_get_desc(struct spdk_bdev_part_base *part_base)
+{
+ return part_base->desc;
+}
+
+struct bdev_part_tailq *
+spdk_bdev_part_base_get_tailq(struct spdk_bdev_part_base *part_base)
+{
+ return part_base->tailq;
+}
+
+void *
+spdk_bdev_part_base_get_ctx(struct spdk_bdev_part_base *part_base)
+{
+ return part_base->ctx;
+}
+
+const char *
+spdk_bdev_part_base_get_bdev_name(struct spdk_bdev_part_base *part_base)
+{
+ return part_base->bdev->name;
+}
+
+static void
+bdev_part_base_free(void *ctx)
+{
+ struct spdk_bdev_desc *desc = ctx;
+
+ spdk_bdev_close(desc);
+}
+
+void
+spdk_bdev_part_base_free(struct spdk_bdev_part_base *base)
+{
+ if (base->desc) {
+ /* Close the underlying bdev on the thread where it was opened. */
+ if (base->thread && base->thread != spdk_get_thread()) {
+ spdk_thread_send_msg(base->thread, bdev_part_base_free, base->desc);
+ } else {
+ spdk_bdev_close(base->desc);
+ }
+ }
+
+ if (base->base_free_fn != NULL) {
+ base->base_free_fn(base->ctx);
+ }
+
+ free(base);
+}
+
+static void
+bdev_part_free_cb(void *io_device)
+{
+ struct spdk_bdev_part *part = io_device;
+ struct spdk_bdev_part_base *base;
+
+ assert(part);
+ assert(part->internal.base);
+
+ base = part->internal.base;
+
+ TAILQ_REMOVE(base->tailq, part, tailq);
+
+ if (--base->ref == 0) {
+ spdk_bdev_module_release_bdev(base->bdev);
+ spdk_bdev_part_base_free(base);
+ }
+
+ spdk_bdev_destruct_done(&part->internal.bdev, 0);
+ free(part->internal.bdev.name);
+ free(part->internal.bdev.product_name);
+ free(part);
+}
+
+int
+spdk_bdev_part_free(struct spdk_bdev_part *part)
+{
+ spdk_io_device_unregister(part, bdev_part_free_cb);
+
+ /* Return 1 to indicate that this is an asynchronous operation that isn't complete
+ * until spdk_bdev_destruct_done is called */
+ return 1;
+}
+
+void
+spdk_bdev_part_base_hotremove(struct spdk_bdev_part_base *part_base, struct bdev_part_tailq *tailq)
+{
+ struct spdk_bdev_part *part, *tmp;
+
+ TAILQ_FOREACH_SAFE(part, tailq, tailq, tmp) {
+ if (part->internal.base == part_base) {
+ spdk_bdev_unregister(&part->internal.bdev, NULL, NULL);
+ }
+ }
+}
+
+static bool
+bdev_part_io_type_supported(void *_part, enum spdk_bdev_io_type io_type)
+{
+ struct spdk_bdev_part *part = _part;
+
+ /* We can't decode/modify passthrough NVMe commands, so don't report
+ * that a partition supports these I/O types, even if the underlying
+ * bdev does.
+ */
+ switch (io_type) {
+ case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
+ case SPDK_BDEV_IO_TYPE_NVME_IO:
+ case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
+ return false;
+ default:
+ break;
+ }
+
+ return part->internal.base->bdev->fn_table->io_type_supported(part->internal.base->bdev->ctxt,
+ io_type);
+}
+
+static struct spdk_io_channel *
+bdev_part_get_io_channel(void *_part)
+{
+ struct spdk_bdev_part *part = _part;
+
+ return spdk_get_io_channel(part);
+}
+
+struct spdk_bdev *
+spdk_bdev_part_get_bdev(struct spdk_bdev_part *part)
+{
+ return &part->internal.bdev;
+}
+
+struct spdk_bdev_part_base *
+spdk_bdev_part_get_base(struct spdk_bdev_part *part)
+{
+ return part->internal.base;
+}
+
+struct spdk_bdev *
+spdk_bdev_part_get_base_bdev(struct spdk_bdev_part *part)
+{
+ return part->internal.base->bdev;
+}
+
+uint64_t
+spdk_bdev_part_get_offset_blocks(struct spdk_bdev_part *part)
+{
+ return part->internal.offset_blocks;
+}
+
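+/*
+ * Remap the DIF reference tags of an I/O to account for the partition's block
+ * offset on the base bdev. This is a no-op unless reference tag checking is
+ * enabled, and it handles both interleaved and separate metadata layouts.
+ */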
+static int
+bdev_part_remap_dif(struct spdk_bdev_io *bdev_io, uint32_t offset,
+ uint32_t remapped_offset)
+{
+ struct spdk_bdev *bdev = bdev_io->bdev;
+ struct spdk_dif_ctx dif_ctx;
+ struct spdk_dif_error err_blk = {};
+ int rc;
+
+ if (spdk_likely(!(bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK))) {
+ return 0;
+ }
+
+ rc = spdk_dif_ctx_init(&dif_ctx,
+ bdev->blocklen, bdev->md_len, bdev->md_interleave,
+ bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags,
+ offset, 0, 0, 0, 0);
+ if (rc != 0) {
+ SPDK_ERRLOG("Initialization of DIF context failed\n");
+ return rc;
+ }
+
+ spdk_dif_ctx_set_remapped_init_ref_tag(&dif_ctx, remapped_offset);
+
+ if (bdev->md_interleave) {
+ rc = spdk_dif_remap_ref_tag(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
+ } else {
+ struct iovec md_iov = {
+ .iov_base = bdev_io->u.bdev.md_buf,
+ .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
+ };
+
+ rc = spdk_dix_remap_ref_tag(&md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
+ }
+
+ if (rc != 0) {
+ SPDK_ERRLOG("Remapping reference tag failed. type=%d, offset=%" PRIu32 "\n",
+ err_blk.err_type, err_blk.err_offset);
+ }
+
+ return rc;
+}
+
+static void
+bdev_part_complete_read_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *part_io = cb_arg;
+ uint32_t offset, remapped_offset;
+ int rc, status;
+
+ offset = bdev_io->u.bdev.offset_blocks;
+ remapped_offset = part_io->u.bdev.offset_blocks;
+
+ if (success) {
+ rc = bdev_part_remap_dif(bdev_io, offset, remapped_offset);
+ if (rc != 0) {
+ success = false;
+ }
+ }
+
+ status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+
+ spdk_bdev_io_complete(part_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+bdev_part_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *part_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+
+ spdk_bdev_io_complete(part_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+bdev_part_complete_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_bdev_io *part_io = cb_arg;
+ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
+
+ spdk_bdev_io_set_buf(part_io, bdev_io->u.bdev.iovs[0].iov_base, bdev_io->u.bdev.iovs[0].iov_len);
+ spdk_bdev_io_complete(part_io, status);
+ spdk_bdev_free_io(bdev_io);
+}
+
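+/*
+ * Translate an I/O submitted to a partition into an I/O on the base bdev by
+ * shifting its offset by the partition's starting block and resubmitting it
+ * through the base descriptor and channel.
+ */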
+int
+spdk_bdev_part_submit_request(struct spdk_bdev_part_channel *ch, struct spdk_bdev_io *bdev_io)
+{
+ struct spdk_bdev_part *part = ch->part;
+ struct spdk_io_channel *base_ch = ch->base_ch;
+ struct spdk_bdev_desc *base_desc = part->internal.base->desc;
+ uint64_t offset, remapped_offset;
+ int rc = 0;
+
+ offset = bdev_io->u.bdev.offset_blocks;
+ remapped_offset = offset + part->internal.offset_blocks;
+
+ /* Modify the I/O to adjust for the offset within the base bdev. */
+ switch (bdev_io->type) {
+ case SPDK_BDEV_IO_TYPE_READ:
+ if (bdev_io->u.bdev.md_buf == NULL) {
+ rc = spdk_bdev_readv_blocks(base_desc, base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, remapped_offset,
+ bdev_io->u.bdev.num_blocks,
+ bdev_part_complete_read_io, bdev_io);
+ } else {
+ rc = spdk_bdev_readv_blocks_with_md(base_desc, base_ch,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf, remapped_offset,
+ bdev_io->u.bdev.num_blocks,
+ bdev_part_complete_read_io, bdev_io);
+ }
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE:
+ rc = bdev_part_remap_dif(bdev_io, offset, remapped_offset);
+ if (rc != 0) {
+ return SPDK_BDEV_IO_STATUS_FAILED;
+ }
+
+ if (bdev_io->u.bdev.md_buf == NULL) {
+ rc = spdk_bdev_writev_blocks(base_desc, base_ch, bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt, remapped_offset,
+ bdev_io->u.bdev.num_blocks,
+ bdev_part_complete_io, bdev_io);
+ } else {
+ rc = spdk_bdev_writev_blocks_with_md(base_desc, base_ch,
+ bdev_io->u.bdev.iovs,
+ bdev_io->u.bdev.iovcnt,
+ bdev_io->u.bdev.md_buf, remapped_offset,
+ bdev_io->u.bdev.num_blocks,
+ bdev_part_complete_io, bdev_io);
+ }
+ break;
+ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
+ rc = spdk_bdev_write_zeroes_blocks(base_desc, base_ch, remapped_offset,
+ bdev_io->u.bdev.num_blocks, bdev_part_complete_io,
+ bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_UNMAP:
+ rc = spdk_bdev_unmap_blocks(base_desc, base_ch, remapped_offset,
+ bdev_io->u.bdev.num_blocks, bdev_part_complete_io,
+ bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_FLUSH:
+ rc = spdk_bdev_flush_blocks(base_desc, base_ch, remapped_offset,
+ bdev_io->u.bdev.num_blocks, bdev_part_complete_io,
+ bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_RESET:
+ rc = spdk_bdev_reset(base_desc, base_ch,
+ bdev_part_complete_io, bdev_io);
+ break;
+ case SPDK_BDEV_IO_TYPE_ZCOPY:
+ rc = spdk_bdev_zcopy_start(base_desc, base_ch, remapped_offset,
+ bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.zcopy.populate,
+ bdev_part_complete_zcopy_io, bdev_io);
+ break;
+ default:
+ SPDK_ERRLOG("unknown I/O type %d\n", bdev_io->type);
+ return SPDK_BDEV_IO_STATUS_FAILED;
+ }
+
+ return rc;
+}
+
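+/*
+ * Channel create callback for the partition io_device: acquire a channel on
+ * the base bdev, then run the module-specific create callback, if any.
+ */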
+static int
+bdev_part_channel_create_cb(void *io_device, void *ctx_buf)
+{
+ struct spdk_bdev_part *part = (struct spdk_bdev_part *)io_device;
+ struct spdk_bdev_part_channel *ch = ctx_buf;
+
+ ch->part = part;
+ ch->base_ch = spdk_bdev_get_io_channel(part->internal.base->desc);
+ if (ch->base_ch == NULL) {
+ return -1;
+ }
+
+ if (part->internal.base->ch_create_cb) {
+ return part->internal.base->ch_create_cb(io_device, ctx_buf);
+ } else {
+ return 0;
+ }
+}
+
+static void
+bdev_part_channel_destroy_cb(void *io_device, void *ctx_buf)
+{
+ struct spdk_bdev_part *part = (struct spdk_bdev_part *)io_device;
+ struct spdk_bdev_part_channel *ch = ctx_buf;
+
+ if (part->internal.base->ch_destroy_cb) {
+ part->internal.base->ch_destroy_cb(io_device, ctx_buf);
+ }
+ spdk_put_io_channel(ch->base_ch);
+}
+
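+/*
+ * Construct the shared base object for a set of partitions: override the
+ * partition fn_table's channel and io_type callbacks, open the underlying bdev
+ * read-only, and record the thread it was opened on so it can be closed there.
+ */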
+struct spdk_bdev_part_base *
+ spdk_bdev_part_base_construct(struct spdk_bdev *bdev,
+ spdk_bdev_remove_cb_t remove_cb, struct spdk_bdev_module *module,
+ struct spdk_bdev_fn_table *fn_table, struct bdev_part_tailq *tailq,
+ spdk_bdev_part_base_free_fn free_fn, void *ctx,
+ uint32_t channel_size, spdk_io_channel_create_cb ch_create_cb,
+ spdk_io_channel_destroy_cb ch_destroy_cb)
+{
+ int rc;
+ struct spdk_bdev_part_base *base;
+
+ base = calloc(1, sizeof(*base));
+ if (!base) {
+ SPDK_ERRLOG("Memory allocation failure\n");
+ return NULL;
+ }
+ fn_table->get_io_channel = bdev_part_get_io_channel;
+ fn_table->io_type_supported = bdev_part_io_type_supported;
+
+ base->bdev = bdev;
+ base->desc = NULL;
+ base->ref = 0;
+ base->module = module;
+ base->fn_table = fn_table;
+ base->tailq = tailq;
+ base->base_free_fn = free_fn;
+ base->ctx = ctx;
+ base->claimed = false;
+ base->channel_size = channel_size;
+ base->ch_create_cb = ch_create_cb;
+ base->ch_destroy_cb = ch_destroy_cb;
+
+ rc = spdk_bdev_open(bdev, false, remove_cb, base, &base->desc);
+ if (rc) {
+ spdk_bdev_part_base_free(base);
+ SPDK_ERRLOG("could not open bdev %s: %s\n", spdk_bdev_get_name(bdev),
+ spdk_strerror(-rc));
+ return NULL;
+ }
+
+ /* Save the thread where the base device is opened */
+ base->thread = spdk_get_thread();
+
+ return base;
+}
+
+int
+spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base *base,
+ char *name, uint64_t offset_blocks, uint64_t num_blocks,
+ char *product_name)
+{
+ part->internal.bdev.blocklen = base->bdev->blocklen;
+ part->internal.bdev.blockcnt = num_blocks;
+ part->internal.offset_blocks = offset_blocks;
+
+ part->internal.bdev.write_cache = base->bdev->write_cache;
+ part->internal.bdev.required_alignment = base->bdev->required_alignment;
+ part->internal.bdev.ctxt = part;
+ part->internal.bdev.module = base->module;
+ part->internal.bdev.fn_table = base->fn_table;
+
+ part->internal.bdev.md_interleave = base->bdev->md_interleave;
+ part->internal.bdev.md_len = base->bdev->md_len;
+ part->internal.bdev.dif_type = base->bdev->dif_type;
+ part->internal.bdev.dif_is_head_of_md = base->bdev->dif_is_head_of_md;
+ part->internal.bdev.dif_check_flags = base->bdev->dif_check_flags;
+
+ part->internal.bdev.name = strdup(name);
+ part->internal.bdev.product_name = strdup(product_name);
+
+ if (part->internal.bdev.name == NULL) {
+ SPDK_ERRLOG("Failed to allocate name for new part of bdev %s\n", spdk_bdev_get_name(base->bdev));
+ return -1;
+ } else if (part->internal.bdev.product_name == NULL) {
+ free(part->internal.bdev.name);
+ SPDK_ERRLOG("Failed to allocate product name for new part of bdev %s\n",
+ spdk_bdev_get_name(base->bdev));
+ return -1;
+ }
+
+ base->ref++;
+ part->internal.base = base;
+
+ if (!base->claimed) {
+ int rc;
+
+ rc = spdk_bdev_module_claim_bdev(base->bdev, base->desc, base->module);
+ if (rc) {
+ SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(base->bdev));
+ free(part->internal.bdev.name);
+ free(part->internal.bdev.product_name);
+ return -1;
+ }
+ base->claimed = true;
+ }
+
+ spdk_io_device_register(part, bdev_part_channel_create_cb,
+ bdev_part_channel_destroy_cb,
+ base->channel_size,
+ name);
+
+ spdk_bdev_register(&part->internal.bdev);
+ TAILQ_INSERT_TAIL(base->tailq, part, tailq);
+
+ return 0;
+}
diff --git a/src/spdk/lib/bdev/scsi_nvme.c b/src/spdk/lib/bdev/scsi_nvme.c
new file mode 100644
index 000000000..f9fe319bd
--- /dev/null
+++ b/src/spdk/lib/bdev/scsi_nvme.c
@@ -0,0 +1,261 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) 2016 FUJITSU LIMITED, All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/bdev_module.h"
+
+#include "spdk/nvme_spec.h"
+
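+/*
+ * Translate the NVMe completion status (status code type and status code) of a
+ * bdev I/O into the closest matching SCSI status, sense key, ASC and ASCQ.
+ */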
+void
+spdk_scsi_nvme_translate(const struct spdk_bdev_io *bdev_io, int *sc, int *sk,
+ int *asc, int *ascq)
+{
+ int nvme_sct = bdev_io->internal.error.nvme.sct;
+ int nvme_sc = bdev_io->internal.error.nvme.sc;
+
+ switch (nvme_sct) {
+ case SPDK_NVME_SCT_GENERIC:
+ switch (nvme_sc) {
+ case SPDK_NVME_SC_SUCCESS:
+ *sc = SPDK_SCSI_STATUS_GOOD;
+ *sk = SPDK_SCSI_SENSE_NO_SENSE;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_INVALID_OPCODE:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_INVALID_FIELD:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_DATA_TRANSFER_ERROR:
+ case SPDK_NVME_SC_CAPACITY_EXCEEDED:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_ABORTED_POWER_LOSS:
+ *sc = SPDK_SCSI_STATUS_TASK_ABORTED;
+ *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
+ *asc = SPDK_SCSI_ASC_WARNING;
+ *ascq = SPDK_SCSI_ASCQ_POWER_LOSS_EXPECTED;
+ break;
+ case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_HARDWARE_ERROR;
+ *asc = SPDK_SCSI_ASC_INTERNAL_TARGET_FAILURE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_ABORTED_BY_REQUEST:
+ case SPDK_NVME_SC_ABORTED_SQ_DELETION:
+ case SPDK_NVME_SC_ABORTED_FAILED_FUSED:
+ case SPDK_NVME_SC_ABORTED_MISSING_FUSED:
+ *sc = SPDK_SCSI_STATUS_TASK_ABORTED;
+ *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_ACCESS_DENIED;
+ *ascq = SPDK_SCSI_ASCQ_INVALID_LU_IDENTIFIER;
+ break;
+ case SPDK_NVME_SC_LBA_OUT_OF_RANGE:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_NAMESPACE_NOT_READY:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_NOT_READY;
+ *asc = SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_RESERVATION_CONFLICT:
+ *sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT;
+ *sk = SPDK_SCSI_SENSE_NO_SENSE;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_COMMAND_ID_CONFLICT:
+ case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR:
+ case SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR:
+ case SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS:
+ case SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID:
+ case SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID:
+ case SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID:
+ case SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF:
+ case SPDK_NVME_SC_INVALID_PRP_OFFSET:
+ case SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED:
+ case SPDK_NVME_SC_INVALID_SGL_OFFSET:
+ case SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT:
+ case SPDK_NVME_SC_KEEP_ALIVE_EXPIRED:
+ case SPDK_NVME_SC_KEEP_ALIVE_INVALID:
+ case SPDK_NVME_SC_FORMAT_IN_PROGRESS:
+ default:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ }
+ break;
+ case SPDK_NVME_SCT_COMMAND_SPECIFIC:
+ switch (nvme_sc) {
+ case SPDK_NVME_SC_COMPLETION_QUEUE_INVALID:
+ case SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_INVALID_FORMAT:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_FORMAT_COMMAND_FAILED;
+ *ascq = SPDK_SCSI_ASCQ_FORMAT_COMMAND_FAILED;
+ break;
+ case SPDK_NVME_SC_CONFLICTING_ATTRIBUTES:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_RANGE:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_DATA_PROTECT;
+ *asc = SPDK_SCSI_ASC_WRITE_PROTECTED;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER:
+ case SPDK_NVME_SC_INVALID_QUEUE_SIZE:
+ case SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED:
+ case SPDK_NVME_SC_INVALID_FIRMWARE_SLOT:
+ case SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE:
+ case SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR:
+ case SPDK_NVME_SC_INVALID_LOG_PAGE:
+ case SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET:
+ case SPDK_NVME_SC_INVALID_QUEUE_DELETION:
+ case SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE:
+ case SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE:
+ case SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC:
+ case SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET:
+ case SPDK_NVME_SC_FIRMWARE_REQ_RESET:
+ case SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION:
+ case SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED:
+ case SPDK_NVME_SC_OVERLAPPING_RANGE:
+ case SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY:
+ case SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE:
+ case SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED:
+ case SPDK_NVME_SC_NAMESPACE_IS_PRIVATE:
+ case SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED:
+ case SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED:
+ case SPDK_NVME_SC_CONTROLLER_LIST_INVALID:
+ case SPDK_NVME_SC_INVALID_PROTECTION_INFO:
+ default:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ }
+ break;
+ case SPDK_NVME_SCT_MEDIA_ERROR:
+ switch (nvme_sc) {
+ case SPDK_NVME_SC_WRITE_FAULTS:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR;
+ *asc = SPDK_SCSI_ASC_PERIPHERAL_DEVICE_WRITE_FAULT;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_UNRECOVERED_READ_ERROR:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR;
+ *asc = SPDK_SCSI_ASC_UNRECOVERED_READ_ERROR;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_GUARD_CHECK_ERROR:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR;
+ *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_GUARD_CHECK_FAILED;
+ *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_GUARD_CHECK_FAILED;
+ break;
+ case SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR;
+ *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED;
+ *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED;
+ break;
+ case SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR;
+ *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED;
+ *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED;
+ break;
+ case SPDK_NVME_SC_COMPARE_FAILURE:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_MISCOMPARE;
+ *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case SPDK_NVME_SC_ACCESS_DENIED:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_DATA_PROTECT;
+ *asc = SPDK_SCSI_ASC_ACCESS_DENIED;
+ *ascq = SPDK_SCSI_ASCQ_NO_ACCESS_RIGHTS;
+ break;
+ case SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK:
+ default:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ }
+ break;
+ case SPDK_NVME_SCT_VENDOR_SPECIFIC:
+ default:
+ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ }
+}
diff --git a/src/spdk/lib/bdev/spdk_bdev.map b/src/spdk/lib/bdev/spdk_bdev.map
new file mode 100644
index 000000000..9f9c3c7e5
--- /dev/null
+++ b/src/spdk/lib/bdev/spdk_bdev.map
@@ -0,0 +1,154 @@
+{
+ global:
+
+ # Public functions in bdev.h
+ spdk_bdev_get_opts;
+ spdk_bdev_set_opts;
+ spdk_bdev_initialize;
+ spdk_bdev_finish;
+ spdk_bdev_config_text;
+ spdk_bdev_subsystem_config_json;
+ spdk_bdev_get_by_name;
+ spdk_bdev_first;
+ spdk_bdev_next;
+ spdk_bdev_first_leaf;
+ spdk_bdev_next_leaf;
+ spdk_bdev_open;
+ spdk_bdev_open_ext;
+ spdk_bdev_close;
+ spdk_bdev_desc_get_bdev;
+ spdk_bdev_set_timeout;
+ spdk_bdev_io_type_supported;
+ spdk_bdev_dump_info_json;
+ spdk_bdev_get_name;
+ spdk_bdev_get_product_name;
+ spdk_bdev_get_block_size;
+ spdk_bdev_get_write_unit_size;
+ spdk_bdev_get_num_blocks;
+ spdk_bdev_get_qos_rpc_type;
+ spdk_bdev_get_qos_rate_limits;
+ spdk_bdev_set_qos_rate_limits;
+ spdk_bdev_get_buf_align;
+ spdk_bdev_get_optimal_io_boundary;
+ spdk_bdev_has_write_cache;
+ spdk_bdev_get_uuid;
+ spdk_bdev_get_acwu;
+ spdk_bdev_get_md_size;
+ spdk_bdev_is_md_interleaved;
+ spdk_bdev_is_md_separate;
+ spdk_bdev_is_zoned;
+ spdk_bdev_get_data_block_size;
+ spdk_bdev_get_dif_type;
+ spdk_bdev_is_dif_head_of_md;
+ spdk_bdev_is_dif_check_enabled;
+ spdk_bdev_get_qd;
+ spdk_bdev_get_qd_sampling_period;
+ spdk_bdev_set_qd_sampling_period;
+ spdk_bdev_get_io_time;
+ spdk_bdev_get_weighted_io_time;
+ spdk_bdev_get_io_channel;
+ spdk_bdev_read;
+ spdk_bdev_read_blocks;
+ spdk_bdev_read_blocks_with_md;
+ spdk_bdev_readv;
+ spdk_bdev_readv_blocks;
+ spdk_bdev_readv_blocks_with_md;
+ spdk_bdev_write;
+ spdk_bdev_write_blocks;
+ spdk_bdev_write_blocks_with_md;
+ spdk_bdev_writev;
+ spdk_bdev_writev_blocks;
+ spdk_bdev_writev_blocks_with_md;
+ spdk_bdev_compare_blocks;
+ spdk_bdev_compare_blocks_with_md;
+ spdk_bdev_comparev_blocks;
+ spdk_bdev_comparev_blocks_with_md;
+ spdk_bdev_comparev_and_writev_blocks;
+ spdk_bdev_zcopy_start;
+ spdk_bdev_zcopy_end;
+ spdk_bdev_write_zeroes;
+ spdk_bdev_write_zeroes_blocks;
+ spdk_bdev_unmap;
+ spdk_bdev_unmap_blocks;
+ spdk_bdev_flush;
+ spdk_bdev_flush_blocks;
+ spdk_bdev_reset;
+ spdk_bdev_abort;
+ spdk_bdev_nvme_admin_passthru;
+ spdk_bdev_nvme_io_passthru;
+ spdk_bdev_nvme_io_passthru_md;
+ spdk_bdev_free_io;
+ spdk_bdev_queue_io_wait;
+ spdk_bdev_get_io_stat;
+ spdk_bdev_get_device_stat;
+ spdk_bdev_io_get_nvme_status;
+ spdk_bdev_io_get_nvme_fused_status;
+ spdk_bdev_io_get_scsi_status;
+ spdk_bdev_io_get_iovec;
+ spdk_bdev_io_get_md_buf;
+ spdk_bdev_io_get_cb_arg;
+ spdk_bdev_histogram_enable;
+ spdk_bdev_histogram_get;
+ spdk_bdev_get_media_events;
+
+ # Public functions in bdev_module.h
+ spdk_bdev_register;
+ spdk_bdev_unregister;
+ spdk_bdev_destruct_done;
+ spdk_vbdev_register;
+ spdk_bdev_module_examine_done;
+ spdk_bdev_module_init_done;
+ spdk_bdev_module_finish_done;
+ spdk_bdev_module_claim_bdev;
+ spdk_bdev_module_release_bdev;
+ spdk_bdev_alias_add;
+ spdk_bdev_alias_del;
+ spdk_bdev_alias_del_all;
+ spdk_bdev_get_aliases;
+ spdk_bdev_io_get_buf;
+ spdk_bdev_io_get_aux_buf;
+ spdk_bdev_io_put_aux_buf;
+ spdk_bdev_io_set_buf;
+ spdk_bdev_io_set_md_buf;
+ spdk_bdev_io_complete;
+ spdk_bdev_io_complete_nvme_status;
+ spdk_bdev_io_complete_scsi_status;
+ spdk_bdev_io_get_thread;
+ spdk_bdev_io_get_io_channel;
+ spdk_bdev_notify_blockcnt_change;
+ spdk_scsi_nvme_translate;
+ spdk_bdev_module_list_add;
+ spdk_bdev_module_list_find;
+ spdk_bdev_part_base_get_bdev;
+ spdk_bdev_part_base_get_bdev_name;
+ spdk_bdev_part_base_get_desc;
+ spdk_bdev_part_base_get_tailq;
+ spdk_bdev_part_base_get_ctx;
+ spdk_bdev_part_base_free;
+ spdk_bdev_part_free;
+ spdk_bdev_part_base_hotremove;
+ spdk_bdev_part_base_construct;
+ spdk_bdev_part_construct;
+ spdk_bdev_part_submit_request;
+ spdk_bdev_part_get_bdev;
+ spdk_bdev_part_get_base;
+ spdk_bdev_part_get_base_bdev;
+ spdk_bdev_part_get_offset_blocks;
+ spdk_bdev_push_media_events;
+ spdk_bdev_notify_media_management;
+
+ # Public functions in bdev_zone.h
+ spdk_bdev_get_zone_size;
+ spdk_bdev_get_max_open_zones;
+ spdk_bdev_get_optimal_open_zones;
+ spdk_bdev_get_zone_info;
+ spdk_bdev_zone_management;
+ spdk_bdev_zone_append;
+ spdk_bdev_zone_appendv;
+ spdk_bdev_zone_append_with_md;
+ spdk_bdev_zone_appendv_with_md;
+ spdk_bdev_io_get_append_location;
+
+ # Everything else
+ local: *;
+};
diff --git a/src/spdk/lib/bdev/vtune.c b/src/spdk/lib/bdev/vtune.c
new file mode 100644
index 000000000..2cb48826e
--- /dev/null
+++ b/src/spdk/lib/bdev/vtune.c
@@ -0,0 +1,49 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/config.h"
+#if SPDK_CONFIG_VTUNE
+
+/* Disable warnings triggered by the VTune code */
+#if defined(__GNUC__) && \
+ __GNUC__ > 4 || \
+ (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic ignored "-Wsign-compare"
+#if __GNUC__ >= 7
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+#endif
+#endif
+
+#include "ittnotify_static.c"
+
+#endif
diff --git a/src/spdk/lib/blob/Makefile b/src/spdk/lib/blob/Makefile
new file mode 100644
index 000000000..53ae6800b
--- /dev/null
+++ b/src/spdk/lib/blob/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+C_SRCS = blobstore.c request.c zeroes.c blob_bs_dev.c
+LIBNAME = blob
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blob.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/blob/blob_bs_dev.c b/src/spdk/lib/blob/blob_bs_dev.c
new file mode 100644
index 000000000..8705a1c16
--- /dev/null
+++ b/src/spdk/lib/blob/blob_bs_dev.c
@@ -0,0 +1,150 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/blob.h"
+#include "spdk/log.h"
+#include "blobstore.h"
+
+static void
+blob_bs_dev_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
+ uint64_t lba, uint32_t lba_count,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM);
+ assert(false);
+}
+
+static void
+blob_bs_dev_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM);
+ assert(false);
+}
+
+static void
+blob_bs_dev_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ uint64_t lba, uint32_t lba_count,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM);
+ assert(false);
+}
+
+static void
+blob_bs_dev_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ uint64_t lba, uint32_t lba_count,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM);
+ assert(false);
+}
+
+static void
+blob_bs_dev_read_cpl(void *cb_arg, int bserrno)
+{
+ struct spdk_bs_dev_cb_args *cb_args = (struct spdk_bs_dev_cb_args *)cb_arg;
+
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno);
+}
+
+static inline void
+blob_bs_dev_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
+ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev;
+
+ spdk_blob_io_read(b->blob, channel, payload, lba, lba_count,
+ blob_bs_dev_read_cpl, cb_args);
+}
+
+static inline void
+blob_bs_dev_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev;
+
+ spdk_blob_io_readv(b->blob, channel, iov, iovcnt, lba, lba_count,
+ blob_bs_dev_read_cpl, cb_args);
+}
+
+static void
+blob_bs_dev_destroy_cpl(void *cb_arg, int bserrno)
+{
+ if (bserrno != 0) {
+		SPDK_ERRLOG("Error on blob_bs_dev destroy: %d\n", bserrno);
+ }
+
+ /* Free blob_bs_dev */
+ free(cb_arg);
+}
+
+static void
+blob_bs_dev_destroy(struct spdk_bs_dev *bs_dev)
+{
+ struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)bs_dev;
+
+ spdk_blob_close(b->blob, blob_bs_dev_destroy_cpl, b);
+}
+
+
+struct spdk_bs_dev *
+bs_create_blob_bs_dev(struct spdk_blob *blob)
+{
+ struct spdk_blob_bs_dev *b;
+
+ b = calloc(1, sizeof(*b));
+ if (b == NULL) {
+ return NULL;
+ }
+ /* snapshot blob */
+ b->bs_dev.blockcnt = blob->active.num_clusters *
+ blob->bs->pages_per_cluster * bs_io_unit_per_page(blob->bs);
+ b->bs_dev.blocklen = spdk_bs_get_io_unit_size(blob->bs);
+ b->bs_dev.create_channel = NULL;
+ b->bs_dev.destroy_channel = NULL;
+ b->bs_dev.destroy = blob_bs_dev_destroy;
+ b->bs_dev.write = blob_bs_dev_write;
+ b->bs_dev.writev = blob_bs_dev_writev;
+ b->bs_dev.read = blob_bs_dev_read;
+ b->bs_dev.readv = blob_bs_dev_readv;
+ b->bs_dev.write_zeroes = blob_bs_dev_write_zeroes;
+ b->bs_dev.unmap = blob_bs_dev_unmap;
+ b->blob = blob;
+
+ return &b->bs_dev;
+}
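
The bs_dev built above is read-only: reads are forwarded to the snapshot blob through spdk_blob_io_read()/spdk_blob_io_readv(), while write, writev, write_zeroes and unmap immediately complete with -EPERM. As a minimal sketch (not part of the patch, assuming hypothetical, already-open clone and snapshot blobs), attaching it as a clone's backing device looks roughly like what blob_load_snapshot_cpl() does later in blobstore.c:

/* Illustrative sketch only; relies on spdk/blob.h and the internal blobstore.h. */
static int
attach_snapshot_backing_dev(struct spdk_blob *clone, struct spdk_blob *snapshot)
{
	struct spdk_bs_dev *back;

	back = bs_create_blob_bs_dev(snapshot);
	if (back == NULL) {
		return -ENOMEM;
	}

	/* Unallocated clusters of the clone are now read from the snapshot;
	 * writes to the backing device itself fail with -EPERM. */
	clone->back_bs_dev = back;

	return 0;
}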
diff --git a/src/spdk/lib/blob/blobstore.c b/src/spdk/lib/blob/blobstore.c
new file mode 100644
index 000000000..768fc5b45
--- /dev/null
+++ b/src/spdk/lib/blob/blobstore.c
@@ -0,0 +1,7461 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/blob.h"
+#include "spdk/crc32.h"
+#include "spdk/env.h"
+#include "spdk/queue.h"
+#include "spdk/thread.h"
+#include "spdk/bit_array.h"
+#include "spdk/likely.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/assert.h"
+#include "spdk_internal/log.h"
+
+#include "blobstore.h"
+
+#define BLOB_CRC32C_INITIAL 0xffffffffUL
+
+static int bs_register_md_thread(struct spdk_blob_store *bs);
+static int bs_unregister_md_thread(struct spdk_blob_store *bs);
+static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
+static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
+ uint64_t cluster, uint32_t extent, spdk_blob_op_complete cb_fn, void *cb_arg);
+
+static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
+ uint16_t value_len, bool internal);
+static int blob_get_xattr_value(struct spdk_blob *blob, const char *name,
+ const void **value, size_t *value_len, bool internal);
+static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal);
+
+static void blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
+ spdk_blob_op_complete cb_fn, void *cb_arg);
+
+static void
+blob_verify_md_op(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+ assert(spdk_get_thread() == blob->bs->md_thread);
+ assert(blob->state != SPDK_BLOB_STATE_LOADING);
+}
+
+static struct spdk_blob_list *
+bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid)
+{
+ struct spdk_blob_list *snapshot_entry = NULL;
+
+ TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
+ if (snapshot_entry->id == blobid) {
+ break;
+ }
+ }
+
+ return snapshot_entry;
+}
+
+static void
+bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page)
+{
+ assert(page < spdk_bit_array_capacity(bs->used_md_pages));
+ assert(spdk_bit_array_get(bs->used_md_pages, page) == false);
+
+ spdk_bit_array_set(bs->used_md_pages, page);
+}
+
+static void
+bs_release_md_page(struct spdk_blob_store *bs, uint32_t page)
+{
+ assert(page < spdk_bit_array_capacity(bs->used_md_pages));
+ assert(spdk_bit_array_get(bs->used_md_pages, page) == true);
+
+ spdk_bit_array_clear(bs->used_md_pages, page);
+}
+
+static void
+bs_claim_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
+{
+ assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters));
+ assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == false);
+ assert(bs->num_free_clusters > 0);
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %u\n", cluster_num);
+
+ spdk_bit_array_set(bs->used_clusters, cluster_num);
+ bs->num_free_clusters--;
+}
+
+static int
+blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster)
+{
+ uint64_t *cluster_lba = &blob->active.clusters[cluster_num];
+
+ blob_verify_md_op(blob);
+
+ if (*cluster_lba != 0) {
+ return -EEXIST;
+ }
+
+ *cluster_lba = bs_cluster_to_lba(blob->bs, cluster);
+ return 0;
+}
+
+static int
+bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num,
+ uint64_t *lowest_free_cluster, uint32_t *lowest_free_md_page, bool update_map)
+{
+ uint32_t *extent_page = 0;
+
+ pthread_mutex_lock(&blob->bs->used_clusters_mutex);
+ *lowest_free_cluster = spdk_bit_array_find_first_clear(blob->bs->used_clusters,
+ *lowest_free_cluster);
+ if (*lowest_free_cluster == UINT32_MAX) {
+ /* No more free clusters. Cannot satisfy the request */
+ pthread_mutex_unlock(&blob->bs->used_clusters_mutex);
+ return -ENOSPC;
+ }
+
+ if (blob->use_extent_table) {
+ extent_page = bs_cluster_to_extent_page(blob, cluster_num);
+ if (*extent_page == 0) {
+ /* No extent_page is allocated for the cluster */
+ *lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages,
+ *lowest_free_md_page);
+ if (*lowest_free_md_page == UINT32_MAX) {
+ /* No more free md pages. Cannot satisfy the request */
+ pthread_mutex_unlock(&blob->bs->used_clusters_mutex);
+ return -ENOSPC;
+ }
+ bs_claim_md_page(blob->bs, *lowest_free_md_page);
+ }
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %lu for blob %lu\n", *lowest_free_cluster, blob->id);
+ bs_claim_cluster(blob->bs, *lowest_free_cluster);
+
+ pthread_mutex_unlock(&blob->bs->used_clusters_mutex);
+
+ if (update_map) {
+ blob_insert_cluster(blob, cluster_num, *lowest_free_cluster);
+ if (blob->use_extent_table && *extent_page == 0) {
+ *extent_page = *lowest_free_md_page;
+ }
+ }
+
+ return 0;
+}
+
+static void
+bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
+{
+ assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters));
+ assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == true);
+ assert(bs->num_free_clusters < bs->total_clusters);
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Releasing cluster %u\n", cluster_num);
+
+ pthread_mutex_lock(&bs->used_clusters_mutex);
+ spdk_bit_array_clear(bs->used_clusters, cluster_num);
+ bs->num_free_clusters++;
+ pthread_mutex_unlock(&bs->used_clusters_mutex);
+}
+
+static void
+blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs)
+{
+ xattrs->count = 0;
+ xattrs->names = NULL;
+ xattrs->ctx = NULL;
+ xattrs->get_value = NULL;
+}
+
+void
+spdk_blob_opts_init(struct spdk_blob_opts *opts)
+{
+ opts->num_clusters = 0;
+ opts->thin_provision = false;
+ opts->clear_method = BLOB_CLEAR_WITH_DEFAULT;
+ blob_xattrs_init(&opts->xattrs);
+ opts->use_extent_table = true;
+}
+
+void
+spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts)
+{
+ opts->clear_method = BLOB_CLEAR_WITH_DEFAULT;
+}
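
/*
 * Illustrative sketch, not part of the patch: typical use of the initializers
 * above together with the public creation API declared in spdk/blob.h. The
 * blobstore pointer, the cluster count and the callback are assumptions made
 * up for the example.
 */
static void
example_create_cb(void *cb_arg, spdk_blob_id blobid, int bserrno)
{
	/* Handle the newly created blob id, or bserrno on failure. */
}

static void
example_create_thin_blob(struct spdk_blob_store *bs)
{
	struct spdk_blob_opts opts;

	/* Initialize defaults first, then override selected fields. */
	spdk_blob_opts_init(&opts);
	opts.num_clusters = 16;
	opts.thin_provision = true;

	spdk_bs_create_blob_ext(bs, &opts, example_create_cb, NULL);
}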
+
+static struct spdk_blob *
+blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
+{
+ struct spdk_blob *blob;
+
+ blob = calloc(1, sizeof(*blob));
+ if (!blob) {
+ return NULL;
+ }
+
+ blob->id = id;
+ blob->bs = bs;
+
+ blob->parent_id = SPDK_BLOBID_INVALID;
+
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+ blob->extent_rle_found = false;
+ blob->extent_table_found = false;
+ blob->active.num_pages = 1;
+ blob->active.pages = calloc(1, sizeof(*blob->active.pages));
+ if (!blob->active.pages) {
+ free(blob);
+ return NULL;
+ }
+
+ blob->active.pages[0] = bs_blobid_to_page(id);
+
+ TAILQ_INIT(&blob->xattrs);
+ TAILQ_INIT(&blob->xattrs_internal);
+ TAILQ_INIT(&blob->pending_persists);
+
+ return blob;
+}
+
+static void
+xattrs_free(struct spdk_xattr_tailq *xattrs)
+{
+ struct spdk_xattr *xattr, *xattr_tmp;
+
+ TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
+ TAILQ_REMOVE(xattrs, xattr, link);
+ free(xattr->name);
+ free(xattr->value);
+ free(xattr);
+ }
+}
+
+static void
+blob_free(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+ assert(TAILQ_EMPTY(&blob->pending_persists));
+
+ free(blob->active.extent_pages);
+ free(blob->clean.extent_pages);
+ free(blob->active.clusters);
+ free(blob->clean.clusters);
+ free(blob->active.pages);
+ free(blob->clean.pages);
+
+ xattrs_free(&blob->xattrs);
+ xattrs_free(&blob->xattrs_internal);
+
+ if (blob->back_bs_dev) {
+ blob->back_bs_dev->destroy(blob->back_bs_dev);
+ }
+
+ free(blob);
+}
+
+struct freeze_io_ctx {
+ struct spdk_bs_cpl cpl;
+ struct spdk_blob *blob;
+};
+
+static void
+blob_io_sync(struct spdk_io_channel_iter *i)
+{
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+blob_execute_queued_io(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
+ struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_bs_request_set *set;
+ struct spdk_bs_user_op_args *args;
+ spdk_bs_user_op_t *op, *tmp;
+
+ TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
+ set = (struct spdk_bs_request_set *)op;
+ args = &set->u.user_op;
+
+ if (args->blob == ctx->blob) {
+ TAILQ_REMOVE(&ch->queued_io, op, link);
+ bs_user_op_execute(op);
+ }
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+blob_io_cpl(struct spdk_io_channel_iter *i, int status)
+{
+ struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);
+
+ free(ctx);
+}
+
+static void
+blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ struct freeze_io_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
+ ctx->cpl.u.blob_basic.cb_fn = cb_fn;
+ ctx->cpl.u.blob_basic.cb_arg = cb_arg;
+ ctx->blob = blob;
+
+ /* Freeze I/O on blob */
+ blob->frozen_refcnt++;
+
+ if (blob->frozen_refcnt == 1) {
+ spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl);
+ } else {
+ cb_fn(cb_arg, 0);
+ free(ctx);
+ }
+}
+
+static void
+blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ struct freeze_io_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
+ ctx->cpl.u.blob_basic.cb_fn = cb_fn;
+ ctx->cpl.u.blob_basic.cb_arg = cb_arg;
+ ctx->blob = blob;
+
+ assert(blob->frozen_refcnt > 0);
+
+ blob->frozen_refcnt--;
+
+ if (blob->frozen_refcnt == 0) {
+ spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl);
+ } else {
+ cb_fn(cb_arg, 0);
+ free(ctx);
+ }
+}
+
+static int
+blob_mark_clean(struct spdk_blob *blob)
+{
+ uint32_t *extent_pages = NULL;
+ uint64_t *clusters = NULL;
+ uint32_t *pages = NULL;
+
+ assert(blob != NULL);
+
+ if (blob->active.num_extent_pages) {
+ assert(blob->active.extent_pages);
+ extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages));
+ if (!extent_pages) {
+ return -ENOMEM;
+ }
+ memcpy(extent_pages, blob->active.extent_pages,
+ blob->active.num_extent_pages * sizeof(*extent_pages));
+ }
+
+ if (blob->active.num_clusters) {
+ assert(blob->active.clusters);
+ clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters));
+ if (!clusters) {
+ free(extent_pages);
+ return -ENOMEM;
+ }
+ memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
+ }
+
+ if (blob->active.num_pages) {
+ assert(blob->active.pages);
+ pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages));
+ if (!pages) {
+ free(extent_pages);
+ free(clusters);
+ return -ENOMEM;
+ }
+ memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
+ }
+
+ free(blob->clean.extent_pages);
+ free(blob->clean.clusters);
+ free(blob->clean.pages);
+
+ blob->clean.num_extent_pages = blob->active.num_extent_pages;
+ blob->clean.extent_pages = blob->active.extent_pages;
+ blob->clean.num_clusters = blob->active.num_clusters;
+ blob->clean.clusters = blob->active.clusters;
+ blob->clean.num_pages = blob->active.num_pages;
+ blob->clean.pages = blob->active.pages;
+
+ blob->active.extent_pages = extent_pages;
+ blob->active.clusters = clusters;
+ blob->active.pages = pages;
+
+ /* If the metadata was dirtied again while the metadata was being written to disk,
+ * we do not want to revert the DIRTY state back to CLEAN here.
+ */
+ if (blob->state == SPDK_BLOB_STATE_LOADING) {
+ blob->state = SPDK_BLOB_STATE_CLEAN;
+ }
+
+ return 0;
+}
+
+static int
+blob_deserialize_xattr(struct spdk_blob *blob,
+ struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal)
+{
+ struct spdk_xattr *xattr;
+
+ if (desc_xattr->length != sizeof(desc_xattr->name_length) +
+ sizeof(desc_xattr->value_length) +
+ desc_xattr->name_length + desc_xattr->value_length) {
+ return -EINVAL;
+ }
+
+ xattr = calloc(1, sizeof(*xattr));
+ if (xattr == NULL) {
+ return -ENOMEM;
+ }
+
+ xattr->name = malloc(desc_xattr->name_length + 1);
+ if (xattr->name == NULL) {
+ free(xattr);
+ return -ENOMEM;
+ }
+ memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length);
+ xattr->name[desc_xattr->name_length] = '\0';
+
+ xattr->value = malloc(desc_xattr->value_length);
+ if (xattr->value == NULL) {
+ free(xattr->name);
+ free(xattr);
+ return -ENOMEM;
+ }
+ xattr->value_len = desc_xattr->value_length;
+ memcpy(xattr->value,
+ (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
+ desc_xattr->value_length);
+
+ TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link);
+
+ return 0;
+}
+
+
+static int
+blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob)
+{
+ struct spdk_blob_md_descriptor *desc;
+ size_t cur_desc = 0;
+ void *tmp;
+
+ desc = (struct spdk_blob_md_descriptor *)page->descriptors;
+ while (cur_desc < sizeof(page->descriptors)) {
+ if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
+ if (desc->length == 0) {
+ /* If padding and length are 0, this terminates the page */
+ break;
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
+ struct spdk_blob_md_descriptor_flags *desc_flags;
+
+ desc_flags = (struct spdk_blob_md_descriptor_flags *)desc;
+
+ if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) {
+ return -EINVAL;
+ }
+
+ if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) !=
+ SPDK_BLOB_INVALID_FLAGS_MASK) {
+ return -EINVAL;
+ }
+
+ if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) !=
+ SPDK_BLOB_DATA_RO_FLAGS_MASK) {
+ blob->data_ro = true;
+ blob->md_ro = true;
+ }
+
+ if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) !=
+ SPDK_BLOB_MD_RO_FLAGS_MASK) {
+ blob->md_ro = true;
+ }
+
+ if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
+ blob->data_ro = true;
+ blob->md_ro = true;
+ }
+
+ blob->invalid_flags = desc_flags->invalid_flags;
+ blob->data_ro_flags = desc_flags->data_ro_flags;
+ blob->md_ro_flags = desc_flags->md_ro_flags;
+
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
+ struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
+ unsigned int i, j;
+ unsigned int cluster_count = blob->active.num_clusters;
+
+ if (blob->extent_table_found) {
+				/* Extent Table already present in the md,
+				 * both descriptors should never be present at the same time. */
+ return -EINVAL;
+ }
+ blob->extent_rle_found = true;
+
+ desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
+
+ if (desc_extent_rle->length == 0 ||
+ (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) {
+ return -EINVAL;
+ }
+
+ for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
+ for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
+ if (desc_extent_rle->extents[i].cluster_idx != 0) {
+ if (!spdk_bit_array_get(blob->bs->used_clusters,
+ desc_extent_rle->extents[i].cluster_idx + j)) {
+ return -EINVAL;
+ }
+ }
+ cluster_count++;
+ }
+ }
+
+ if (cluster_count == 0) {
+ return -EINVAL;
+ }
+ tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters));
+ if (tmp == NULL) {
+ return -ENOMEM;
+ }
+ blob->active.clusters = tmp;
+ blob->active.cluster_array_size = cluster_count;
+
+ for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
+ for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
+ if (desc_extent_rle->extents[i].cluster_idx != 0) {
+ blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
+ desc_extent_rle->extents[i].cluster_idx + j);
+ } else if (spdk_blob_is_thin_provisioned(blob)) {
+ blob->active.clusters[blob->active.num_clusters++] = 0;
+ } else {
+ return -EINVAL;
+ }
+ }
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
+ struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
+ uint32_t num_extent_pages = blob->active.num_extent_pages;
+ uint32_t i, j;
+ size_t extent_pages_length;
+
+ desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
+ extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);
+
+ if (blob->extent_rle_found) {
+				/* This means that Extent RLE is present in MD,
+				 * both should never be present at the same time. */
+ return -EINVAL;
+ } else if (blob->extent_table_found &&
+ desc_extent_table->num_clusters != blob->remaining_clusters_in_et) {
+				/* Number of clusters in this ET does not match the number
+				 * from the previously read EXTENT_TABLE. */
+ return -EINVAL;
+ }
+
+ blob->extent_table_found = true;
+
+ if (desc_extent_table->length == 0 ||
+ (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
+ return -EINVAL;
+ }
+
+ for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
+ num_extent_pages += desc_extent_table->extent_page[i].num_pages;
+ }
+
+ tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t));
+ if (tmp == NULL) {
+ return -ENOMEM;
+ }
+ blob->active.extent_pages = tmp;
+ blob->active.extent_pages_array_size = num_extent_pages;
+
+ blob->remaining_clusters_in_et = desc_extent_table->num_clusters;
+
+			/* Extent table entries contain md page numbers for extent pages.
+			 * Zeroes represent unallocated extent pages; those are run-length-encoded.
+			 */
+ for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
+ if (desc_extent_table->extent_page[i].page_idx != 0) {
+ assert(desc_extent_table->extent_page[i].num_pages == 1);
+ blob->active.extent_pages[blob->active.num_extent_pages++] =
+ desc_extent_table->extent_page[i].page_idx;
+ } else if (spdk_blob_is_thin_provisioned(blob)) {
+ for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) {
+ blob->active.extent_pages[blob->active.num_extent_pages++] = 0;
+ }
+ } else {
+ return -EINVAL;
+ }
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
+ struct spdk_blob_md_descriptor_extent_page *desc_extent;
+ unsigned int i;
+ unsigned int cluster_count = 0;
+ size_t cluster_idx_length;
+
+ if (blob->extent_rle_found) {
+				/* This means that Extent RLE is present in MD,
+				 * both should never be present at the same time. */
+ return -EINVAL;
+ }
+
+ desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
+ cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);
+
+ if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
+ (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
+ return -EINVAL;
+ }
+
+ for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
+ if (desc_extent->cluster_idx[i] != 0) {
+ if (!spdk_bit_array_get(blob->bs->used_clusters, desc_extent->cluster_idx[i])) {
+ return -EINVAL;
+ }
+ }
+ cluster_count++;
+ }
+
+ if (cluster_count == 0) {
+ return -EINVAL;
+ }
+
+			/* When reading extent pages sequentially, the starting cluster idx should match
+			 * the current size of the blob.
+			 * If this is changed to batch reading, this check shall be removed. */
+ if (desc_extent->start_cluster_idx != blob->active.num_clusters) {
+ return -EINVAL;
+ }
+
+ tmp = realloc(blob->active.clusters,
+ (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters));
+ if (tmp == NULL) {
+ return -ENOMEM;
+ }
+ blob->active.clusters = tmp;
+ blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters);
+
+ for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
+ if (desc_extent->cluster_idx[i] != 0) {
+ blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
+ desc_extent->cluster_idx[i]);
+ } else if (spdk_blob_is_thin_provisioned(blob)) {
+ blob->active.clusters[blob->active.num_clusters++] = 0;
+ } else {
+ return -EINVAL;
+ }
+ }
+ assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters);
+ assert(blob->remaining_clusters_in_et >= cluster_count);
+ blob->remaining_clusters_in_et -= cluster_count;
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
+ int rc;
+
+ rc = blob_deserialize_xattr(blob,
+ (struct spdk_blob_md_descriptor_xattr *) desc, false);
+ if (rc != 0) {
+ return rc;
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
+ int rc;
+
+ rc = blob_deserialize_xattr(blob,
+ (struct spdk_blob_md_descriptor_xattr *) desc, true);
+ if (rc != 0) {
+ return rc;
+ }
+ } else {
+ /* Unrecognized descriptor type. Do not fail - just continue to the
+ * next descriptor. If this descriptor is associated with some feature
+ * defined in a newer version of blobstore, that version of blobstore
+ * should create and set an associated feature flag to specify if this
+ * blob can be loaded or not.
+ */
+ }
+
+ /* Advance to the next descriptor */
+ cur_desc += sizeof(*desc) + desc->length;
+ if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
+ break;
+ }
+ desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
+ }
+
+ return 0;
+}
+
+static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page);
+
+static int
+blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+ assert(blob->state == SPDK_BLOB_STATE_LOADING);
+
+ if (bs_load_cur_extent_page_valid(extent_page) == false) {
+ return -ENOENT;
+ }
+
+ return blob_parse_page(extent_page, blob);
+}
+
+static int
+blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count,
+ struct spdk_blob *blob)
+{
+ const struct spdk_blob_md_page *page;
+ uint32_t i;
+ int rc;
+
+ assert(page_count > 0);
+ assert(pages[0].sequence_num == 0);
+ assert(blob != NULL);
+ assert(blob->state == SPDK_BLOB_STATE_LOADING);
+ assert(blob->active.clusters == NULL);
+
+	/* The blobid provided doesn't match what's in the MD; this can
+	 * happen, for example, if a bogus blobid is passed in through open.
+	 */
+ if (blob->id != pages[0].id) {
+ SPDK_ERRLOG("Blobid (%lu) doesn't match what's in metadata (%lu)\n",
+ blob->id, pages[0].id);
+ return -ENOENT;
+ }
+
+ for (i = 0; i < page_count; i++) {
+ page = &pages[i];
+
+ assert(page->id == blob->id);
+ assert(page->sequence_num == i);
+
+ rc = blob_parse_page(page, blob);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int
+blob_serialize_add_page(const struct spdk_blob *blob,
+ struct spdk_blob_md_page **pages,
+ uint32_t *page_count,
+ struct spdk_blob_md_page **last_page)
+{
+ struct spdk_blob_md_page *page;
+
+ assert(pages != NULL);
+ assert(page_count != NULL);
+
+ if (*page_count == 0) {
+ assert(*pages == NULL);
+ *page_count = 1;
+ *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE,
+ NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ } else {
+ assert(*pages != NULL);
+ (*page_count)++;
+ *pages = spdk_realloc(*pages,
+ SPDK_BS_PAGE_SIZE * (*page_count),
+ SPDK_BS_PAGE_SIZE);
+ }
+
+ if (*pages == NULL) {
+ *page_count = 0;
+ *last_page = NULL;
+ return -ENOMEM;
+ }
+
+ page = &(*pages)[*page_count - 1];
+ memset(page, 0, sizeof(*page));
+ page->id = blob->id;
+ page->sequence_num = *page_count - 1;
+ page->next = SPDK_INVALID_MD_PAGE;
+ *last_page = page;
+
+ return 0;
+}
+
+/* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor.
+ * Update required_sz on both success and failure.
+ */
+static int
+blob_serialize_xattr(const struct spdk_xattr *xattr,
+ uint8_t *buf, size_t buf_sz,
+ size_t *required_sz, bool internal)
+{
+ struct spdk_blob_md_descriptor_xattr *desc;
+
+ *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) +
+ strlen(xattr->name) +
+ xattr->value_len;
+
+ if (buf_sz < *required_sz) {
+ return -1;
+ }
+
+ desc = (struct spdk_blob_md_descriptor_xattr *)buf;
+
+ desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR;
+ desc->length = sizeof(desc->name_length) +
+ sizeof(desc->value_length) +
+ strlen(xattr->name) +
+ xattr->value_len;
+ desc->name_length = strlen(xattr->name);
+ desc->value_length = xattr->value_len;
+
+ memcpy(desc->name, xattr->name, desc->name_length);
+ memcpy((void *)((uintptr_t)desc->name + desc->name_length),
+ xattr->value,
+ desc->value_length);
+
+ return 0;
+}
+
+static void
+blob_serialize_extent_table_entry(const struct spdk_blob *blob,
+ uint64_t start_ep, uint64_t *next_ep,
+ uint8_t **buf, size_t *remaining_sz)
+{
+ struct spdk_blob_md_descriptor_extent_table *desc;
+ size_t cur_sz;
+ uint64_t i, et_idx;
+ uint32_t extent_page, ep_len;
+
+	/* The buffer must have room for at least the num_clusters entry */
+ cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters);
+ if (*remaining_sz < cur_sz) {
+ *next_ep = start_ep;
+ return;
+ }
+
+ desc = (struct spdk_blob_md_descriptor_extent_table *)*buf;
+ desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE;
+
+ desc->num_clusters = blob->active.num_clusters;
+
+ ep_len = 1;
+ et_idx = 0;
+ for (i = start_ep; i < blob->active.num_extent_pages; i++) {
+ if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) {
+ /* If we ran out of buffer space, return */
+ break;
+ }
+
+ extent_page = blob->active.extent_pages[i];
+ /* Verify that next extent_page is unallocated */
+ if (extent_page == 0 &&
+ (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) {
+ ep_len++;
+ continue;
+ }
+ desc->extent_page[et_idx].page_idx = extent_page;
+ desc->extent_page[et_idx].num_pages = ep_len;
+ et_idx++;
+
+ ep_len = 1;
+ cur_sz += sizeof(desc->extent_page[et_idx]);
+ }
+ *next_ep = i;
+
+ desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx;
+ *remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length;
+ *buf += sizeof(struct spdk_blob_md_descriptor) + desc->length;
+}
+
+static int
+blob_serialize_extent_table(const struct spdk_blob *blob,
+ struct spdk_blob_md_page **pages,
+ struct spdk_blob_md_page *cur_page,
+ uint32_t *page_count, uint8_t **buf,
+ size_t *remaining_sz)
+{
+ uint64_t last_extent_page;
+ int rc;
+
+ last_extent_page = 0;
+	/* At least a single extent table entry always has to be persisted.
+	 * Such a case occurs when num_extent_pages == 0. */
+ while (last_extent_page <= blob->active.num_extent_pages) {
+ blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf,
+ remaining_sz);
+
+ if (last_extent_page == blob->active.num_extent_pages) {
+ break;
+ }
+
+ rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
+ if (rc < 0) {
+ return rc;
+ }
+
+ *buf = (uint8_t *)cur_page->descriptors;
+ *remaining_sz = sizeof(cur_page->descriptors);
+ }
+
+ return 0;
+}
+
+static void
+blob_serialize_extent_rle(const struct spdk_blob *blob,
+ uint64_t start_cluster, uint64_t *next_cluster,
+ uint8_t **buf, size_t *buf_sz)
+{
+ struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
+ size_t cur_sz;
+ uint64_t i, extent_idx;
+ uint64_t lba, lba_per_cluster, lba_count;
+
+ /* The buffer must have room for at least one extent */
+ cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]);
+ if (*buf_sz < cur_sz) {
+ *next_cluster = start_cluster;
+ return;
+ }
+
+ desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf;
+ desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE;
+
+ lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);
+
+ lba = blob->active.clusters[start_cluster];
+ lba_count = lba_per_cluster;
+ extent_idx = 0;
+ for (i = start_cluster + 1; i < blob->active.num_clusters; i++) {
+ if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) {
+ /* Run-length encode sequential non-zero LBA */
+ lba_count += lba_per_cluster;
+ continue;
+ } else if (lba == 0 && blob->active.clusters[i] == 0) {
+ /* Run-length encode unallocated clusters */
+ lba_count += lba_per_cluster;
+ continue;
+ }
+ desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
+ desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
+ extent_idx++;
+
+ cur_sz += sizeof(desc_extent_rle->extents[extent_idx]);
+
+ if (*buf_sz < cur_sz) {
+ /* If we ran out of buffer space, return */
+ *next_cluster = i;
+ break;
+ }
+
+ lba = blob->active.clusters[i];
+ lba_count = lba_per_cluster;
+ }
+
+ if (*buf_sz >= cur_sz) {
+ desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
+ desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
+ extent_idx++;
+
+ *next_cluster = blob->active.num_clusters;
+ }
+
+ desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx;
+ *buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
+ *buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
+}
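
/*
 * Worked example (illustrative, not part of the patch) of the encoding above:
 * with lba_per_cluster == 8 and blob->active.clusters == { 64, 72, 0, 0, 128 },
 * blob_serialize_extent_rle() emits three extents:
 *
 *   { cluster_idx = 8,  length = 2 }   (LBAs 64 and 72 form one contiguous run)
 *   { cluster_idx = 0,  length = 2 }   (two unallocated, thin provisioned clusters)
 *   { cluster_idx = 16, length = 1 }   (LBA 128)
 *
 * Contiguous allocated clusters and runs of unallocated clusters are each
 * collapsed into a single run-length-encoded extent.
 */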
+
+static int
+blob_serialize_extents_rle(const struct spdk_blob *blob,
+ struct spdk_blob_md_page **pages,
+ struct spdk_blob_md_page *cur_page,
+ uint32_t *page_count, uint8_t **buf,
+ size_t *remaining_sz)
+{
+ uint64_t last_cluster;
+ int rc;
+
+ last_cluster = 0;
+ while (last_cluster < blob->active.num_clusters) {
+ blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz);
+
+ if (last_cluster == blob->active.num_clusters) {
+ break;
+ }
+
+ rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
+ if (rc < 0) {
+ return rc;
+ }
+
+ *buf = (uint8_t *)cur_page->descriptors;
+ *remaining_sz = sizeof(cur_page->descriptors);
+ }
+
+ return 0;
+}
+
+static void
+blob_serialize_extent_page(const struct spdk_blob *blob,
+ uint64_t cluster, struct spdk_blob_md_page *page)
+{
+ struct spdk_blob_md_descriptor_extent_page *desc_extent;
+ uint64_t i, extent_idx;
+ uint64_t lba, lba_per_cluster;
+ uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;
+
+ desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors;
+ desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE;
+
+ lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);
+
+ desc_extent->start_cluster_idx = start_cluster_idx;
+ extent_idx = 0;
+ for (i = start_cluster_idx; i < blob->active.num_clusters; i++) {
+ lba = blob->active.clusters[i];
+ desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster;
+ if (extent_idx >= SPDK_EXTENTS_PER_EP) {
+ break;
+ }
+ }
+ desc_extent->length = sizeof(desc_extent->start_cluster_idx) +
+ sizeof(desc_extent->cluster_idx[0]) * extent_idx;
+}
+
+static void
+blob_serialize_flags(const struct spdk_blob *blob,
+ uint8_t *buf, size_t *buf_sz)
+{
+ struct spdk_blob_md_descriptor_flags *desc;
+
+ /*
+ * Flags get serialized first, so we should always have room for the flags
+ * descriptor.
+ */
+ assert(*buf_sz >= sizeof(*desc));
+
+ desc = (struct spdk_blob_md_descriptor_flags *)buf;
+ desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS;
+ desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor);
+ desc->invalid_flags = blob->invalid_flags;
+ desc->data_ro_flags = blob->data_ro_flags;
+ desc->md_ro_flags = blob->md_ro_flags;
+
+ *buf_sz -= sizeof(*desc);
+}
+
+static int
+blob_serialize_xattrs(const struct spdk_blob *blob,
+ const struct spdk_xattr_tailq *xattrs, bool internal,
+ struct spdk_blob_md_page **pages,
+ struct spdk_blob_md_page *cur_page,
+ uint32_t *page_count, uint8_t **buf,
+ size_t *remaining_sz)
+{
+ const struct spdk_xattr *xattr;
+ int rc;
+
+ TAILQ_FOREACH(xattr, xattrs, link) {
+ size_t required_sz = 0;
+
+ rc = blob_serialize_xattr(xattr,
+ *buf, *remaining_sz,
+ &required_sz, internal);
+ if (rc < 0) {
+ /* Need to add a new page to the chain */
+ rc = blob_serialize_add_page(blob, pages, page_count,
+ &cur_page);
+ if (rc < 0) {
+ spdk_free(*pages);
+ *pages = NULL;
+ *page_count = 0;
+ return rc;
+ }
+
+ *buf = (uint8_t *)cur_page->descriptors;
+ *remaining_sz = sizeof(cur_page->descriptors);
+
+ /* Try again */
+ required_sz = 0;
+ rc = blob_serialize_xattr(xattr,
+ *buf, *remaining_sz,
+ &required_sz, internal);
+
+ if (rc < 0) {
+ spdk_free(*pages);
+ *pages = NULL;
+ *page_count = 0;
+ return rc;
+ }
+ }
+
+ *remaining_sz -= required_sz;
+ *buf += required_sz;
+ }
+
+ return 0;
+}
+
+static int
+blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages,
+ uint32_t *page_count)
+{
+ struct spdk_blob_md_page *cur_page;
+ int rc;
+ uint8_t *buf;
+ size_t remaining_sz;
+
+ assert(pages != NULL);
+ assert(page_count != NULL);
+ assert(blob != NULL);
+ assert(blob->state == SPDK_BLOB_STATE_DIRTY);
+
+ *pages = NULL;
+ *page_count = 0;
+
+ /* A blob always has at least 1 page, even if it has no descriptors */
+ rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
+ if (rc < 0) {
+ return rc;
+ }
+
+ buf = (uint8_t *)cur_page->descriptors;
+ remaining_sz = sizeof(cur_page->descriptors);
+
+ /* Serialize flags */
+ blob_serialize_flags(blob, buf, &remaining_sz);
+ buf += sizeof(struct spdk_blob_md_descriptor_flags);
+
+ /* Serialize xattrs */
+ rc = blob_serialize_xattrs(blob, &blob->xattrs, false,
+ pages, cur_page, page_count, &buf, &remaining_sz);
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* Serialize internal xattrs */
+ rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true,
+ pages, cur_page, page_count, &buf, &remaining_sz);
+ if (rc < 0) {
+ return rc;
+ }
+
+ if (blob->use_extent_table) {
+ /* Serialize extent table */
+ rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz);
+ } else {
+ /* Serialize extents */
+ rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz);
+ }
+
+ return rc;
+}
+
+struct spdk_blob_load_ctx {
+ struct spdk_blob *blob;
+
+ struct spdk_blob_md_page *pages;
+ uint32_t num_pages;
+ uint32_t next_extent_page;
+ spdk_bs_sequence_t *seq;
+
+ spdk_bs_sequence_cpl cb_fn;
+ void *cb_arg;
+};
+
+static uint32_t
+blob_md_page_calc_crc(void *page)
+{
+ uint32_t crc;
+
+ crc = BLOB_CRC32C_INITIAL;
+ crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc);
+ crc ^= BLOB_CRC32C_INITIAL;
+
+ return crc;
+
+}
+
+static void
+blob_load_final(void *cb_arg, int bserrno)
+{
+ struct spdk_blob_load_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+
+ if (bserrno == 0) {
+ blob_mark_clean(blob);
+ }
+
+ ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno);
+
+ /* Free the memory */
+ spdk_free(ctx->pages);
+ free(ctx);
+}
+
+static void
+blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno)
+{
+ struct spdk_blob_load_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+
+ if (bserrno == 0) {
+ blob->back_bs_dev = bs_create_blob_bs_dev(snapshot);
+ if (blob->back_bs_dev == NULL) {
+ bserrno = -ENOMEM;
+ }
+ }
+ if (bserrno != 0) {
+		SPDK_ERRLOG("Failed to set up snapshot as backing device\n");
+ }
+
+ blob_load_final(ctx, bserrno);
+}
+
+static void blob_update_clear_method(struct spdk_blob *blob);
+
+static void
+blob_load_backing_dev(void *cb_arg)
+{
+ struct spdk_blob_load_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ const void *value;
+ size_t len;
+ int rc;
+
+ if (spdk_blob_is_thin_provisioned(blob)) {
+ rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true);
+ if (rc == 0) {
+ if (len != sizeof(spdk_blob_id)) {
+ blob_load_final(ctx, -EINVAL);
+ return;
+ }
+ /* open snapshot blob and continue in the callback function */
+ blob->parent_id = *(spdk_blob_id *)value;
+ spdk_bs_open_blob(blob->bs, blob->parent_id,
+ blob_load_snapshot_cpl, ctx);
+ return;
+ } else {
+ /* add zeroes_dev for thin provisioned blob */
+ blob->back_bs_dev = bs_create_zeroes_dev();
+ }
+ } else {
+ /* standard blob */
+ blob->back_bs_dev = NULL;
+ }
+ blob_load_final(ctx, 0);
+}
+
+static void
+blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_load_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_md_page *page;
+ uint64_t i;
+ uint32_t crc;
+ uint64_t lba;
+ void *tmp;
+ uint64_t sz;
+
+ if (bserrno) {
+ SPDK_ERRLOG("Extent page read failed: %d\n", bserrno);
+ blob_load_final(ctx, bserrno);
+ return;
+ }
+
+ if (ctx->pages == NULL) {
+		/* First iteration of this function; allocate a buffer for a single EXTENT_PAGE */
+ ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, NULL, SPDK_ENV_SOCKET_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (!ctx->pages) {
+ blob_load_final(ctx, -ENOMEM);
+ return;
+ }
+ ctx->num_pages = 1;
+ ctx->next_extent_page = 0;
+ } else {
+ page = &ctx->pages[0];
+ crc = blob_md_page_calc_crc(page);
+ if (crc != page->crc) {
+ blob_load_final(ctx, -EINVAL);
+ return;
+ }
+
+ if (page->next != SPDK_INVALID_MD_PAGE) {
+ blob_load_final(ctx, -EINVAL);
+ return;
+ }
+
+ bserrno = blob_parse_extent_page(page, blob);
+ if (bserrno) {
+ blob_load_final(ctx, bserrno);
+ return;
+ }
+ }
+
+ for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
+ if (blob->active.extent_pages[i] != 0) {
+ /* Extent page was allocated, read and parse it. */
+ lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]);
+ ctx->next_extent_page = i + 1;
+
+ bs_sequence_read_dev(seq, &ctx->pages[0], lba,
+ bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
+ blob_load_cpl_extents_cpl, ctx);
+ return;
+ } else {
+ /* Thin provisioned blobs can point to unallocated extent pages.
+ * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */
+
+ sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP);
+ blob->active.num_clusters += sz;
+ blob->remaining_clusters_in_et -= sz;
+
+ assert(spdk_blob_is_thin_provisioned(blob));
+ assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0);
+
+ tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
+ if (tmp == NULL) {
+ blob_load_final(ctx, -ENOMEM);
+ return;
+ }
+ memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0,
+ sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size));
+ blob->active.clusters = tmp;
+ blob->active.cluster_array_size = blob->active.num_clusters;
+ }
+ }
+
+ blob_load_backing_dev(ctx);
+}
+
+static void
+blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_load_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_md_page *page;
+ int rc;
+ uint32_t crc;
+ uint32_t current_page;
+
+ if (ctx->num_pages == 1) {
+ current_page = bs_blobid_to_page(blob->id);
+ } else {
+ assert(ctx->num_pages != 0);
+ page = &ctx->pages[ctx->num_pages - 2];
+ current_page = page->next;
+ }
+
+ if (bserrno) {
+ SPDK_ERRLOG("Metadata page %d read failed for blobid %lu: %d\n",
+ current_page, blob->id, bserrno);
+ blob_load_final(ctx, bserrno);
+ return;
+ }
+
+ page = &ctx->pages[ctx->num_pages - 1];
+ crc = blob_md_page_calc_crc(page);
+ if (crc != page->crc) {
+ SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %lu\n",
+ current_page, blob->id);
+ blob_load_final(ctx, -EINVAL);
+ return;
+ }
+
+ if (page->next != SPDK_INVALID_MD_PAGE) {
+ uint32_t next_page = page->next;
+ uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page);
+
+ /* Read the next page */
+ ctx->num_pages++;
+ ctx->pages = spdk_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages),
+ sizeof(*page));
+ if (ctx->pages == NULL) {
+ blob_load_final(ctx, -ENOMEM);
+ return;
+ }
+
+ bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1],
+ next_lba,
+ bs_byte_to_lba(blob->bs, sizeof(*page)),
+ blob_load_cpl, ctx);
+ return;
+ }
+
+ /* Parse the pages */
+ rc = blob_parse(ctx->pages, ctx->num_pages, blob);
+ if (rc) {
+ blob_load_final(ctx, rc);
+ return;
+ }
+
+ if (blob->extent_table_found == true) {
+ /* If EXTENT_TABLE was found, that means support for it should be enabled. */
+ assert(blob->extent_rle_found == false);
+ blob->use_extent_table = true;
+ } else {
+ /* If EXTENT_RLE or no extent_* descriptor was found disable support
+ * for extent table. No extent_* descriptors means that blob has length of 0
+ * and no extent_rle descriptors were persisted for it.
+ * EXTENT_TABLE if used, is always present in metadata regardless of length. */
+ blob->use_extent_table = false;
+ }
+
+ /* Check the clear_method stored in metadata vs what may have been passed
+ * via spdk_bs_open_blob_ext() and update accordingly.
+ */
+ blob_update_clear_method(blob);
+
+ spdk_free(ctx->pages);
+ ctx->pages = NULL;
+
+ if (blob->extent_table_found) {
+ blob_load_cpl_extents_cpl(seq, ctx, 0);
+ } else {
+ blob_load_backing_dev(ctx);
+ }
+}
+
+/* Load a blob from disk given a blobid */
+static void
+blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_blob_load_ctx *ctx;
+ struct spdk_blob_store *bs;
+ uint32_t page_num;
+ uint64_t lba;
+
+ blob_verify_md_op(blob);
+
+ bs = blob->bs;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(seq, cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->blob = blob;
+ ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE);
+ if (!ctx->pages) {
+ free(ctx);
+ cb_fn(seq, cb_arg, -ENOMEM);
+ return;
+ }
+ ctx->num_pages = 1;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ ctx->seq = seq;
+
+ page_num = bs_blobid_to_page(blob->id);
+ lba = bs_md_page_to_lba(blob->bs, page_num);
+
+ blob->state = SPDK_BLOB_STATE_LOADING;
+
+ bs_sequence_read_dev(seq, &ctx->pages[0], lba,
+ bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE),
+ blob_load_cpl, ctx);
+}
+
+struct spdk_blob_persist_ctx {
+ struct spdk_blob *blob;
+
+ struct spdk_bs_super_block *super;
+
+ struct spdk_blob_md_page *pages;
+ uint32_t next_extent_page;
+ struct spdk_blob_md_page *extent_page;
+
+ spdk_bs_sequence_t *seq;
+ spdk_bs_sequence_cpl cb_fn;
+ void *cb_arg;
+ TAILQ_ENTRY(spdk_blob_persist_ctx) link;
+};
+
+static void
+bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba,
+ uint32_t lba_count)
+{
+ switch (ctx->blob->clear_method) {
+ case BLOB_CLEAR_WITH_DEFAULT:
+ case BLOB_CLEAR_WITH_UNMAP:
+ bs_batch_unmap_dev(batch, lba, lba_count);
+ break;
+ case BLOB_CLEAR_WITH_WRITE_ZEROES:
+ bs_batch_write_zeroes_dev(batch, lba, lba_count);
+ break;
+ case BLOB_CLEAR_WITH_NONE:
+ default:
+ break;
+ }
+}
+
+static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx);
+
+static void
+blob_persist_complete(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+ struct spdk_blob_persist_ctx *next_persist;
+ struct spdk_blob *blob = ctx->blob;
+
+ if (bserrno == 0) {
+ blob_mark_clean(blob);
+ }
+
+ assert(ctx == TAILQ_FIRST(&blob->pending_persists));
+ TAILQ_REMOVE(&blob->pending_persists, ctx, link);
+
+ next_persist = TAILQ_FIRST(&blob->pending_persists);
+
+ /* Call user callback */
+ ctx->cb_fn(seq, ctx->cb_arg, bserrno);
+
+ /* Free the memory */
+ spdk_free(ctx->pages);
+ free(ctx);
+
+ if (next_persist != NULL) {
+ blob_persist_check_dirty(next_persist);
+ }
+}
+
+static void
+blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_store *bs = blob->bs;
+ size_t i;
+
+ if (bserrno != 0) {
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+ /* Release all clusters that were truncated */
+ for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
+ uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]);
+
+ /* Nothing to release if it was not allocated */
+ if (blob->active.clusters[i] != 0) {
+ bs_release_cluster(bs, cluster_num);
+ }
+ }
+
+ if (blob->active.num_clusters == 0) {
+ free(blob->active.clusters);
+ blob->active.clusters = NULL;
+ blob->active.cluster_array_size = 0;
+ } else if (blob->active.num_clusters != blob->active.cluster_array_size) {
+#ifndef __clang_analyzer__
+ void *tmp;
+
+ /* scan-build really can't figure reallocs, workaround it */
+ tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters);
+ assert(tmp != NULL);
+ blob->active.clusters = tmp;
+
+ tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages);
+ assert(tmp != NULL);
+ blob->active.extent_pages = tmp;
+#endif
+ blob->active.extent_pages_array_size = blob->active.num_extent_pages;
+ blob->active.cluster_array_size = blob->active.num_clusters;
+ }
+
+ /* TODO: Add path to persist clear extent pages. */
+ blob_persist_complete(seq, ctx, bserrno);
+}
+
+static void
+blob_persist_clear_clusters(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_store *bs = blob->bs;
+ spdk_bs_batch_t *batch;
+ size_t i;
+ uint64_t lba;
+ uint32_t lba_count;
+
+ if (bserrno != 0) {
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+ /* Clusters don't move around in blobs. The list shrinks or grows
+ * at the end, but no changes ever occur in the middle of the list.
+ */
+
+ batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx);
+
+ /* Clear all clusters that were truncated */
+ lba = 0;
+ lba_count = 0;
+ for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
+ uint64_t next_lba = blob->active.clusters[i];
+ uint32_t next_lba_count = bs_cluster_to_lba(bs, 1);
+
+ if (next_lba > 0 && (lba + lba_count) == next_lba) {
+ /* This cluster is contiguous with the previous one. */
+ lba_count += next_lba_count;
+ continue;
+ }
+
+ /* This cluster is not contiguous with the previous one. */
+
+		/* If a run of LBAs previously existed, clear them now */
+ if (lba_count > 0) {
+ bs_batch_clear_dev(ctx, batch, lba, lba_count);
+ }
+
+ /* Start building the next batch */
+ lba = next_lba;
+ if (next_lba > 0) {
+ lba_count = next_lba_count;
+ } else {
+ lba_count = 0;
+ }
+ }
+
+ /* If we ended with a contiguous set of LBAs, clear them now */
+ if (lba_count > 0) {
+ bs_batch_clear_dev(ctx, batch, lba, lba_count);
+ }
+
+ bs_batch_close(batch);
+}
+
+static void
+blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_store *bs = blob->bs;
+ size_t i;
+
+ if (bserrno != 0) {
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+	/* This loop starts at 1 because the first page is special and handled
+	 * below. The pages (except the first) are never written in place,
+	 * so the md pages recorded in the clean list can now be released.
+	 */
+ for (i = 1; i < blob->clean.num_pages; i++) {
+ bs_release_md_page(bs, blob->clean.pages[i]);
+ }
+
+ if (blob->active.num_pages == 0) {
+ uint32_t page_num;
+
+ page_num = bs_blobid_to_page(blob->id);
+ bs_release_md_page(bs, page_num);
+ }
+
+ /* Move on to clearing clusters */
+ blob_persist_clear_clusters(seq, ctx, 0);
+}
+
+static void
+blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_store *bs = blob->bs;
+ uint64_t lba;
+ uint32_t lba_count;
+ spdk_bs_batch_t *batch;
+ size_t i;
+
+ if (bserrno != 0) {
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+ batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx);
+
+ lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);
+
+ /* This loop starts at 1 because the first page is special and handled
+ * below. The pages (except the first) are never written in place,
+ * so any pages in the clean list must be zeroed.
+ */
+ for (i = 1; i < blob->clean.num_pages; i++) {
+ lba = bs_md_page_to_lba(bs, blob->clean.pages[i]);
+
+ bs_batch_write_zeroes_dev(batch, lba, lba_count);
+ }
+
+ /* The first page will only be zeroed if this is a delete. */
+ if (blob->active.num_pages == 0) {
+ uint32_t page_num;
+
+ /* The first page in the metadata goes where the blobid indicates */
+ page_num = bs_blobid_to_page(blob->id);
+ lba = bs_md_page_to_lba(bs, page_num);
+
+ bs_batch_write_zeroes_dev(batch, lba, lba_count);
+ }
+
+ bs_batch_close(batch);
+}
+
+static void
+blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_store *bs = blob->bs;
+ uint64_t lba;
+ uint32_t lba_count;
+ struct spdk_blob_md_page *page;
+
+ if (bserrno != 0) {
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+ if (blob->active.num_pages == 0) {
+ /* Move on to the next step */
+ blob_persist_zero_pages(seq, ctx, 0);
+ return;
+ }
+
+ lba_count = bs_byte_to_lba(bs, sizeof(*page));
+
+ page = &ctx->pages[0];
+ /* The first page in the metadata goes where the blobid indicates */
+ lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id));
+
+ bs_sequence_write_dev(seq, page, lba, lba_count,
+ blob_persist_zero_pages, ctx);
+}
+
+static void
+blob_persist_write_page_chain(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_store *bs = blob->bs;
+ uint64_t lba;
+ uint32_t lba_count;
+ struct spdk_blob_md_page *page;
+ spdk_bs_batch_t *batch;
+ size_t i;
+
+ if (bserrno != 0) {
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+ /* Metadata pages (except the first) are never written in place. Each persist
+ * writes a fresh chain of pages; stale pages are zeroed afterwards.
+ */
+
+ lba_count = bs_byte_to_lba(bs, sizeof(*page));
+
+ batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx);
+
+ /* This starts at 1. The root page is not written until
+ * all of the others are finished
+ */
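+ /* Deferring the root page until the rest of the chain is on disk presumably
+ * keeps a partially written chain unreachable from the blobid-derived location
+ * if power is lost mid-persist (assumption, not stated explicitly here). */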
+ for (i = 1; i < blob->active.num_pages; i++) {
+ page = &ctx->pages[i];
+ assert(page->sequence_num == i);
+
+ lba = bs_md_page_to_lba(bs, blob->active.pages[i]);
+
+ bs_batch_write_dev(batch, page, lba, lba_count);
+ }
+
+ bs_batch_close(batch);
+}
+
+static int
+blob_resize(struct spdk_blob *blob, uint64_t sz)
+{
+ uint64_t i;
+ uint64_t *tmp;
+ uint64_t lfc; /* lowest free cluster */
+ uint32_t lfmd; /* lowest free md page */
+ uint64_t num_clusters;
+ uint32_t *ep_tmp;
+ uint64_t new_num_ep = 0, current_num_ep = 0;
+ struct spdk_blob_store *bs;
+
+ bs = blob->bs;
+
+ blob_verify_md_op(blob);
+
+ if (blob->active.num_clusters == sz) {
+ return 0;
+ }
+
+ if (blob->active.num_clusters < blob->active.cluster_array_size) {
+ /* If this blob was resized to be larger, then smaller, then
+ * larger without syncing, then the cluster array already
+ * contains spare assigned clusters we can use.
+ */
+ num_clusters = spdk_min(blob->active.cluster_array_size,
+ sz);
+ } else {
+ num_clusters = blob->active.num_clusters;
+ }
+
+ if (blob->use_extent_table) {
+ /* Round up, since every cluster beyond the current Extent Table size
+ * requires a new extent page. */
+ new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP);
+ current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP);
+ }
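+ /* Illustrative example (SPDK_EXTENTS_PER_EP value hypothetical): if each extent
+ * page held 4 extents, growing from 3 clusters (current_num_ep = 1) to sz = 9
+ * clusters would give new_num_ep = 3, i.e. two more extent pages are required. */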
+
+ /* Do two passes - one to verify that we can obtain enough clusters
+ * and md pages, another to actually claim them.
+ */
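+ /* The verification pass can fail with -ENOSPC before anything is claimed, so a
+ * failed resize leaves the blob's cluster and md page allocations untouched. */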
+
+ if (spdk_blob_is_thin_provisioned(blob) == false) {
+ lfc = 0;
+ for (i = num_clusters; i < sz; i++) {
+ lfc = spdk_bit_array_find_first_clear(bs->used_clusters, lfc);
+ if (lfc == UINT32_MAX) {
+ /* No more free clusters. Cannot satisfy the request */
+ return -ENOSPC;
+ }
+ lfc++;
+ }
+ lfmd = 0;
+ for (i = current_num_ep; i < new_num_ep ; i++) {
+ lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd);
+ if (lfmd == UINT32_MAX) {
+ /* No more free md pages. Cannot satisfy the request */
+ return -ENOSPC;
+ }
+ }
+ }
+
+ if (sz > num_clusters) {
+ /* Expand the cluster array if necessary.
+ * We only shrink the array when persisting.
+ */
+ tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz);
+ if (sz > 0 && tmp == NULL) {
+ return -ENOMEM;
+ }
+ memset(tmp + blob->active.cluster_array_size, 0,
+ sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size));
+ blob->active.clusters = tmp;
+ blob->active.cluster_array_size = sz;
+
+ /* Expand the extents table, only if enough clusters were added */
+ if (new_num_ep > current_num_ep && blob->use_extent_table) {
+ ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep);
+ if (new_num_ep > 0 && ep_tmp == NULL) {
+ return -ENOMEM;
+ }
+ memset(ep_tmp + blob->active.extent_pages_array_size, 0,
+ sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size));
+ blob->active.extent_pages = ep_tmp;
+ blob->active.extent_pages_array_size = new_num_ep;
+ }
+ }
+
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+
+ if (spdk_blob_is_thin_provisioned(blob) == false) {
+ lfc = 0;
+ lfmd = 0;
+ for (i = num_clusters; i < sz; i++) {
+ bs_allocate_cluster(blob, i, &lfc, &lfmd, true);
+ lfc++;
+ lfmd++;
+ }
+ }
+
+ blob->active.num_clusters = sz;
+ blob->active.num_extent_pages = new_num_ep;
+
+ return 0;
+}
+
+static void
+blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx)
+{
+ spdk_bs_sequence_t *seq = ctx->seq;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_blob_store *bs = blob->bs;
+ uint64_t i;
+ uint32_t page_num;
+ void *tmp;
+ int rc;
+
+ /* Generate the new metadata */
+ rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages);
+ if (rc < 0) {
+ blob_persist_complete(seq, ctx, rc);
+ return;
+ }
+
+ assert(blob->active.num_pages >= 1);
+
+ /* Resize the cache of page indices */
+ tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
+ if (!tmp) {
+ blob_persist_complete(seq, ctx, -ENOMEM);
+ return;
+ }
+ blob->active.pages = tmp;
+
+ /* Assign this metadata to pages. This requires two passes -
+ * one to verify that there are enough pages and a second
+ * to actually claim them. */
+ page_num = 0;
+ /* Note that this loop starts at one. The first page location is fixed by the blobid. */
+ for (i = 1; i < blob->active.num_pages; i++) {
+ page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
+ if (page_num == UINT32_MAX) {
+ blob_persist_complete(seq, ctx, -ENOMEM);
+ return;
+ }
+ page_num++;
+ }
+
+ page_num = 0;
+ blob->active.pages[0] = bs_blobid_to_page(blob->id);
+ for (i = 1; i < blob->active.num_pages; i++) {
+ page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
+ ctx->pages[i - 1].next = page_num;
+ /* Now that the previous metadata page is complete, calculate its crc. */
+ ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]);
+ blob->active.pages[i] = page_num;
+ bs_claim_md_page(bs, page_num);
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming page %u for blob %" PRIu64 "\n", page_num, blob->id);
+ page_num++;
+ }
+ ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]);
+ /* Start writing the metadata; the non-root pages go out first and the root page last */
+ blob->state = SPDK_BLOB_STATE_CLEAN;
+ blob_persist_write_page_chain(seq, ctx, 0);
+}
+
+static void
+blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ size_t i;
+ uint32_t extent_page_id;
+ uint32_t page_count = 0;
+ int rc;
+
+ if (ctx->extent_page != NULL) {
+ spdk_free(ctx->extent_page);
+ ctx->extent_page = NULL;
+ }
+
+ if (bserrno != 0) {
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+ /* Only write out changed extent pages */
+ for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
+ extent_page_id = blob->active.extent_pages[i];
+ if (extent_page_id == 0) {
+ /* No Extent Page to persist */
+ assert(spdk_blob_is_thin_provisioned(blob));
+ continue;
+ }
+ /* Writing out a new extent page for the first time. Either the active extent page array
+ * is larger than the clean one, or no extent page was assigned yet due to thin provisioning. */
+ if (i >= blob->clean.extent_pages_array_size || blob->clean.extent_pages[i] == 0) {
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+ assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id));
+ ctx->next_extent_page = i + 1;
+ rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page);
+ if (rc < 0) {
+ blob_persist_complete(seq, ctx, rc);
+ return;
+ }
+
+ blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page);
+
+ ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page);
+
+ bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id),
+ bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
+ blob_persist_write_extent_pages, ctx);
+ return;
+ }
+ assert(blob->clean.extent_pages[i] != 0);
+ }
+
+ blob_persist_generate_new_md(ctx);
+}
+
+static void
+blob_persist_start(struct spdk_blob_persist_ctx *ctx)
+{
+ spdk_bs_sequence_t *seq = ctx->seq;
+ struct spdk_blob *blob = ctx->blob;
+
+ if (blob->active.num_pages == 0) {
+ /* This is the signal that the blob should be deleted.
+ * Immediately jump to the clean up routine. */
+ assert(blob->clean.num_pages > 0);
+ blob->state = SPDK_BLOB_STATE_CLEAN;
+ blob_persist_zero_pages(seq, ctx, 0);
+ return;
+ }
+
+ blob_persist_write_extent_pages(seq, ctx, 0);
+}
+
+static void
+blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+
+ spdk_free(ctx->super);
+
+ if (bserrno != 0) {
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+ ctx->blob->bs->clean = 0;
+
+ blob_persist_start(ctx);
+}
+
+static void
+bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
+ struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg);
+
+static void
+blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_persist_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ spdk_free(ctx->super);
+ blob_persist_complete(seq, ctx, bserrno);
+ return;
+ }
+
+ ctx->super->clean = 0;
+ if (ctx->super->size == 0) {
+ ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen;
+ }
+
+ bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx);
+}
+
+static void
+blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx)
+{
+ if (ctx->blob->bs->clean) {
+ ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->super) {
+ blob_persist_complete(ctx->seq, ctx, -ENOMEM);
+ return;
+ }
+
+ bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0),
+ bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)),
+ blob_persist_dirty, ctx);
+ } else {
+ blob_persist_start(ctx);
+ }
+}
+
+/* Write a blob to disk */
+static void
+blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_blob_persist_ctx *ctx;
+
+ blob_verify_md_op(blob);
+
+ if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->pending_persists)) {
+ cb_fn(seq, cb_arg, 0);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(seq, cb_arg, -ENOMEM);
+ return;
+ }
+ ctx->blob = blob;
+ ctx->seq = seq;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ ctx->next_extent_page = 0;
+
+ /* Multiple blob persists can affect one another, via blob->state or
+ * blob mutable data changes. To prevent it, queue up the persists. */
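+ /* The head of pending_persists is the persist currently in flight; entries queued
+ * behind it are presumably restarted by blob_persist_complete() once the in-flight
+ * persist finishes (completion path not shown in this hunk). */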
+ if (!TAILQ_EMPTY(&blob->pending_persists)) {
+ TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link);
+ return;
+ }
+ TAILQ_INSERT_HEAD(&blob->pending_persists, ctx, link);
+
+ blob_persist_check_dirty(ctx);
+}
+
+struct spdk_blob_copy_cluster_ctx {
+ struct spdk_blob *blob;
+ uint8_t *buf;
+ uint64_t page;
+ uint64_t new_cluster;
+ uint32_t new_extent_page;
+ spdk_bs_sequence_t *seq;
+};
+
+static void
+blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno)
+{
+ struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq;
+ TAILQ_HEAD(, spdk_bs_request_set) requests;
+ spdk_bs_user_op_t *op;
+
+ TAILQ_INIT(&requests);
+ TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link);
+
+ while (!TAILQ_EMPTY(&requests)) {
+ op = TAILQ_FIRST(&requests);
+ TAILQ_REMOVE(&requests, op, link);
+ if (bserrno == 0) {
+ bs_user_op_execute(op);
+ } else {
+ bs_user_op_abort(op);
+ }
+ }
+
+ spdk_free(ctx->buf);
+ free(ctx);
+}
+
+static void
+blob_insert_cluster_cpl(void *cb_arg, int bserrno)
+{
+ struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
+
+ if (bserrno) {
+ if (bserrno == -EEXIST) {
+ /* The metadata insert failed because another thread
+ * allocated the cluster first. Free our cluster
+ * but continue without error. */
+ bserrno = 0;
+ }
+ bs_release_cluster(ctx->blob->bs, ctx->new_cluster);
+ if (ctx->new_extent_page != 0) {
+ bs_release_md_page(ctx->blob->bs, ctx->new_extent_page);
+ }
+ }
+
+ bs_sequence_finish(ctx->seq, bserrno);
+}
+
+static void
+blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
+ uint32_t cluster_number;
+
+ if (bserrno) {
+ /* The write failed, so jump to the final completion handler */
+ bs_sequence_finish(seq, bserrno);
+ return;
+ }
+
+ cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page);
+
+ blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
+ ctx->new_extent_page, blob_insert_cluster_cpl, ctx);
+}
+
+static void
+blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ /* The read failed, so jump to the final completion handler */
+ bs_sequence_finish(seq, bserrno);
+ return;
+ }
+
+ /* Write whole cluster */
+ bs_sequence_write_dev(seq, ctx->buf,
+ bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster),
+ bs_cluster_to_lba(ctx->blob->bs, 1),
+ blob_write_copy_cpl, ctx);
+}
+
+static void
+bs_allocate_and_copy_cluster(struct spdk_blob *blob,
+ struct spdk_io_channel *_ch,
+ uint64_t io_unit, spdk_bs_user_op_t *op)
+{
+ struct spdk_bs_cpl cpl;
+ struct spdk_bs_channel *ch;
+ struct spdk_blob_copy_cluster_ctx *ctx;
+ uint32_t cluster_start_page;
+ uint32_t cluster_number;
+ int rc;
+
+ ch = spdk_io_channel_get_ctx(_ch);
+
+ if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) {
+ /* There are already operations pending. Queue this user op
+ * and return because it will be re-executed when the outstanding
+ * cluster allocation completes. */
+ TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
+ return;
+ }
+
+ /* Round the io_unit offset down to the first page in the cluster */
+ cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit);
+
+ /* Calculate the index in the metadata cluster array at which the
+ * corresponding cluster is supposed to be. */
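+ /* Hypothetical example: assuming the io unit size equals the 4 KiB metadata page
+ * size and clusters are 1 MiB (256 io units per cluster), io_unit 300 yields
+ * cluster_start_page 256 and cluster_number 1. */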
+ cluster_number = bs_io_unit_to_cluster_number(blob, io_unit);
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ bs_user_op_abort(op);
+ return;
+ }
+
+ assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0);
+
+ ctx->blob = blob;
+ ctx->page = cluster_start_page;
+
+ if (blob->parent_id != SPDK_BLOBID_INVALID) {
+ ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen,
+ NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->buf) {
+ SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n",
+ blob->bs->cluster_sz);
+ free(ctx);
+ bs_user_op_abort(op);
+ return;
+ }
+ }
+
+ rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page,
+ false);
+ if (rc != 0) {
+ spdk_free(ctx->buf);
+ free(ctx);
+ bs_user_op_abort(op);
+ return;
+ }
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl;
+ cpl.u.blob_basic.cb_arg = ctx;
+
+ ctx->seq = bs_sequence_start(_ch, &cpl);
+ if (!ctx->seq) {
+ bs_release_cluster(blob->bs, ctx->new_cluster);
+ spdk_free(ctx->buf);
+ free(ctx);
+ bs_user_op_abort(op);
+ return;
+ }
+
+ /* Queue the user op to block other incoming operations */
+ TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
+
+ if (blob->parent_id != SPDK_BLOBID_INVALID) {
+ /* Read cluster from backing device */
+ bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf,
+ bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
+ bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz),
+ blob_write_copy, ctx);
+ } else {
+ blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
+ ctx->new_extent_page, blob_insert_cluster_cpl, ctx);
+ }
+}
+
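+/* Translate a blob io_unit offset and length into a starting LBA and LBA count.
+ * Io units not yet backed by an allocated cluster are translated against the backing
+ * bs_dev (which may use a different block size); allocated ones are translated against
+ * the blobstore's own device. */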
+static inline void
+blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length,
+ uint64_t *lba, uint32_t *lba_count)
+{
+ *lba_count = length;
+
+ if (!bs_io_unit_is_allocated(blob, io_unit)) {
+ assert(blob->back_bs_dev != NULL);
+ *lba = bs_io_unit_to_back_dev_lba(blob, io_unit);
+ *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count);
+ } else {
+ *lba = bs_blob_io_unit_to_lba(blob, io_unit);
+ }
+}
+
+struct op_split_ctx {
+ struct spdk_blob *blob;
+ struct spdk_io_channel *channel;
+ uint64_t io_unit_offset;
+ uint64_t io_units_remaining;
+ void *curr_payload;
+ enum spdk_blob_op_type op_type;
+ spdk_bs_sequence_t *seq;
+};
+
+static void
+blob_request_submit_op_split_next(void *cb_arg, int bserrno)
+{
+ struct op_split_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct spdk_io_channel *ch = ctx->channel;
+ enum spdk_blob_op_type op_type = ctx->op_type;
+ uint8_t *buf = ctx->curr_payload;
+ uint64_t offset = ctx->io_unit_offset;
+ uint64_t length = ctx->io_units_remaining;
+ uint64_t op_length;
+
+ if (bserrno != 0 || ctx->io_units_remaining == 0) {
+ bs_sequence_finish(ctx->seq, bserrno);
+ free(ctx);
+ return;
+ }
+
+ op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob,
+ offset));
+
+ /* Update length and payload for next operation */
+ ctx->io_units_remaining -= op_length;
+ ctx->io_unit_offset += op_length;
+ if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) {
+ ctx->curr_payload += op_length * blob->bs->io_unit_size;
+ }
+
+ switch (op_type) {
+ case SPDK_BLOB_READ:
+ spdk_blob_io_read(blob, ch, buf, offset, op_length,
+ blob_request_submit_op_split_next, ctx);
+ break;
+ case SPDK_BLOB_WRITE:
+ spdk_blob_io_write(blob, ch, buf, offset, op_length,
+ blob_request_submit_op_split_next, ctx);
+ break;
+ case SPDK_BLOB_UNMAP:
+ spdk_blob_io_unmap(blob, ch, offset, op_length,
+ blob_request_submit_op_split_next, ctx);
+ break;
+ case SPDK_BLOB_WRITE_ZEROES:
+ spdk_blob_io_write_zeroes(blob, ch, offset, op_length,
+ blob_request_submit_op_split_next, ctx);
+ break;
+ case SPDK_BLOB_READV:
+ case SPDK_BLOB_WRITEV:
+ SPDK_ERRLOG("readv/writev not valid\n");
+ bs_sequence_finish(ctx->seq, -EINVAL);
+ free(ctx);
+ break;
+ }
+}
+
+static void
+blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob,
+ void *payload, uint64_t offset, uint64_t length,
+ spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
+{
+ struct op_split_ctx *ctx;
+ spdk_bs_sequence_t *seq;
+ struct spdk_bs_cpl cpl;
+
+ assert(blob != NULL);
+
+ ctx = calloc(1, sizeof(struct op_split_ctx));
+ if (ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ cpl.u.blob_basic.cb_fn = cb_fn;
+ cpl.u.blob_basic.cb_arg = cb_arg;
+
+ seq = bs_sequence_start(ch, &cpl);
+ if (!seq) {
+ free(ctx);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->blob = blob;
+ ctx->channel = ch;
+ ctx->curr_payload = payload;
+ ctx->io_unit_offset = offset;
+ ctx->io_units_remaining = length;
+ ctx->op_type = op_type;
+ ctx->seq = seq;
+
+ blob_request_submit_op_split_next(ctx, 0);
+}
+
+static void
+blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob,
+ void *payload, uint64_t offset, uint64_t length,
+ spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
+{
+ struct spdk_bs_cpl cpl;
+ uint64_t lba;
+ uint32_t lba_count;
+
+ assert(blob != NULL);
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ cpl.u.blob_basic.cb_fn = cb_fn;
+ cpl.u.blob_basic.cb_arg = cb_arg;
+
+ blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);
+
+ if (blob->frozen_refcnt) {
+ /* This blob I/O is frozen */
+ spdk_bs_user_op_t *op;
+ struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch);
+
+ op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
+ if (!op) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);
+
+ return;
+ }
+
+ switch (op_type) {
+ case SPDK_BLOB_READ: {
+ spdk_bs_batch_t *batch;
+
+ batch = bs_batch_open(_ch, &cpl);
+ if (!batch) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ if (bs_io_unit_is_allocated(blob, offset)) {
+ /* Read from the blob */
+ bs_batch_read_dev(batch, payload, lba, lba_count);
+ } else {
+ /* Read from the backing block device */
+ bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count);
+ }
+
+ bs_batch_close(batch);
+ break;
+ }
+ case SPDK_BLOB_WRITE:
+ case SPDK_BLOB_WRITE_ZEROES: {
+ if (bs_io_unit_is_allocated(blob, offset)) {
+ /* Write to the blob */
+ spdk_bs_batch_t *batch;
+
+ if (lba_count == 0) {
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ batch = bs_batch_open(_ch, &cpl);
+ if (!batch) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ if (op_type == SPDK_BLOB_WRITE) {
+ bs_batch_write_dev(batch, payload, lba, lba_count);
+ } else {
+ bs_batch_write_zeroes_dev(batch, lba, lba_count);
+ }
+
+ bs_batch_close(batch);
+ } else {
+ /* Queue this operation and allocate the cluster */
+ spdk_bs_user_op_t *op;
+
+ op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
+ if (!op) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ bs_allocate_and_copy_cluster(blob, _ch, offset, op);
+ }
+ break;
+ }
+ case SPDK_BLOB_UNMAP: {
+ spdk_bs_batch_t *batch;
+
+ batch = bs_batch_open(_ch, &cpl);
+ if (!batch) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ if (bs_io_unit_is_allocated(blob, offset)) {
+ bs_batch_unmap_dev(batch, lba, lba_count);
+ }
+
+ bs_batch_close(batch);
+ break;
+ }
+ case SPDK_BLOB_READV:
+ case SPDK_BLOB_WRITEV:
+ SPDK_ERRLOG("readv/writev not valid\n");
+ cb_fn(cb_arg, -EINVAL);
+ break;
+ }
+}
+
+static void
+blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel,
+ void *payload, uint64_t offset, uint64_t length,
+ spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
+{
+ assert(blob != NULL);
+
+ if (blob->data_ro && op_type != SPDK_BLOB_READ) {
+ cb_fn(cb_arg, -EPERM);
+ return;
+ }
+
+ if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+ if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) {
+ blob_request_submit_op_single(_channel, blob, payload, offset, length,
+ cb_fn, cb_arg, op_type);
+ } else {
+ blob_request_submit_op_split(_channel, blob, payload, offset, length,
+ cb_fn, cb_arg, op_type);
+ }
+}
+
+struct rw_iov_ctx {
+ struct spdk_blob *blob;
+ struct spdk_io_channel *channel;
+ spdk_blob_op_complete cb_fn;
+ void *cb_arg;
+ bool read;
+ int iovcnt;
+ struct iovec *orig_iov;
+ uint64_t io_unit_offset;
+ uint64_t io_units_remaining;
+ uint64_t io_units_done;
+ struct iovec iov[0];
+};
+
+static void
+rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ assert(cb_arg == NULL);
+ bs_sequence_finish(seq, bserrno);
+}
+
+static void
+rw_iov_split_next(void *cb_arg, int bserrno)
+{
+ struct rw_iov_ctx *ctx = cb_arg;
+ struct spdk_blob *blob = ctx->blob;
+ struct iovec *iov, *orig_iov;
+ int iovcnt;
+ size_t orig_iovoff;
+ uint64_t io_units_count, io_units_to_boundary, io_unit_offset;
+ uint64_t byte_count;
+
+ if (bserrno != 0 || ctx->io_units_remaining == 0) {
+ ctx->cb_fn(ctx->cb_arg, bserrno);
+ free(ctx);
+ return;
+ }
+
+ io_unit_offset = ctx->io_unit_offset;
+ io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset);
+ io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary);
+ /*
+ * Get the index and offset into the original iov array for our current position in the
+ * I/O sequence. byte_count keeps track of how many bytes remain until orig_iov and
+ * orig_iovoff point to the current position in the I/O sequence.
+ */
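+ /* Hypothetical example: with two original iovs of 8 KiB each and io_units_done
+ * worth 10 KiB, the scan below leaves orig_iov at the second element with
+ * orig_iovoff = 2 KiB. */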
+ byte_count = ctx->io_units_done * blob->bs->io_unit_size;
+ orig_iov = &ctx->orig_iov[0];
+ orig_iovoff = 0;
+ while (byte_count > 0) {
+ if (byte_count >= orig_iov->iov_len) {
+ byte_count -= orig_iov->iov_len;
+ orig_iov++;
+ } else {
+ orig_iovoff = byte_count;
+ byte_count = 0;
+ }
+ }
+
+ /*
+ * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many
+ * bytes of this next I/O remain to be accounted for in the new iov array.
+ */
+ byte_count = io_units_count * blob->bs->io_unit_size;
+ iov = &ctx->iov[0];
+ iovcnt = 0;
+ while (byte_count > 0) {
+ assert(iovcnt < ctx->iovcnt);
+ iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff);
+ iov->iov_base = orig_iov->iov_base + orig_iovoff;
+ byte_count -= iov->iov_len;
+ orig_iovoff = 0;
+ orig_iov++;
+ iov++;
+ iovcnt++;
+ }
+
+ ctx->io_unit_offset += io_units_count;
+ ctx->io_units_remaining -= io_units_count;
+ ctx->io_units_done += io_units_count;
+ iov = &ctx->iov[0];
+
+ if (ctx->read) {
+ spdk_blob_io_readv(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
+ io_units_count, rw_iov_split_next, ctx);
+ } else {
+ spdk_blob_io_writev(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
+ io_units_count, rw_iov_split_next, ctx);
+ }
+}
+
+static void
+blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel,
+ struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
+ spdk_blob_op_complete cb_fn, void *cb_arg, bool read)
+{
+ struct spdk_bs_cpl cpl;
+
+ assert(blob != NULL);
+
+ if (!read && blob->data_ro) {
+ cb_fn(cb_arg, -EPERM);
+ return;
+ }
+
+ if (length == 0) {
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ /*
+ * For now, we implement readv/writev using a sequence (instead of a batch) to account for having
+ * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary,
+ * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster
+ * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need
+ * to allocate a separate iov array and split the I/O such that none of the resulting
+ * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel)
+ * but since this case happens very infrequently, any performance impact will be negligible.
+ *
+ * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs
+ * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them
+ * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called
+ * when the batch was completed, to allow for freeing the memory for the iov arrays.
+ */
+ if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) {
+ uint32_t lba_count;
+ uint64_t lba;
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ cpl.u.blob_basic.cb_fn = cb_fn;
+ cpl.u.blob_basic.cb_arg = cb_arg;
+
+ if (blob->frozen_refcnt) {
+ /* This blob I/O is frozen */
+ enum spdk_blob_op_type op_type;
+ spdk_bs_user_op_t *op;
+ struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel);
+
+ op_type = read ? SPDK_BLOB_READV : SPDK_BLOB_WRITEV;
+ op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length);
+ if (!op) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);
+
+ return;
+ }
+
+ blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);
+
+ if (read) {
+ spdk_bs_sequence_t *seq;
+
+ seq = bs_sequence_start(_channel, &cpl);
+ if (!seq) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ if (bs_io_unit_is_allocated(blob, offset)) {
+ bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL);
+ } else {
+ bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count,
+ rw_iov_done, NULL);
+ }
+ } else {
+ if (bs_io_unit_is_allocated(blob, offset)) {
+ spdk_bs_sequence_t *seq;
+
+ seq = bs_sequence_start(_channel, &cpl);
+ if (!seq) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL);
+ } else {
+ /* Queue this operation and allocate the cluster */
+ spdk_bs_user_op_t *op;
+
+ op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset,
+ length);
+ if (!op) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ bs_allocate_and_copy_cluster(blob, _channel, offset, op);
+ }
+ }
+ } else {
+ struct rw_iov_ctx *ctx;
+
+ ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec));
+ if (ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->blob = blob;
+ ctx->channel = _channel;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ ctx->read = read;
+ ctx->orig_iov = iov;
+ ctx->iovcnt = iovcnt;
+ ctx->io_unit_offset = offset;
+ ctx->io_units_remaining = length;
+ ctx->io_units_done = 0;
+
+ rw_iov_split_next(ctx, 0);
+ }
+}
+
+static struct spdk_blob *
+blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid)
+{
+ struct spdk_blob *blob;
+
+ if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) {
+ return NULL;
+ }
+
+ TAILQ_FOREACH(blob, &bs->blobs, link) {
+ if (blob->id == blobid) {
+ return blob;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+blob_get_snapshot_and_clone_entries(struct spdk_blob *blob,
+ struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry)
+{
+ assert(blob != NULL);
+ *snapshot_entry = NULL;
+ *clone_entry = NULL;
+
+ if (blob->parent_id == SPDK_BLOBID_INVALID) {
+ return;
+ }
+
+ TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) {
+ if ((*snapshot_entry)->id == blob->parent_id) {
+ break;
+ }
+ }
+
+ if (*snapshot_entry != NULL) {
+ TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) {
+ if ((*clone_entry)->id == blob->id) {
+ break;
+ }
+ }
+
+ assert(*clone_entry != NULL);
+ }
+}
+
+static int
+bs_channel_create(void *io_device, void *ctx_buf)
+{
+ struct spdk_blob_store *bs = io_device;
+ struct spdk_bs_channel *channel = ctx_buf;
+ struct spdk_bs_dev *dev;
+ uint32_t max_ops = bs->max_channel_ops;
+ uint32_t i;
+
+ dev = bs->dev;
+
+ channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set));
+ if (!channel->req_mem) {
+ return -1;
+ }
+
+ TAILQ_INIT(&channel->reqs);
+
+ for (i = 0; i < max_ops; i++) {
+ TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link);
+ }
+
+ channel->bs = bs;
+ channel->dev = dev;
+ channel->dev_channel = dev->create_channel(dev);
+
+ if (!channel->dev_channel) {
+ SPDK_ERRLOG("Failed to create device channel.\n");
+ free(channel->req_mem);
+ return -1;
+ }
+
+ TAILQ_INIT(&channel->need_cluster_alloc);
+ TAILQ_INIT(&channel->queued_io);
+
+ return 0;
+}
+
+static void
+bs_channel_destroy(void *io_device, void *ctx_buf)
+{
+ struct spdk_bs_channel *channel = ctx_buf;
+ spdk_bs_user_op_t *op;
+
+ while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) {
+ op = TAILQ_FIRST(&channel->need_cluster_alloc);
+ TAILQ_REMOVE(&channel->need_cluster_alloc, op, link);
+ bs_user_op_abort(op);
+ }
+
+ while (!TAILQ_EMPTY(&channel->queued_io)) {
+ op = TAILQ_FIRST(&channel->queued_io);
+ TAILQ_REMOVE(&channel->queued_io, op, link);
+ bs_user_op_abort(op);
+ }
+
+ free(channel->req_mem);
+ channel->dev->destroy_channel(channel->dev, channel->dev_channel);
+}
+
+static void
+bs_dev_destroy(void *io_device)
+{
+ struct spdk_blob_store *bs = io_device;
+ struct spdk_blob *blob, *blob_tmp;
+
+ bs->dev->destroy(bs->dev);
+
+ TAILQ_FOREACH_SAFE(blob, &bs->blobs, link, blob_tmp) {
+ TAILQ_REMOVE(&bs->blobs, blob, link);
+ spdk_bit_array_clear(bs->open_blobids, blob->id);
+ blob_free(blob);
+ }
+
+ pthread_mutex_destroy(&bs->used_clusters_mutex);
+
+ spdk_bit_array_free(&bs->open_blobids);
+ spdk_bit_array_free(&bs->used_blobids);
+ spdk_bit_array_free(&bs->used_md_pages);
+ spdk_bit_array_free(&bs->used_clusters);
+ /*
+ * If this function is called for any reason except a successful unload,
+ * the unload_cpl type will be NONE and this will be a nop.
+ */
+ bs_call_cpl(&bs->unload_cpl, bs->unload_err);
+
+ free(bs);
+}
+
+static int
+bs_blob_list_add(struct spdk_blob *blob)
+{
+ spdk_blob_id snapshot_id;
+ struct spdk_blob_list *snapshot_entry = NULL;
+ struct spdk_blob_list *clone_entry = NULL;
+
+ assert(blob != NULL);
+
+ snapshot_id = blob->parent_id;
+ if (snapshot_id == SPDK_BLOBID_INVALID) {
+ return 0;
+ }
+
+ snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id);
+ if (snapshot_entry == NULL) {
+ /* Snapshot not found */
+ snapshot_entry = calloc(1, sizeof(struct spdk_blob_list));
+ if (snapshot_entry == NULL) {
+ return -ENOMEM;
+ }
+ snapshot_entry->id = snapshot_id;
+ TAILQ_INIT(&snapshot_entry->clones);
+ TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link);
+ } else {
+ TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
+ if (clone_entry->id == blob->id) {
+ break;
+ }
+ }
+ }
+
+ if (clone_entry == NULL) {
+ /* Clone not found */
+ clone_entry = calloc(1, sizeof(struct spdk_blob_list));
+ if (clone_entry == NULL) {
+ return -ENOMEM;
+ }
+ clone_entry->id = blob->id;
+ TAILQ_INIT(&clone_entry->clones);
+ TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link);
+ snapshot_entry->clone_count++;
+ }
+
+ return 0;
+}
+
+static void
+bs_blob_list_remove(struct spdk_blob *blob)
+{
+ struct spdk_blob_list *snapshot_entry = NULL;
+ struct spdk_blob_list *clone_entry = NULL;
+
+ blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry);
+
+ if (snapshot_entry == NULL) {
+ return;
+ }
+
+ blob->parent_id = SPDK_BLOBID_INVALID;
+ TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
+ free(clone_entry);
+
+ snapshot_entry->clone_count--;
+}
+
+static int
+bs_blob_list_free(struct spdk_blob_store *bs)
+{
+ struct spdk_blob_list *snapshot_entry;
+ struct spdk_blob_list *snapshot_entry_tmp;
+ struct spdk_blob_list *clone_entry;
+ struct spdk_blob_list *clone_entry_tmp;
+
+ TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) {
+ TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) {
+ TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
+ free(clone_entry);
+ }
+ TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link);
+ free(snapshot_entry);
+ }
+
+ return 0;
+}
+
+static void
+bs_free(struct spdk_blob_store *bs)
+{
+ bs_blob_list_free(bs);
+
+ bs_unregister_md_thread(bs);
+ spdk_io_device_unregister(bs, bs_dev_destroy);
+}
+
+void
+spdk_bs_opts_init(struct spdk_bs_opts *opts)
+{
+ opts->cluster_sz = SPDK_BLOB_OPTS_CLUSTER_SZ;
+ opts->num_md_pages = SPDK_BLOB_OPTS_NUM_MD_PAGES;
+ opts->max_md_ops = SPDK_BLOB_OPTS_MAX_MD_OPS;
+ opts->max_channel_ops = SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS;
+ opts->clear_method = BS_CLEAR_WITH_UNMAP;
+ memset(&opts->bstype, 0, sizeof(opts->bstype));
+ opts->iter_cb_fn = NULL;
+ opts->iter_cb_arg = NULL;
+}
+
+static int
+bs_opts_verify(struct spdk_bs_opts *opts)
+{
+ if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 ||
+ opts->max_channel_ops == 0) {
+ SPDK_ERRLOG("Blobstore options cannot be set to 0\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs)
+{
+ struct spdk_blob_store *bs;
+ uint64_t dev_size;
+ int rc;
+
+ dev_size = dev->blocklen * dev->blockcnt;
+ if (dev_size < opts->cluster_sz) {
+ /* Device size cannot be smaller than cluster size of blobstore */
+ SPDK_INFOLOG(SPDK_LOG_BLOB, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n",
+ dev_size, opts->cluster_sz);
+ return -ENOSPC;
+ }
+ if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) {
+ /* Cluster size cannot be smaller than page size */
+ SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n",
+ opts->cluster_sz, SPDK_BS_PAGE_SIZE);
+ return -EINVAL;
+ }
+ bs = calloc(1, sizeof(struct spdk_blob_store));
+ if (!bs) {
+ return -ENOMEM;
+ }
+
+ TAILQ_INIT(&bs->blobs);
+ TAILQ_INIT(&bs->snapshots);
+ bs->dev = dev;
+ bs->md_thread = spdk_get_thread();
+ assert(bs->md_thread != NULL);
+
+ /*
+ * Do not use bs_lba_to_cluster() here since blockcnt may not be an
+ * even multiple of the cluster size.
+ */
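+ /* Hypothetical example: a device holding 10 MiB plus 512 KiB with a 1 MiB cluster
+ * size yields total_clusters = 10; the trailing partial cluster is ignored. */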
+ bs->cluster_sz = opts->cluster_sz;
+ bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen);
+ bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE;
+ if (spdk_u32_is_pow2(bs->pages_per_cluster)) {
+ bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster);
+ }
+ bs->num_free_clusters = bs->total_clusters;
+ bs->used_clusters = spdk_bit_array_create(bs->total_clusters);
+ bs->io_unit_size = dev->blocklen;
+ if (bs->used_clusters == NULL) {
+ free(bs);
+ return -ENOMEM;
+ }
+
+ bs->max_channel_ops = opts->max_channel_ops;
+ bs->super_blob = SPDK_BLOBID_INVALID;
+ memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype));
+
+ /* The metadata is assumed to be at least 1 page */
+ bs->used_md_pages = spdk_bit_array_create(1);
+ bs->used_blobids = spdk_bit_array_create(0);
+ bs->open_blobids = spdk_bit_array_create(0);
+
+ pthread_mutex_init(&bs->used_clusters_mutex, NULL);
+
+ spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy,
+ sizeof(struct spdk_bs_channel), "blobstore");
+ rc = bs_register_md_thread(bs);
+ if (rc == -1) {
+ spdk_io_device_unregister(bs, NULL);
+ pthread_mutex_destroy(&bs->used_clusters_mutex);
+ spdk_bit_array_free(&bs->open_blobids);
+ spdk_bit_array_free(&bs->used_blobids);
+ spdk_bit_array_free(&bs->used_md_pages);
+ spdk_bit_array_free(&bs->used_clusters);
+ free(bs);
+ /* FIXME: this is a lie but don't know how to get a proper error code here */
+ return -ENOMEM;
+ }
+
+ *_bs = bs;
+ return 0;
+}
+
+/* START spdk_bs_load, spdk_bs_load_ctx will be used for both load and unload. */
+
+struct spdk_bs_load_ctx {
+ struct spdk_blob_store *bs;
+ struct spdk_bs_super_block *super;
+
+ struct spdk_bs_md_mask *mask;
+ bool in_page_chain;
+ uint32_t page_index;
+ uint32_t cur_page;
+ struct spdk_blob_md_page *page;
+
+ uint64_t num_extent_pages;
+ uint32_t *extent_page_num;
+ struct spdk_blob_md_page *extent_pages;
+
+ spdk_bs_sequence_t *seq;
+ spdk_blob_op_with_handle_complete iter_cb_fn;
+ void *iter_cb_arg;
+ struct spdk_blob *blob;
+ spdk_blob_id blobid;
+};
+
+static void
+bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno)
+{
+ assert(bserrno != 0);
+
+ spdk_free(ctx->super);
+ bs_sequence_finish(ctx->seq, bserrno);
+ bs_free(ctx->bs);
+ free(ctx);
+}
+
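+/* Serialize a bit array into the on-disk mask format: bit i of the array is stored as
+ * bit (i % 8) of byte (i / 8) in mask->mask. bs_load_mask() below performs the inverse
+ * transformation when loading. */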
+static void
+bs_set_mask(struct spdk_bit_array *array, struct spdk_bs_md_mask *mask)
+{
+ uint32_t i = 0;
+
+ while (true) {
+ i = spdk_bit_array_find_first_set(array, i);
+ if (i >= mask->length) {
+ break;
+ }
+ mask->mask[i / 8] |= 1U << (i % 8);
+ i++;
+ }
+}
+
+static int
+bs_load_mask(struct spdk_bit_array **array_ptr, struct spdk_bs_md_mask *mask)
+{
+ struct spdk_bit_array *array;
+ uint32_t i;
+
+ if (spdk_bit_array_resize(array_ptr, mask->length) < 0) {
+ return -ENOMEM;
+ }
+
+ array = *array_ptr;
+ for (i = 0; i < mask->length; i++) {
+ if (mask->mask[i / 8] & (1U << (i % 8))) {
+ spdk_bit_array_set(array, i);
+ }
+ }
+
+ return 0;
+}
+
+static void
+bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
+ struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ /* Update the values in the super block */
+ super->super_blob = bs->super_blob;
+ memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype));
+ super->crc = blob_md_page_calc_crc(super);
+ bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0),
+ bs_byte_to_lba(bs, sizeof(*super)),
+ cb_fn, cb_arg);
+}
+
+static void
+bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
+{
+ struct spdk_bs_load_ctx *ctx = arg;
+ uint64_t mask_size, lba, lba_count;
+
+ /* Write out the used clusters mask */
+ mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
+ ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->mask) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS;
+ ctx->mask->length = ctx->bs->total_clusters;
+ assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_clusters));
+
+ bs_set_mask(ctx->bs->used_clusters, ctx->mask);
+ lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
+ lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
+ bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
+}
+
+static void
+bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
+{
+ struct spdk_bs_load_ctx *ctx = arg;
+ uint64_t mask_size, lba, lba_count;
+
+ mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
+ ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->mask) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES;
+ ctx->mask->length = ctx->super->md_len;
+ assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages));
+
+ bs_set_mask(ctx->bs->used_md_pages, ctx->mask);
+ lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
+ lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
+ bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
+}
+
+static void
+bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
+{
+ struct spdk_bs_load_ctx *ctx = arg;
+ uint64_t mask_size, lba, lba_count;
+
+ if (ctx->super->used_blobid_mask_len == 0) {
+ /*
+ * This is a pre-v3 on-disk format where the blobid mask does not get
+ * written to disk.
+ */
+ cb_fn(seq, arg, 0);
+ return;
+ }
+
+ mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
+ ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (!ctx->mask) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS;
+ ctx->mask->length = ctx->super->md_len;
+ assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids));
+
+ bs_set_mask(ctx->bs->used_blobids, ctx->mask);
+ lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
+ lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
+ bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
+}
+
+static void
+blob_set_thin_provision(struct spdk_blob *blob)
+{
+ blob_verify_md_op(blob);
+ blob->invalid_flags |= SPDK_BLOB_THIN_PROV;
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+}
+
+static void
+blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method)
+{
+ blob_verify_md_op(blob);
+ blob->clear_method = clear_method;
+ blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT);
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+}
+
+static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno);
+
+static void
+bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+ spdk_blob_id id;
+ int64_t page_num;
+
+ /* Iterate to the next blob (we can't use the spdk_bs_iter_next function since
+ * our last blob has been removed). */
+ page_num = bs_blobid_to_page(ctx->blobid);
+ page_num++;
+ page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num);
+ if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) {
+ bs_load_iter(ctx, NULL, -ENOENT);
+ return;
+ }
+
+ id = bs_page_to_blobid(page_num);
+
+ spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx);
+}
+
+static void
+bs_delete_corrupted_close_cb(void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ SPDK_ERRLOG("Failed to close corrupted blob\n");
+ spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+ return;
+ }
+
+ spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx);
+}
+
+static void
+bs_delete_corrupted_blob(void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+ uint64_t i;
+
+ if (bserrno != 0) {
+ SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
+ spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+ return;
+ }
+
+ /* The snapshot and the clone hold the same copy of the cluster map and extent
+ * pages at this point. Clear both for the snapshot now, so that they won't be
+ * cleared for the clone later when we remove the snapshot.
+ * Also set thin provisioning to pass the data corruption check. */
+ for (i = 0; i < ctx->blob->active.num_clusters; i++) {
+ ctx->blob->active.clusters[i] = 0;
+ }
+ for (i = 0; i < ctx->blob->active.num_extent_pages; i++) {
+ ctx->blob->active.extent_pages[i] = 0;
+ }
+
+ ctx->blob->md_ro = false;
+
+ blob_set_thin_provision(ctx->blob);
+
+ ctx->blobid = ctx->blob->id;
+
+ spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx);
+}
+
+static void
+bs_update_corrupted_blob(void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
+ spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+ return;
+ }
+
+ ctx->blob->md_ro = false;
+ blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true);
+ blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true);
+ spdk_blob_set_read_only(ctx->blob);
+
+ if (ctx->iter_cb_fn) {
+ ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0);
+ }
+ bs_blob_list_add(ctx->blob);
+
+ spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+}
+
+static void
+bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ SPDK_ERRLOG("Failed to open clone of a corrupted blob\n");
+ spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
+ return;
+ }
+
+ if (blob->parent_id == ctx->blob->id) {
+ /* Power failure occurred before updating the clone (snapshot delete case)
+ * or after updating the clone (creating snapshot case) - keep snapshot */
+ spdk_blob_close(blob, bs_update_corrupted_blob, ctx);
+ } else {
+ /* Power failure occurred after updating the clone (snapshot delete case)
+ * or before updating the clone (creating snapshot case) - remove snapshot */
+ spdk_blob_close(blob, bs_delete_corrupted_blob, ctx);
+ }
+}
+
+static void
+bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = arg;
+ const void *value;
+ size_t len;
+ int rc = 0;
+
+ if (bserrno == 0) {
+ /* Examine the blob to see if it was corrupted by a power failure. Fix the
+ * ones that can be fixed and remove any other corrupted ones. If it is not
+ * corrupted, just process it. */
+ rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true);
+ if (rc != 0) {
+ rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true);
+ if (rc != 0) {
+ /* Not corrupted - process it and continue with iterating through blobs */
+ if (ctx->iter_cb_fn) {
+ ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0);
+ }
+ bs_blob_list_add(blob);
+ spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx);
+ return;
+ }
+
+ }
+
+ assert(len == sizeof(spdk_blob_id));
+
+ ctx->blob = blob;
+
+ /* Open clone to check if we are able to fix this blob or should we remove it */
+ spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx);
+ return;
+ } else if (bserrno == -ENOENT) {
+ bserrno = 0;
+ } else {
+ /*
+ * This case needs to be looked at further. Same problem
+ * exists with applications that rely on explicit blob
+ * iteration. We should just skip the blob that failed
+ * to load and continue on to the next one.
+ */
+ SPDK_ERRLOG("Error in iterating blobs\n");
+ }
+
+ ctx->iter_cb_fn = NULL;
+
+ spdk_free(ctx->super);
+ spdk_free(ctx->mask);
+ bs_sequence_finish(ctx->seq, bserrno);
+ free(ctx);
+}
+
+static void
+bs_load_complete(struct spdk_bs_load_ctx *ctx)
+{
+ spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx);
+}
+
+static void
+bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+ int rc;
+
+ /* The type must be correct */
+ assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS);
+
+ /* The length of the mask (in bits) must not be greater than
+ * the length of the buffer (converted to bits) */
+ assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8));
+
+ /* The length of the mask must be exactly equal to the size
+ * (in pages) of the metadata region */
+ assert(ctx->mask->length == ctx->super->md_len);
+
+ rc = bs_load_mask(&ctx->bs->used_blobids, ctx->mask);
+ if (rc < 0) {
+ spdk_free(ctx->mask);
+ bs_load_ctx_fail(ctx, rc);
+ return;
+ }
+
+ bs_load_complete(ctx);
+}
+
+static void
+bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+ uint64_t lba, lba_count, mask_size;
+ int rc;
+
+ if (bserrno != 0) {
+ bs_load_ctx_fail(ctx, bserrno);
+ return;
+ }
+
+ /* The type must be correct */
+ assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
+ /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
+ assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
+ struct spdk_blob_md_page) * 8));
+ /* The length of the mask must be exactly equal to the total number of clusters */
+ assert(ctx->mask->length == ctx->bs->total_clusters);
+
+ rc = bs_load_mask(&ctx->bs->used_clusters, ctx->mask);
+ if (rc < 0) {
+ spdk_free(ctx->mask);
+ bs_load_ctx_fail(ctx, rc);
+ return;
+ }
+
+ ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->bs->used_clusters);
+ assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters);
+
+ spdk_free(ctx->mask);
+
+ /* Read the used blobids mask */
+ mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
+ ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (!ctx->mask) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+ lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
+ lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
+ bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
+ bs_load_used_blobids_cpl, ctx);
+}
+
+static void
+bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+ uint64_t lba, lba_count, mask_size;
+ int rc;
+
+ if (bserrno != 0) {
+ bs_load_ctx_fail(ctx, bserrno);
+ return;
+ }
+
+ /* The type must be correct */
+ assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES);
+ /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
+ assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE *
+ 8));
+ /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */
+ assert(ctx->mask->length == ctx->super->md_len);
+
+ rc = bs_load_mask(&ctx->bs->used_md_pages, ctx->mask);
+ if (rc < 0) {
+ spdk_free(ctx->mask);
+ bs_load_ctx_fail(ctx, rc);
+ return;
+ }
+
+ spdk_free(ctx->mask);
+
+ /* Read the used clusters mask */
+ mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
+ ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (!ctx->mask) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+ lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
+ lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
+ bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
+ bs_load_used_clusters_cpl, ctx);
+}
+
+static void
+bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx)
+{
+ uint64_t lba, lba_count, mask_size;
+
+ /* Read the used pages mask */
+ mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
+ ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->mask) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
+ lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
+ bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
+ bs_load_used_pages_cpl, ctx);
+}
+
+static int
+bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page)
+{
+ struct spdk_blob_store *bs = ctx->bs;
+ struct spdk_blob_md_descriptor *desc;
+ size_t cur_desc = 0;
+
+ desc = (struct spdk_blob_md_descriptor *)page->descriptors;
+ while (cur_desc < sizeof(page->descriptors)) {
+ if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
+ if (desc->length == 0) {
+ /* If padding and length are 0, this terminates the page */
+ break;
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
+ struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
+ unsigned int i, j;
+ unsigned int cluster_count = 0;
+ uint32_t cluster_idx;
+
+ desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
+
+ for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
+ for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
+ cluster_idx = desc_extent_rle->extents[i].cluster_idx;
+ /*
+ * cluster_idx = 0 means an unallocated cluster - don't mark that
+ * in the used cluster map.
+ */
+ if (cluster_idx != 0) {
+ spdk_bit_array_set(bs->used_clusters, cluster_idx + j);
+ if (bs->num_free_clusters == 0) {
+ return -ENOSPC;
+ }
+ bs->num_free_clusters--;
+ }
+ cluster_count++;
+ }
+ }
+ if (cluster_count == 0) {
+ return -EINVAL;
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
+ struct spdk_blob_md_descriptor_extent_page *desc_extent;
+ uint32_t i;
+ uint32_t cluster_count = 0;
+ uint32_t cluster_idx;
+ size_t cluster_idx_length;
+
+ desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
+ cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);
+
+ if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
+ (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
+ return -EINVAL;
+ }
+
+ for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
+ cluster_idx = desc_extent->cluster_idx[i];
+ /*
+ * cluster_idx = 0 means an unallocated cluster - don't mark that
+ * in the used cluster map.
+ */
+ if (cluster_idx != 0) {
+ if (cluster_idx < desc_extent->start_cluster_idx &&
+ cluster_idx >= desc_extent->start_cluster_idx + cluster_count) {
+ return -EINVAL;
+ }
+ spdk_bit_array_set(bs->used_clusters, cluster_idx);
+ if (bs->num_free_clusters == 0) {
+ return -ENOSPC;
+ }
+ bs->num_free_clusters--;
+ }
+ cluster_count++;
+ }
+
+ if (cluster_count == 0) {
+ return -EINVAL;
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
+ /* Skip this item */
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
+ /* Skip this item */
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
+ /* Skip this item */
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
+ struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
+ uint32_t num_extent_pages = ctx->num_extent_pages;
+ uint32_t i;
+ size_t extent_pages_length;
+ void *tmp;
+
+ desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
+ extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);
+
+ if (desc_extent_table->length == 0 ||
+ (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
+ return -EINVAL;
+ }
+
+ for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
+ if (desc_extent_table->extent_page[i].page_idx != 0) {
+ if (desc_extent_table->extent_page[i].num_pages != 1) {
+ return -EINVAL;
+ }
+ num_extent_pages += 1;
+ }
+ }
+
+ if (num_extent_pages > 0) {
+ tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t));
+ if (tmp == NULL) {
+ return -ENOMEM;
+ }
+ ctx->extent_page_num = tmp;
+
+ /* Extent table entries contain md page numbers for extent pages.
+ * Zeroes represent unallocated extent pages; those are run-length-encoded.
+ */
+ for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
+ if (desc_extent_table->extent_page[i].page_idx != 0) {
+ ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx;
+ ctx->num_extent_pages += 1;
+ }
+ }
+ }
+ } else {
+ /* Error */
+ return -EINVAL;
+ }
+ /* Advance to the next descriptor */
+ cur_desc += sizeof(*desc) + desc->length;
+ if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
+ break;
+ }
+ desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
+ }
+ return 0;
+}
+
+static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page)
+{
+ uint32_t crc;
+ struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors;
+ size_t desc_len;
+
+ crc = blob_md_page_calc_crc(page);
+ if (crc != page->crc) {
+ return false;
+ }
+
+ /* An extent page should always have sequence_num 0. */
+ if (page->sequence_num != 0) {
+ return false;
+ }
+
+ /* Descriptor type must be EXTENT_PAGE. */
+ if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
+ return false;
+ }
+
+ /* Descriptor length cannot exceed the page. */
+ desc_len = sizeof(*desc) + desc->length;
+ if (desc_len > sizeof(page->descriptors)) {
+ return false;
+ }
+
+ /* It has to be the only descriptor in the page. */
+ if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) {
+ desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len);
+ if (desc->length != 0) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx)
+{
+ uint32_t crc;
+ struct spdk_blob_md_page *page = ctx->page;
+
+ crc = blob_md_page_calc_crc(page);
+ if (crc != page->crc) {
+ return false;
+ }
+
+ /* First page of a sequence should match the blobid. */
+ if (page->sequence_num == 0 &&
+ bs_page_to_blobid(ctx->cur_page) != page->id) {
+ return false;
+ }
+ assert(bs_load_cur_extent_page_valid(page) == false);
+
+ return true;
+}
+
+static void
+bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx);
+
+static void
+bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ bs_load_ctx_fail(ctx, bserrno);
+ return;
+ }
+
+ bs_load_complete(ctx);
+}
+
+static void
+bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ spdk_free(ctx->mask);
+ ctx->mask = NULL;
+
+ if (bserrno != 0) {
+ bs_load_ctx_fail(ctx, bserrno);
+ return;
+ }
+
+ bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl);
+}
+
+static void
+bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ spdk_free(ctx->mask);
+ ctx->mask = NULL;
+
+ if (bserrno != 0) {
+ bs_load_ctx_fail(ctx, bserrno);
+ return;
+ }
+
+ bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl);
+}
+
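+/* Once replay completes, persist the reconstructed masks in order:
+ * used md pages, then used blobids, then used clusters (see the cpl chain above).
+ */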
+static void
+bs_load_write_used_md(struct spdk_bs_load_ctx *ctx)
+{
+ bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl);
+}
+
+static void
+bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx)
+{
+ uint64_t num_md_clusters;
+ uint64_t i;
+
+ ctx->in_page_chain = false;
+
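+ /* Skip md pages that were already claimed while replaying previous chains. */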
+ do {
+ ctx->page_index++;
+ } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true);
+
+ if (ctx->page_index < ctx->super->md_len) {
+ ctx->cur_page = ctx->page_index;
+ bs_load_replay_cur_md_page(ctx);
+ } else {
+ /* Claim all of the clusters used by the metadata */
+ num_md_clusters = spdk_divide_round_up(ctx->super->md_len, ctx->bs->pages_per_cluster);
+ for (i = 0; i < num_md_clusters; i++) {
+ bs_claim_cluster(ctx->bs, i);
+ }
+ spdk_free(ctx->page);
+ bs_load_write_used_md(ctx);
+ }
+}
+
+static void
+bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+ uint32_t page_num;
+ uint64_t i;
+
+ if (bserrno != 0) {
+ spdk_free(ctx->extent_pages);
+ bs_load_ctx_fail(ctx, bserrno);
+ return;
+ }
+
+ for (i = 0; i < ctx->num_extent_pages; i++) {
+ /* Extent pages are only read when referenced from within an md chain.
+ * The md is not consistent if such a page is not a valid extent page. */
+ if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) {
+ spdk_free(ctx->extent_pages);
+ bs_load_ctx_fail(ctx, -EILSEQ);
+ return;
+ }
+
+ page_num = ctx->extent_page_num[i];
+ spdk_bit_array_set(ctx->bs->used_md_pages, page_num);
+ if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) {
+ spdk_free(ctx->extent_pages);
+ bs_load_ctx_fail(ctx, -EILSEQ);
+ return;
+ }
+ }
+
+ spdk_free(ctx->extent_pages);
+ free(ctx->extent_page_num);
+ ctx->extent_page_num = NULL;
+ ctx->num_extent_pages = 0;
+
+ bs_load_replay_md_chain_cpl(ctx);
+}
+
+static void
+bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx)
+{
+ spdk_bs_batch_t *batch;
+ uint32_t page;
+ uint64_t lba;
+ uint64_t i;
+
+ ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, SPDK_BS_PAGE_SIZE,
+ NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->extent_pages) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx);
+
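+ /* Queue a read for each extent page referenced by the chain that was just replayed. */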
+ for (i = 0; i < ctx->num_extent_pages; i++) {
+ page = ctx->extent_page_num[i];
+ assert(page < ctx->super->md_len);
+ lba = bs_md_page_to_lba(ctx->bs, page);
+ bs_batch_read_dev(batch, &ctx->extent_pages[i], lba,
+ bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE));
+ }
+
+ bs_batch_close(batch);
+}
+
+static void
+bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+ uint32_t page_num;
+ struct spdk_blob_md_page *page;
+
+ if (bserrno != 0) {
+ bs_load_ctx_fail(ctx, bserrno);
+ return;
+ }
+
+ page_num = ctx->cur_page;
+ page = ctx->page;
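+ /* Replay a page only if it starts a new blob (sequence_num == 0) or was
+ * reached by following a chain's next pointer. */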
+ if (bs_load_cur_md_page_valid(ctx) == true) {
+ if (page->sequence_num == 0 || ctx->in_page_chain == true) {
+ bs_claim_md_page(ctx->bs, page_num);
+ if (page->sequence_num == 0) {
+ spdk_bit_array_set(ctx->bs->used_blobids, page_num);
+ }
+ if (bs_load_replay_md_parse_page(ctx, page)) {
+ bs_load_ctx_fail(ctx, -EILSEQ);
+ return;
+ }
+ if (page->next != SPDK_INVALID_MD_PAGE) {
+ ctx->in_page_chain = true;
+ ctx->cur_page = page->next;
+ bs_load_replay_cur_md_page(ctx);
+ return;
+ }
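+ /* The whole chain has been replayed; now read any extent pages it referenced. */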
+ if (ctx->num_extent_pages != 0) {
+ bs_load_replay_extent_pages(ctx);
+ return;
+ }
+ }
+ }
+ bs_load_replay_md_chain_cpl(ctx);
+}
+
+static void
+bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx)
+{
+ uint64_t lba;
+
+ assert(ctx->cur_page < ctx->super->md_len);
+ lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page);
+ bs_sequence_read_dev(ctx->seq, ctx->page, lba,
+ bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
+ bs_load_replay_md_cpl, ctx);
+}
+
+static void
+bs_load_replay_md(struct spdk_bs_load_ctx *ctx)
+{
+ ctx->page_index = 0;
+ ctx->cur_page = 0;
+ ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE,
+ NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->page) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+ bs_load_replay_cur_md_page(ctx);
+}
+
+static void
+bs_recover(struct spdk_bs_load_ctx *ctx)
+{
+ int rc;
+
+ rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len);
+ if (rc < 0) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len);
+ if (rc < 0) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters);
+ if (rc < 0) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len);
+ if (rc < 0) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+
+ ctx->bs->num_free_clusters = ctx->bs->total_clusters;
+ bs_load_replay_md(ctx);
+}
+
+static void
+bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+ uint32_t crc;
+ int rc;
+ static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];
+
+ if (ctx->super->version > SPDK_BS_VERSION ||
+ ctx->super->version < SPDK_BS_INITIAL_VERSION) {
+ bs_load_ctx_fail(ctx, -EILSEQ);
+ return;
+ }
+
+ if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
+ sizeof(ctx->super->signature)) != 0) {
+ bs_load_ctx_fail(ctx, -EILSEQ);
+ return;
+ }
+
+ crc = blob_md_page_calc_crc(ctx->super);
+ if (crc != ctx->super->crc) {
+ bs_load_ctx_fail(ctx, -EILSEQ);
+ return;
+ }
+
+ if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype matched - loading blobstore\n");
+ } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype wildcard used - loading blobstore regardless of bstype\n");
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Unexpected bstype\n");
+ SPDK_LOGDUMP(SPDK_LOG_BLOB, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
+ SPDK_LOGDUMP(SPDK_LOG_BLOB, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
+ bs_load_ctx_fail(ctx, -ENXIO);
+ return;
+ }
+
+ if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) {
+ SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n",
+ ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size);
+ bs_load_ctx_fail(ctx, -EILSEQ);
+ return;
+ }
+
+ if (ctx->super->size == 0) {
+ ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
+ }
+
+ if (ctx->super->io_unit_size == 0) {
+ ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
+ }
+
+ /* Parse the super block */
+ ctx->bs->clean = 1;
+ ctx->bs->cluster_sz = ctx->super->cluster_size;
+ ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
+ ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE;
+ if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) {
+ ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster);
+ }
+ ctx->bs->io_unit_size = ctx->super->io_unit_size;
+ rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters);
+ if (rc < 0) {
+ bs_load_ctx_fail(ctx, -ENOMEM);
+ return;
+ }
+ ctx->bs->md_start = ctx->super->md_start;
+ ctx->bs->md_len = ctx->super->md_len;
+ ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
+ ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
+ ctx->bs->super_blob = ctx->super->super_blob;
+ memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));
+
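+ /* The on-disk masks can only be trusted after a clean shutdown of a blobstore
+ * that recorded a blobid mask; otherwise replay all metadata. */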
+ if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) {
+ bs_recover(ctx);
+ } else {
+ bs_load_read_used_pages(ctx);
+ }
+}
+
+void
+spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
+ spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_blob_store *bs;
+ struct spdk_bs_cpl cpl;
+ struct spdk_bs_load_ctx *ctx;
+ struct spdk_bs_opts opts = {};
+ int err;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Loading blobstore from dev %p\n", dev);
+
+ if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "unsupported dev block length of %d\n", dev->blocklen);
+ dev->destroy(dev);
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ if (o) {
+ opts = *o;
+ } else {
+ spdk_bs_opts_init(&opts);
+ }
+
+ if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
+ dev->destroy(dev);
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ err = bs_alloc(dev, &opts, &bs);
+ if (err) {
+ dev->destroy(dev);
+ cb_fn(cb_arg, NULL, err);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ ctx->bs = bs;
+ ctx->iter_cb_fn = opts.iter_cb_fn;
+ ctx->iter_cb_arg = opts.iter_cb_arg;
+
+ /* Allocate memory for the super block */
+ ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->super) {
+ free(ctx);
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
+ cpl.u.bs_handle.cb_fn = cb_fn;
+ cpl.u.bs_handle.cb_arg = cb_arg;
+ cpl.u.bs_handle.bs = bs;
+
+ ctx->seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!ctx->seq) {
+ spdk_free(ctx->super);
+ free(ctx);
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ /* Read the super block */
+ bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
+ bs_byte_to_lba(bs, sizeof(*ctx->super)),
+ bs_load_super_cpl, ctx);
+}
+
+/* END spdk_bs_load */
+
+/* START spdk_bs_dump */
+
+struct spdk_bs_dump_ctx {
+ struct spdk_blob_store *bs;
+ struct spdk_bs_super_block *super;
+ uint32_t cur_page;
+ struct spdk_blob_md_page *page;
+ spdk_bs_sequence_t *seq;
+ FILE *fp;
+ spdk_bs_dump_print_xattr print_xattr_fn;
+ char xattr_name[4096];
+};
+
+static void
+bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_dump_ctx *ctx, int bserrno)
+{
+ spdk_free(ctx->super);
+
+ /*
+ * We need to defer calling bs_call_cpl() until after
+ * dev destruction, so tuck these away for later use.
+ */
+ ctx->bs->unload_err = bserrno;
+ memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
+ seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
+
+ bs_sequence_finish(seq, 0);
+ bs_free(ctx->bs);
+ free(ctx);
+}
+
+static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg);
+
+static void
+bs_dump_print_md_page(struct spdk_bs_dump_ctx *ctx)
+{
+ uint32_t page_idx = ctx->cur_page;
+ struct spdk_blob_md_page *page = ctx->page;
+ struct spdk_blob_md_descriptor *desc;
+ size_t cur_desc = 0;
+ uint32_t crc;
+
+ fprintf(ctx->fp, "=========\n");
+ fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx);
+ fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id);
+
+ crc = blob_md_page_calc_crc(page);
+ fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? "OK" : "Mismatch");
+
+ desc = (struct spdk_blob_md_descriptor *)page->descriptors;
+ while (cur_desc < sizeof(page->descriptors)) {
+ if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
+ if (desc->length == 0) {
+ /* If padding and length are 0, this terminates the page */
+ break;
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
+ struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
+ unsigned int i;
+
+ desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
+
+ for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
+ if (desc_extent_rle->extents[i].cluster_idx != 0) {
+ fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
+ desc_extent_rle->extents[i].cluster_idx);
+ } else {
+ fprintf(ctx->fp, "Unallocated Extent - ");
+ }
+ fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length);
+ fprintf(ctx->fp, "\n");
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
+ struct spdk_blob_md_descriptor_extent_page *desc_extent;
+ unsigned int i;
+
+ desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
+
+ for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) {
+ if (desc_extent->cluster_idx[i] != 0) {
+ fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
+ desc_extent->cluster_idx[i]);
+ } else {
+ fprintf(ctx->fp, "Unallocated Extent");
+ }
+ fprintf(ctx->fp, "\n");
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
+ struct spdk_blob_md_descriptor_xattr *desc_xattr;
+ uint32_t i;
+
+ desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc;
+
+ if (desc_xattr->length !=
+ sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) +
+ desc_xattr->name_length + desc_xattr->value_length) {
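+ /* Length mismatch in this xattr descriptor; nothing is reported here and the xattr is dumped anyway. */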
+ }
+
+ memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length);
+ ctx->xattr_name[desc_xattr->name_length] = '\0';
+ fprintf(ctx->fp, "XATTR: name = \"%s\"\n", ctx->xattr_name);
+ fprintf(ctx->fp, " value = \"");
+ ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name,
+ (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
+ desc_xattr->value_length);
+ fprintf(ctx->fp, "\"\n");
+ for (i = 0; i < desc_xattr->value_length; i++) {
+ if (i % 16 == 0) {
+ fprintf(ctx->fp, " ");
+ }
+ fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i));
+ if ((i + 1) % 16 == 0) {
+ fprintf(ctx->fp, "\n");
+ }
+ }
+ if (i % 16 != 0) {
+ fprintf(ctx->fp, "\n");
+ }
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
+ /* TODO */
+ } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
+ /* TODO */
+ } else {
+ /* Error */
+ }
+ /* Advance to the next descriptor */
+ cur_desc += sizeof(*desc) + desc->length;
+ if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
+ break;
+ }
+ desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
+ }
+}
+
+static void
+bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_dump_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ bs_dump_finish(seq, ctx, bserrno);
+ return;
+ }
+
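+ /* Only pages that belong to a blob (non-zero id) are printed. */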
+ if (ctx->page->id != 0) {
+ bs_dump_print_md_page(ctx);
+ }
+
+ ctx->cur_page++;
+
+ if (ctx->cur_page < ctx->super->md_len) {
+ bs_dump_read_md_page(seq, ctx);
+ } else {
+ spdk_free(ctx->page);
+ bs_dump_finish(seq, ctx, 0);
+ }
+}
+
+static void
+bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg)
+{
+ struct spdk_bs_dump_ctx *ctx = cb_arg;
+ uint64_t lba;
+
+ assert(ctx->cur_page < ctx->super->md_len);
+ lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page);
+ bs_sequence_read_dev(seq, ctx->page, lba,
+ bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
+ bs_dump_read_md_page_cpl, ctx);
+}
+
+static void
+bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_dump_ctx *ctx = cb_arg;
+
+ fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature);
+ if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
+ sizeof(ctx->super->signature)) != 0) {
+ fprintf(ctx->fp, "(Mismatch)\n");
+ bs_dump_finish(seq, ctx, bserrno);
+ return;
+ } else {
+ fprintf(ctx->fp, "(OK)\n");
+ }
+ fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version);
+ fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc,
+ (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch");
+ fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype);
+ fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size);
+ fprintf(ctx->fp, "Super Blob ID: ");
+ if (ctx->super->super_blob == SPDK_BLOBID_INVALID) {
+ fprintf(ctx->fp, "(None)\n");
+ } else {
+ fprintf(ctx->fp, "%" PRIu64 "\n", ctx->super->super_blob);
+ }
+ fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean);
+ fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start);
+ fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len);
+ fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start);
+ fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len);
+ fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start);
+ fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len);
+ fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start);
+ fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len);
+
+ ctx->cur_page = 0;
+ ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE,
+ NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->page) {
+ bs_dump_finish(seq, ctx, -ENOMEM);
+ return;
+ }
+ bs_dump_read_md_page(seq, ctx);
+}
+
+void
+spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn,
+ spdk_bs_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_blob_store *bs;
+ struct spdk_bs_cpl cpl;
+ spdk_bs_sequence_t *seq;
+ struct spdk_bs_dump_ctx *ctx;
+ struct spdk_bs_opts opts = {};
+ int err;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Dumping blobstore from dev %p\n", dev);
+
+ spdk_bs_opts_init(&opts);
+
+ err = bs_alloc(dev, &opts, &bs);
+ if (err) {
+ dev->destroy(dev);
+ cb_fn(cb_arg, err);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ bs_free(bs);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->bs = bs;
+ ctx->fp = fp;
+ ctx->print_xattr_fn = print_xattr_fn;
+
+ /* Allocate memory for the super block */
+ ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->super) {
+ free(ctx);
+ bs_free(bs);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
+ cpl.u.bs_basic.cb_fn = cb_fn;
+ cpl.u.bs_basic.cb_arg = cb_arg;
+
+ seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!seq) {
+ spdk_free(ctx->super);
+ free(ctx);
+ bs_free(bs);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ /* Read the super block */
+ bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0),
+ bs_byte_to_lba(bs, sizeof(*ctx->super)),
+ bs_dump_super_cpl, ctx);
+}
+
+/* END spdk_bs_dump */
+
+/* START spdk_bs_init */
+
+struct spdk_bs_init_ctx {
+ struct spdk_blob_store *bs;
+ struct spdk_bs_super_block *super;
+};
+
+static void
+bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_init_ctx *ctx = cb_arg;
+
+ spdk_free(ctx->super);
+ free(ctx);
+
+ bs_sequence_finish(seq, bserrno);
+}
+
+static void
+bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_init_ctx *ctx = cb_arg;
+
+ /* Write super block */
+ bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
+ bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
+ bs_init_persist_super_cpl, ctx);
+}
+
+void
+spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
+ spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_init_ctx *ctx;
+ struct spdk_blob_store *bs;
+ struct spdk_bs_cpl cpl;
+ spdk_bs_sequence_t *seq;
+ spdk_bs_batch_t *batch;
+ uint64_t num_md_lba;
+ uint64_t num_md_pages;
+ uint64_t num_md_clusters;
+ uint32_t i;
+ struct spdk_bs_opts opts = {};
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Initializing blobstore on dev %p\n", dev);
+
+ if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
+ SPDK_ERRLOG("unsupported dev block length of %d\n",
+ dev->blocklen);
+ dev->destroy(dev);
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ if (o) {
+ opts = *o;
+ } else {
+ spdk_bs_opts_init(&opts);
+ }
+
+ if (bs_opts_verify(&opts) != 0) {
+ dev->destroy(dev);
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ rc = bs_alloc(dev, &opts, &bs);
+ if (rc) {
+ dev->destroy(dev);
+ cb_fn(cb_arg, NULL, rc);
+ return;
+ }
+
+ if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) {
+ /* By default, allocate 1 page per cluster.
+ * Technically, this over-allocates metadata
+ * because more metadata will reduce the number
+ * of usable clusters. This can be addressed with
+ * more complex math in the future.
+ */
+ bs->md_len = bs->total_clusters;
+ } else {
+ bs->md_len = opts.num_md_pages;
+ }
+ rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len);
+ if (rc < 0) {
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len);
+ if (rc < 0) {
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len);
+ if (rc < 0) {
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ ctx->bs = bs;
+
+ /* Allocate memory for the super block */
+ ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->super) {
+ free(ctx);
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+ memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
+ sizeof(ctx->super->signature));
+ ctx->super->version = SPDK_BS_VERSION;
+ ctx->super->length = sizeof(*ctx->super);
+ ctx->super->super_blob = bs->super_blob;
+ ctx->super->clean = 0;
+ ctx->super->cluster_size = bs->cluster_sz;
+ ctx->super->io_unit_size = bs->io_unit_size;
+ memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype));
+
+ /* Calculate how many pages the metadata consumes at the front
+ * of the disk.
+ */
+
+ /* The super block uses 1 page */
+ num_md_pages = 1;
+
+ /* The used_md_pages mask requires 1 bit per metadata page, rounded
+ * up to the nearest page, plus a header.
+ */
+ ctx->super->used_page_mask_start = num_md_pages;
+ ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
+ spdk_divide_round_up(bs->md_len, 8),
+ SPDK_BS_PAGE_SIZE);
+ num_md_pages += ctx->super->used_page_mask_len;
+
+ /* The used_clusters mask requires 1 bit per cluster, rounded
+ * up to the nearest page, plus a header.
+ */
+ ctx->super->used_cluster_mask_start = num_md_pages;
+ ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
+ spdk_divide_round_up(bs->total_clusters, 8),
+ SPDK_BS_PAGE_SIZE);
+ num_md_pages += ctx->super->used_cluster_mask_len;
+
+ /* The used_blobids mask requires 1 bit per metadata page, rounded
+ * up to the nearest page, plus a header.
+ */
+ ctx->super->used_blobid_mask_start = num_md_pages;
+ ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
+ spdk_divide_round_up(bs->md_len, 8),
+ SPDK_BS_PAGE_SIZE);
+ num_md_pages += ctx->super->used_blobid_mask_len;
+
+ /* The metadata region size was chosen above */
+ ctx->super->md_start = bs->md_start = num_md_pages;
+ ctx->super->md_len = bs->md_len;
+ num_md_pages += bs->md_len;
+
+ num_md_lba = bs_page_to_lba(bs, num_md_pages);
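+ /* Everything below num_md_lba is metadata; the remainder of the device holds data clusters. */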
+
+ ctx->super->size = dev->blockcnt * dev->blocklen;
+
+ ctx->super->crc = blob_md_page_calc_crc(ctx->super);
+
+ num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster);
+ if (num_md_clusters > bs->total_clusters) {
+ SPDK_ERRLOG("Blobstore metadata cannot use more clusters than are available, "
+ "please decrease the number of pages reserved for metadata "
+ "or increase cluster size.\n");
+ spdk_free(ctx->super);
+ free(ctx);
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+ /* Claim all of the clusters used by the metadata */
+ for (i = 0; i < num_md_clusters; i++) {
+ bs_claim_cluster(bs, i);
+ }
+
+ bs->total_data_clusters = bs->num_free_clusters;
+
+ cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
+ cpl.u.bs_handle.cb_fn = cb_fn;
+ cpl.u.bs_handle.cb_arg = cb_arg;
+ cpl.u.bs_handle.bs = bs;
+
+ seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!seq) {
+ spdk_free(ctx->super);
+ free(ctx);
+ bs_free(bs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx);
+
+ /* Clear metadata space */
+ bs_batch_write_zeroes_dev(batch, 0, num_md_lba);
+
+ switch (opts.clear_method) {
+ case BS_CLEAR_WITH_UNMAP:
+ /* Trim data clusters */
+ bs_batch_unmap_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba);
+ break;
+ case BS_CLEAR_WITH_WRITE_ZEROES:
+ /* Write_zeroes to data clusters */
+ bs_batch_write_zeroes_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba);
+ break;
+ case BS_CLEAR_WITH_NONE:
+ default:
+ break;
+ }
+
+ bs_batch_close(batch);
+}
+
+/* END spdk_bs_init */
+
+/* START spdk_bs_destroy */
+
+static void
+bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_init_ctx *ctx = cb_arg;
+ struct spdk_blob_store *bs = ctx->bs;
+
+ /*
+ * We need to defer calling bs_call_cpl() until after
+ * dev destruction, so tuck these away for later use.
+ */
+ bs->unload_err = bserrno;
+ memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
+ seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
+
+ bs_sequence_finish(seq, bserrno);
+
+ bs_free(bs);
+ free(ctx);
+}
+
+void
+spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn,
+ void *cb_arg)
+{
+ struct spdk_bs_cpl cpl;
+ spdk_bs_sequence_t *seq;
+ struct spdk_bs_init_ctx *ctx;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Destroying blobstore\n");
+
+ if (!TAILQ_EMPTY(&bs->blobs)) {
+ SPDK_ERRLOG("Blobstore still has open blobs\n");
+ cb_fn(cb_arg, -EBUSY);
+ return;
+ }
+
+ cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
+ cpl.u.bs_basic.cb_fn = cb_fn;
+ cpl.u.bs_basic.cb_arg = cb_arg;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->bs = bs;
+
+ seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!seq) {
+ free(ctx);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ /* Write zeroes to the super block */
+ bs_sequence_write_zeroes_dev(seq,
+ bs_page_to_lba(bs, 0),
+ bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)),
+ bs_destroy_trim_cpl, ctx);
+}
+
+/* END spdk_bs_destroy */
+
+/* START spdk_bs_unload */
+
+static void
+bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno)
+{
+ spdk_bs_sequence_t *seq = ctx->seq;
+
+ spdk_free(ctx->super);
+
+ /*
+ * We need to defer calling bs_call_cpl() until after
+ * dev destruction, so tuck these away for later use.
+ */
+ ctx->bs->unload_err = bserrno;
+ memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
+ seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
+
+ bs_sequence_finish(seq, bserrno);
+
+ bs_free(ctx->bs);
+ free(ctx);
+}
+
+static void
+bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ bs_unload_finish(ctx, bserrno);
+}
+
+static void
+bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ spdk_free(ctx->mask);
+
+ if (bserrno != 0) {
+ bs_unload_finish(ctx, bserrno);
+ return;
+ }
+
+ ctx->super->clean = 1;
+
+ bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx);
+}
+
+static void
+bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ spdk_free(ctx->mask);
+ ctx->mask = NULL;
+
+ if (bserrno != 0) {
+ bs_unload_finish(ctx, bserrno);
+ return;
+ }
+
+ bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl);
+}
+
+static void
+bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ spdk_free(ctx->mask);
+ ctx->mask = NULL;
+
+ if (bserrno != 0) {
+ bs_unload_finish(ctx, bserrno);
+ return;
+ }
+
+ bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl);
+}
+
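+/* Unload writes out the in-memory masks (md pages, blobids, clusters),
+ * then marks the super block clean and writes it back.
+ */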
+static void
+bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_load_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ bs_unload_finish(ctx, bserrno);
+ return;
+ }
+
+ bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl);
+}
+
+void
+spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_cpl cpl;
+ struct spdk_bs_load_ctx *ctx;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blobstore\n");
+
+ if (!TAILQ_EMPTY(&bs->blobs)) {
+ SPDK_ERRLOG("Blobstore still has open blobs\n");
+ cb_fn(cb_arg, -EBUSY);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->bs = bs;
+
+ ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->super) {
+ free(ctx);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
+ cpl.u.bs_basic.cb_fn = cb_fn;
+ cpl.u.bs_basic.cb_arg = cb_arg;
+
+ ctx->seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!ctx->seq) {
+ spdk_free(ctx->super);
+ free(ctx);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ /* Read super block */
+ bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
+ bs_byte_to_lba(bs, sizeof(*ctx->super)),
+ bs_unload_read_super_cpl, ctx);
+}
+
+/* END spdk_bs_unload */
+
+/* START spdk_bs_set_super */
+
+struct spdk_bs_set_super_ctx {
+ struct spdk_blob_store *bs;
+ struct spdk_bs_super_block *super;
+};
+
+static void
+bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_set_super_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ SPDK_ERRLOG("Unable to write to super block of blobstore\n");
+ }
+
+ spdk_free(ctx->super);
+
+ bs_sequence_finish(seq, bserrno);
+
+ free(ctx);
+}
+
+static void
+bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_set_super_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ SPDK_ERRLOG("Unable to read super block of blobstore\n");
+ spdk_free(ctx->super);
+ bs_sequence_finish(seq, bserrno);
+ free(ctx);
+ return;
+ }
+
+ bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx);
+}
+
+void
+spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid,
+ spdk_bs_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_cpl cpl;
+ spdk_bs_sequence_t *seq;
+ struct spdk_bs_set_super_ctx *ctx;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Setting super blob id on blobstore\n");
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->bs = bs;
+
+ ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->super) {
+ free(ctx);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
+ cpl.u.bs_basic.cb_fn = cb_fn;
+ cpl.u.bs_basic.cb_arg = cb_arg;
+
+ seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!seq) {
+ spdk_free(ctx->super);
+ free(ctx);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ bs->super_blob = blobid;
+
+ /* Read super block */
+ bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0),
+ bs_byte_to_lba(bs, sizeof(*ctx->super)),
+ bs_set_super_read_cpl, ctx);
+}
+
+/* END spdk_bs_set_super */
+
+void
+spdk_bs_get_super(struct spdk_blob_store *bs,
+ spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
+{
+ if (bs->super_blob == SPDK_BLOBID_INVALID) {
+ cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT);
+ } else {
+ cb_fn(cb_arg, bs->super_blob, 0);
+ }
+}
+
+uint64_t
+spdk_bs_get_cluster_size(struct spdk_blob_store *bs)
+{
+ return bs->cluster_sz;
+}
+
+uint64_t
+spdk_bs_get_page_size(struct spdk_blob_store *bs)
+{
+ return SPDK_BS_PAGE_SIZE;
+}
+
+uint64_t
+spdk_bs_get_io_unit_size(struct spdk_blob_store *bs)
+{
+ return bs->io_unit_size;
+}
+
+uint64_t
+spdk_bs_free_cluster_count(struct spdk_blob_store *bs)
+{
+ return bs->num_free_clusters;
+}
+
+uint64_t
+spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs)
+{
+ return bs->total_data_clusters;
+}
+
+static int
+bs_register_md_thread(struct spdk_blob_store *bs)
+{
+ bs->md_channel = spdk_get_io_channel(bs);
+ if (!bs->md_channel) {
+ SPDK_ERRLOG("Failed to get IO channel.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+bs_unregister_md_thread(struct spdk_blob_store *bs)
+{
+ spdk_put_io_channel(bs->md_channel);
+
+ return 0;
+}
+
+spdk_blob_id spdk_blob_get_id(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+
+ return blob->id;
+}
+
+uint64_t spdk_blob_get_num_pages(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+
+ return bs_cluster_to_page(blob->bs, blob->active.num_clusters);
+}
+
+uint64_t spdk_blob_get_num_io_units(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+
+ return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs);
+}
+
+uint64_t spdk_blob_get_num_clusters(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+
+ return blob->active.num_clusters;
+}
+
+/* START spdk_bs_create_blob */
+
+static void
+bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob *blob = cb_arg;
+ uint32_t page_idx = bs_blobid_to_page(blob->id);
+
+ if (bserrno != 0) {
+ spdk_bit_array_clear(blob->bs->used_blobids, page_idx);
+ bs_release_md_page(blob->bs, page_idx);
+ }
+
+ blob_free(blob);
+
+ bs_sequence_finish(seq, bserrno);
+}
+
+static int
+blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs,
+ bool internal)
+{
+ uint64_t i;
+ size_t value_len = 0;
+ int rc;
+ const void *value = NULL;
+ if (xattrs->count > 0 && xattrs->get_value == NULL) {
+ return -EINVAL;
+ }
+ for (i = 0; i < xattrs->count; i++) {
+ xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len);
+ if (value == NULL || value_len == 0) {
+ return -EINVAL;
+ }
+ rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal);
+ if (rc < 0) {
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static void
+bs_create_blob(struct spdk_blob_store *bs,
+ const struct spdk_blob_opts *opts,
+ const struct spdk_blob_xattr_opts *internal_xattrs,
+ spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
+{
+ struct spdk_blob *blob;
+ uint32_t page_idx;
+ struct spdk_bs_cpl cpl;
+ struct spdk_blob_opts opts_default;
+ struct spdk_blob_xattr_opts internal_xattrs_default;
+ spdk_bs_sequence_t *seq;
+ spdk_blob_id id;
+ int rc;
+
+ assert(spdk_get_thread() == bs->md_thread);
+
+ page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0);
+ if (page_idx == UINT32_MAX) {
+ cb_fn(cb_arg, 0, -ENOMEM);
+ return;
+ }
+ spdk_bit_array_set(bs->used_blobids, page_idx);
+ bs_claim_md_page(bs, page_idx);
+
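+ /* The blob id is derived from the index of its first metadata page. */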
+ id = bs_page_to_blobid(page_idx);
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Creating blob with id %lu at page %u\n", id, page_idx);
+
+ blob = blob_alloc(bs, id);
+ if (!blob) {
+ spdk_bit_array_clear(bs->used_blobids, page_idx);
+ bs_release_md_page(bs, page_idx);
+ cb_fn(cb_arg, 0, -ENOMEM);
+ return;
+ }
+
+ if (!opts) {
+ spdk_blob_opts_init(&opts_default);
+ opts = &opts_default;
+ }
+
+ blob->use_extent_table = opts->use_extent_table;
+ if (blob->use_extent_table) {
+ blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE;
+ }
+
+ if (!internal_xattrs) {
+ blob_xattrs_init(&internal_xattrs_default);
+ internal_xattrs = &internal_xattrs_default;
+ }
+
+ rc = blob_set_xattrs(blob, &opts->xattrs, false);
+ if (rc < 0) {
+ blob_free(blob);
+ spdk_bit_array_clear(bs->used_blobids, page_idx);
+ bs_release_md_page(bs, page_idx);
+ cb_fn(cb_arg, 0, rc);
+ return;
+ }
+
+ rc = blob_set_xattrs(blob, internal_xattrs, true);
+ if (rc < 0) {
+ blob_free(blob);
+ spdk_bit_array_clear(bs->used_blobids, page_idx);
+ bs_release_md_page(bs, page_idx);
+ cb_fn(cb_arg, 0, rc);
+ return;
+ }
+
+ if (opts->thin_provision) {
+ blob_set_thin_provision(blob);
+ }
+
+ blob_set_clear_method(blob, opts->clear_method);
+
+ rc = blob_resize(blob, opts->num_clusters);
+ if (rc < 0) {
+ blob_free(blob);
+ spdk_bit_array_clear(bs->used_blobids, page_idx);
+ bs_release_md_page(bs, page_idx);
+ cb_fn(cb_arg, 0, rc);
+ return;
+ }
+ cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
+ cpl.u.blobid.cb_fn = cb_fn;
+ cpl.u.blobid.cb_arg = cb_arg;
+ cpl.u.blobid.blobid = blob->id;
+
+ seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!seq) {
+ blob_free(blob);
+ spdk_bit_array_clear(bs->used_blobids, page_idx);
+ bs_release_md_page(bs, page_idx);
+ cb_fn(cb_arg, 0, -ENOMEM);
+ return;
+ }
+
+ blob_persist(seq, blob, bs_create_blob_cpl, blob);
+}
+
+void spdk_bs_create_blob(struct spdk_blob_store *bs,
+ spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
+{
+ bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg);
+}
+
+void spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts,
+ spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
+{
+ bs_create_blob(bs, opts, NULL, cb_fn, cb_arg);
+}
+
+/* END spdk_bs_create_blob */
+
+/* START blob_cleanup */
+
+struct spdk_clone_snapshot_ctx {
+ struct spdk_bs_cpl cpl;
+ int bserrno;
+ bool frozen;
+
+ struct spdk_io_channel *channel;
+
+ /* Current cluster for inflate operation */
+ uint64_t cluster;
+
+ /* For inflation, force allocation of all unallocated clusters and remove
+ * thin-provisioning. Otherwise, only decouple the parent and keep the clone thin. */
+ bool allocate_all;
+
+ struct {
+ spdk_blob_id id;
+ struct spdk_blob *blob;
+ } original;
+ struct {
+ spdk_blob_id id;
+ struct spdk_blob *blob;
+ } new;
+
+ /* xattrs specified for snapshot/clones only. They have no impact on
+ * the original blob's xattrs. */
+ const struct spdk_blob_xattr_opts *xattrs;
+};
+
+static void
+bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = cb_arg;
+ struct spdk_bs_cpl *cpl = &ctx->cpl;
+
+ if (bserrno != 0) {
+ if (ctx->bserrno != 0) {
+ SPDK_ERRLOG("Cleanup error %d\n", bserrno);
+ } else {
+ ctx->bserrno = bserrno;
+ }
+ }
+
+ switch (cpl->type) {
+ case SPDK_BS_CPL_TYPE_BLOBID:
+ cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno);
+ break;
+ case SPDK_BS_CPL_TYPE_BLOB_BASIC:
+ cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno);
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ break;
+ }
+
+ free(ctx);
+}
+
+static void
+bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *origblob = ctx->original.blob;
+
+ if (bserrno != 0) {
+ if (ctx->bserrno != 0) {
+ SPDK_ERRLOG("Unfreeze error %d\n", bserrno);
+ } else {
+ ctx->bserrno = bserrno;
+ }
+ }
+
+ ctx->original.id = origblob->id;
+ origblob->locked_operation_in_progress = false;
+
+ spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx);
+}
+
+static void
+bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *origblob = ctx->original.blob;
+
+ if (bserrno != 0) {
+ if (ctx->bserrno != 0) {
+ SPDK_ERRLOG("Cleanup error %d\n", bserrno);
+ } else {
+ ctx->bserrno = bserrno;
+ }
+ }
+
+ if (ctx->frozen) {
+ /* Unfreeze any outstanding I/O */
+ blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx);
+ } else {
+ bs_snapshot_unfreeze_cpl(ctx, 0);
+ }
+}
+
+static void
+bs_clone_snapshot_newblob_cleanup(void *cb_arg, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *newblob = ctx->new.blob;
+
+ if (bserrno != 0) {
+ if (ctx->bserrno != 0) {
+ SPDK_ERRLOG("Cleanup error %d\n", bserrno);
+ } else {
+ ctx->bserrno = bserrno;
+ }
+ }
+
+ ctx->new.id = newblob->id;
+ spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+/* END blob_cleanup */
+
+/* START spdk_bs_create_snapshot */
+
+static void
+bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2)
+{
+ uint64_t *cluster_temp;
+ uint32_t *extent_page_temp;
+
+ cluster_temp = blob1->active.clusters;
+ blob1->active.clusters = blob2->active.clusters;
+ blob2->active.clusters = cluster_temp;
+
+ extent_page_temp = blob1->active.extent_pages;
+ blob1->active.extent_pages = blob2->active.extent_pages;
+ blob2->active.extent_pages = extent_page_temp;
+}
+
+static void
+bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *origblob = ctx->original.blob;
+ struct spdk_blob *newblob = ctx->new.blob;
+
+ if (bserrno != 0) {
+ bs_snapshot_swap_cluster_maps(newblob, origblob);
+ bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ /* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
+ bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
+ if (bserrno != 0) {
+ bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ bs_blob_list_add(ctx->original.blob);
+
+ spdk_blob_set_read_only(newblob);
+
+ /* sync snapshot metadata */
+ spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+static void
+bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *origblob = ctx->original.blob;
+ struct spdk_blob *newblob = ctx->new.blob;
+
+ if (bserrno != 0) {
+ /* return cluster map back to original */
+ bs_snapshot_swap_cluster_maps(newblob, origblob);
+
+ /* Newblob md sync failed. Valid clusters are only present in origblob.
+ * Since I/O is frozen on origblob, no changes to the zeroed-out cluster map should have occurred.
+ * Newblob needs to be reverted to its thin_provisioned state at creation to close properly. */
+ blob_set_thin_provision(newblob);
+ assert(spdk_mem_all_zero(newblob->active.clusters,
+ newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
+ assert(spdk_mem_all_zero(newblob->active.extent_pages,
+ newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));
+
+ bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ /* Set internal xattr for snapshot id */
+ bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true);
+ if (bserrno != 0) {
+ /* return cluster map back to original */
+ bs_snapshot_swap_cluster_maps(newblob, origblob);
+ bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ bs_blob_list_remove(origblob);
+ origblob->parent_id = newblob->id;
+
+ /* Create new back_bs_dev for snapshot */
+ origblob->back_bs_dev = bs_create_blob_bs_dev(newblob);
+ if (origblob->back_bs_dev == NULL) {
+ /* return cluster map back to original */
+ bs_snapshot_swap_cluster_maps(newblob, origblob);
+ bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL);
+ return;
+ }
+
+ /* set clone blob as thin provisioned */
+ blob_set_thin_provision(origblob);
+
+ bs_blob_list_add(newblob);
+
+ /* sync clone metadata */
+ spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx);
+}
+
+static void
+bs_snapshot_freeze_cpl(void *cb_arg, int rc)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *origblob = ctx->original.blob;
+ struct spdk_blob *newblob = ctx->new.blob;
+ int bserrno;
+
+ if (rc != 0) {
+ bs_clone_snapshot_newblob_cleanup(ctx, rc);
+ return;
+ }
+
+ ctx->frozen = true;
+
+ /* set new back_bs_dev for snapshot */
+ newblob->back_bs_dev = origblob->back_bs_dev;
+ /* Set invalid flags from origblob */
+ newblob->invalid_flags = origblob->invalid_flags;
+
+ /* inherit parent from original blob if set */
+ newblob->parent_id = origblob->parent_id;
+ if (origblob->parent_id != SPDK_BLOBID_INVALID) {
+ /* Set internal xattr for snapshot id */
+ bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT,
+ &origblob->parent_id, sizeof(spdk_blob_id), true);
+ if (bserrno != 0) {
+ bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
+ return;
+ }
+ }
+
+ /* Swap cluster maps: the snapshot takes over the clusters currently allocated
+ * to the original blob, leaving the original with an empty (zeroed) map. */
+ bs_snapshot_swap_cluster_maps(newblob, origblob);
+
+ /* Set the clear method on the new blob to match the original. */
+ blob_set_clear_method(newblob, origblob->clear_method);
+
+ /* sync snapshot metadata */
+ spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx);
+}
+
+static void
+bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *origblob = ctx->original.blob;
+ struct spdk_blob *newblob = _blob;
+
+ if (bserrno != 0) {
+ bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ ctx->new.blob = newblob;
+ assert(spdk_blob_is_thin_provisioned(newblob));
+ assert(spdk_mem_all_zero(newblob->active.clusters,
+ newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
+ assert(spdk_mem_all_zero(newblob->active.extent_pages,
+ newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));
+
+ blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx);
+}
+
+static void
+bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *origblob = ctx->original.blob;
+
+ if (bserrno != 0) {
+ bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ ctx->new.id = blobid;
+ ctx->cpl.u.blobid.blobid = blobid;
+
+ spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx);
+}
+
+static void
+bs_xattr_snapshot(void *arg, const char *name,
+ const void **value, size_t *value_len)
+{
+ assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0);
+
+ struct spdk_blob *blob = (struct spdk_blob *)arg;
+ *value = &blob->id;
+ *value_len = sizeof(blob->id);
+}
+
+static void
+bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob_opts opts;
+ struct spdk_blob_xattr_opts internal_xattrs;
+ char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS };
+
+ if (bserrno != 0) {
+ bs_clone_snapshot_cleanup_finish(ctx, bserrno);
+ return;
+ }
+
+ ctx->original.blob = _blob;
+
+ if (_blob->data_ro || _blob->md_ro) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot from read only blob with id %lu\n",
+ _blob->id);
+ ctx->bserrno = -EINVAL;
+ spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
+ return;
+ }
+
+ if (_blob->locked_operation_in_progress) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot - another operation in progress\n");
+ ctx->bserrno = -EBUSY;
+ spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
+ return;
+ }
+
+ _blob->locked_operation_in_progress = true;
+
+ spdk_blob_opts_init(&opts);
+ blob_xattrs_init(&internal_xattrs);
+
+ /* Set the new blob's size to match the original blob,
+ * but do not allocate any clusters */
+ opts.thin_provision = true;
+ opts.num_clusters = spdk_blob_get_num_clusters(_blob);
+ opts.use_extent_table = _blob->use_extent_table;
+
+ /* If there are any xattrs specified for snapshot, set them now */
+ if (ctx->xattrs) {
+ memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
+ }
+ /* Set internal xattr SNAPSHOT_IN_PROGRESS */
+ internal_xattrs.count = 1;
+ internal_xattrs.ctx = _blob;
+ internal_xattrs.names = xattrs_names;
+ internal_xattrs.get_value = bs_xattr_snapshot;
+
+ bs_create_blob(_blob->bs, &opts, &internal_xattrs,
+ bs_snapshot_newblob_create_cpl, ctx);
+}
+
+void spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid,
+ const struct spdk_blob_xattr_opts *snapshot_xattrs,
+ spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
+{
+ struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
+
+ if (!ctx) {
+ cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
+ return;
+ }
+ ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
+ ctx->cpl.u.blobid.cb_fn = cb_fn;
+ ctx->cpl.u.blobid.cb_arg = cb_arg;
+ ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
+ ctx->bserrno = 0;
+ ctx->frozen = false;
+ ctx->original.id = blobid;
+ ctx->xattrs = snapshot_xattrs;
+
+ spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx);
+}
+/* END spdk_bs_create_snapshot */
+
+/* START spdk_bs_create_clone */
+
+static void
+bs_xattr_clone(void *arg, const char *name,
+ const void **value, size_t *value_len)
+{
+ assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0);
+
+ struct spdk_blob *blob = (struct spdk_blob *)arg;
+ *value = &blob->id;
+ *value_len = sizeof(blob->id);
+}
+
+static void
+bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *clone = _blob;
+
+ ctx->new.blob = clone;
+ bs_blob_list_add(clone);
+
+ spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+static void
+bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+
+ ctx->cpl.u.blobid.blobid = blobid;
+ spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx);
+}
+
+static void
+bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob_opts opts;
+ struct spdk_blob_xattr_opts internal_xattrs;
+ char *xattr_names[] = { BLOB_SNAPSHOT };
+
+ if (bserrno != 0) {
+ bs_clone_snapshot_cleanup_finish(ctx, bserrno);
+ return;
+ }
+
+ ctx->original.blob = _blob;
+
+ if (!_blob->data_ro || !_blob->md_ro) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Clone not from read-only blob\n");
+ ctx->bserrno = -EINVAL;
+ spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
+ return;
+ }
+
+ if (_blob->locked_operation_in_progress) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create clone - another operation in progress\n");
+ ctx->bserrno = -EBUSY;
+ spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
+ return;
+ }
+
+ _blob->locked_operation_in_progress = true;
+
+ spdk_blob_opts_init(&opts);
+ blob_xattrs_init(&internal_xattrs);
+
+ opts.thin_provision = true;
+ opts.num_clusters = spdk_blob_get_num_clusters(_blob);
+ opts.use_extent_table = _blob->use_extent_table;
+ if (ctx->xattrs) {
+ memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
+ }
+
+ /* Set internal xattr BLOB_SNAPSHOT */
+ internal_xattrs.count = 1;
+ internal_xattrs.ctx = _blob;
+ internal_xattrs.names = xattr_names;
+ internal_xattrs.get_value = bs_xattr_clone;
+
+ bs_create_blob(_blob->bs, &opts, &internal_xattrs,
+ bs_clone_newblob_create_cpl, ctx);
+}
+
+void spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid,
+ const struct spdk_blob_xattr_opts *clone_xattrs,
+ spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
+{
+ struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
+
+ if (!ctx) {
+ cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
+ return;
+ }
+
+ ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
+ ctx->cpl.u.blobid.cb_fn = cb_fn;
+ ctx->cpl.u.blobid.cb_arg = cb_arg;
+ ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
+ ctx->bserrno = 0;
+ ctx->xattrs = clone_xattrs;
+ ctx->original.id = blobid;
+
+ spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx);
+}
+
+/* END spdk_bs_create_clone */
+
+/* START spdk_bs_inflate_blob */
+
+static void
+bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *_blob = ctx->original.blob;
+
+ if (bserrno != 0) {
+ bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ assert(_parent != NULL);
+
+ bs_blob_list_remove(_blob);
+ _blob->parent_id = _parent->id;
+ blob_set_xattr(_blob, BLOB_SNAPSHOT, &_blob->parent_id,
+ sizeof(spdk_blob_id), true);
+
+ _blob->back_bs_dev->destroy(_blob->back_bs_dev);
+ _blob->back_bs_dev = bs_create_blob_bs_dev(_parent);
+ bs_blob_list_add(_blob);
+
+ spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+static void
+bs_inflate_blob_done(void *cb_arg, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *_blob = ctx->original.blob;
+ struct spdk_blob *_parent;
+
+ if (bserrno != 0) {
+ bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ if (ctx->allocate_all) {
+ /* remove thin provisioning */
+ bs_blob_list_remove(_blob);
+ blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
+ _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV;
+ _blob->back_bs_dev->destroy(_blob->back_bs_dev);
+ _blob->back_bs_dev = NULL;
+ _blob->parent_id = SPDK_BLOBID_INVALID;
+ } else {
+ _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob;
+ if (_parent->parent_id != SPDK_BLOBID_INVALID) {
+ /* We must change the parent of the inflated blob */
+ spdk_bs_open_blob(_blob->bs, _parent->parent_id,
+ bs_inflate_blob_set_parent_cpl, ctx);
+ return;
+ }
+
+ bs_blob_list_remove(_blob);
+ blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
+ _blob->parent_id = SPDK_BLOBID_INVALID;
+ _blob->back_bs_dev->destroy(_blob->back_bs_dev);
+ _blob->back_bs_dev = bs_create_zeroes_dev();
+ }
+
+ _blob->state = SPDK_BLOB_STATE_DIRTY;
+ spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
+}
+
+/* Check if cluster needs allocation */
+static inline bool
+bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all)
+{
+ struct spdk_blob_bs_dev *b;
+
+ assert(blob != NULL);
+
+ if (blob->active.clusters[cluster] != 0) {
+ /* Cluster is already allocated */
+ return false;
+ }
+
+ if (blob->parent_id == SPDK_BLOBID_INVALID) {
+ /* Blob has no parent blob */
+ return allocate_all;
+ }
+
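+ /* Otherwise, allocation is needed only if the immediate parent has data for
+ * this cluster (or unconditionally when allocating all). */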
+ b = (struct spdk_blob_bs_dev *)blob->back_bs_dev;
+ return (allocate_all || b->blob->active.clusters[cluster] != 0);
+}
+
+static void
+bs_inflate_blob_touch_next(void *cb_arg, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ struct spdk_blob *_blob = ctx->original.blob;
+ uint64_t offset;
+
+ if (bserrno != 0) {
+ bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
+ return;
+ }
+
+ for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) {
+ if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) {
+ break;
+ }
+ }
+
+ if (ctx->cluster < _blob->active.num_clusters) {
+ offset = bs_cluster_to_lba(_blob->bs, ctx->cluster);
+
+ /* We may safely increment the cluster index before the write */
+ ctx->cluster++;
+
+ /* Use a zero-length write to touch the cluster */
+ spdk_blob_io_write(_blob, ctx->channel, NULL, offset, 0,
+ bs_inflate_blob_touch_next, ctx);
+ } else {
+ bs_inflate_blob_done(cb_arg, bserrno);
+ }
+}
+
+static void
+bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
+{
+ struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
+ uint64_t lfc; /* lowest free cluster */
+ uint64_t i;
+
+ if (bserrno != 0) {
+ bs_clone_snapshot_cleanup_finish(ctx, bserrno);
+ return;
+ }
+
+ ctx->original.blob = _blob;
+
+ if (_blob->locked_operation_in_progress) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot inflate blob - another operation in progress\n");
+ ctx->bserrno = -EBUSY;
+ spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
+ return;
+ }
+
+ _blob->locked_operation_in_progress = true;
+
+ if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) {
+ /* This blob has no parent, so we cannot decouple it. */
+ SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n");
+ bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL);
+ return;
+ }
+
+ if (spdk_blob_is_thin_provisioned(_blob) == false) {
+ /* This is not a thin provisioned blob. No need to inflate. */
+ bs_clone_snapshot_origblob_cleanup(ctx, 0);
+ return;
+ }
+
+ /* Do two passes - one to verify that we can obtain enough clusters
+ * and another to actually claim them.
+ */
+ lfc = 0;
+ for (i = 0; i < _blob->active.num_clusters; i++) {
+ if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) {
+ lfc = spdk_bit_array_find_first_clear(_blob->bs->used_clusters, lfc);
+ if (lfc == UINT32_MAX) {
+ /* No more free clusters. Cannot satisfy the request */
+ bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC);
+ return;
+ }
+ lfc++;
+ }
+ }
+
+ ctx->cluster = 0;
+ bs_inflate_blob_touch_next(ctx, 0);
+}
+
+static void
+bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
+ spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
+
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ ctx->cpl.u.bs_basic.cb_fn = cb_fn;
+ ctx->cpl.u.bs_basic.cb_arg = cb_arg;
+ ctx->bserrno = 0;
+ ctx->original.id = blobid;
+ ctx->channel = channel;
+ ctx->allocate_all = allocate_all;
+
+ spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx);
+}
+
+void
+spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
+ spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg);
+}
+
+void
+spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
+ spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg);
+}
+/* END spdk_bs_inflate_blob */
+
+/* START spdk_blob_resize */
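+/* Resize flow: freeze I/O on the blob, change the in-memory cluster map with
+ * blob_resize(), then unfreeze and report the result. The new size is written
+ * out on the blob's next metadata sync. */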
+struct spdk_bs_resize_ctx {
+ spdk_blob_op_complete cb_fn;
+ void *cb_arg;
+ struct spdk_blob *blob;
+ uint64_t sz;
+ int rc;
+};
+
+static void
+bs_resize_unfreeze_cpl(void *cb_arg, int rc)
+{
+ struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
+
+ if (rc != 0) {
+ SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc);
+ }
+
+ if (ctx->rc != 0) {
+ SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc);
+ rc = ctx->rc;
+ }
+
+ ctx->blob->locked_operation_in_progress = false;
+
+ ctx->cb_fn(ctx->cb_arg, rc);
+ free(ctx);
+}
+
+static void
+bs_resize_freeze_cpl(void *cb_arg, int rc)
+{
+ struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
+
+ if (rc != 0) {
+ ctx->blob->locked_operation_in_progress = false;
+ ctx->cb_fn(ctx->cb_arg, rc);
+ free(ctx);
+ return;
+ }
+
+ ctx->rc = blob_resize(ctx->blob, ctx->sz);
+
+ blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx);
+}
+
+void
+spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_resize_ctx *ctx;
+
+ blob_verify_md_op(blob);
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Resizing blob %lu to %lu clusters\n", blob->id, sz);
+
+ if (blob->md_ro) {
+ cb_fn(cb_arg, -EPERM);
+ return;
+ }
+
+ if (sz == blob->active.num_clusters) {
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ if (blob->locked_operation_in_progress) {
+ cb_fn(cb_arg, -EBUSY);
+ return;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ blob->locked_operation_in_progress = true;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ ctx->blob = blob;
+ ctx->sz = sz;
+ blob_freeze_io(blob, bs_resize_freeze_cpl, ctx);
+}
+
+/* END spdk_blob_resize */
+
+
+/* START spdk_bs_delete_blob */
+
+static void
+bs_delete_close_cpl(void *cb_arg, int bserrno)
+{
+ spdk_bs_sequence_t *seq = cb_arg;
+
+ bs_sequence_finish(seq, bserrno);
+}
+
+static void
+bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob *blob = cb_arg;
+
+ if (bserrno != 0) {
+ /*
+ * We already removed this blob from the blobstore tailq, so
+ * we need to free it here since this is the last reference
+ * to it.
+ */
+ blob_free(blob);
+ bs_delete_close_cpl(seq, bserrno);
+ return;
+ }
+
+ /*
+ * This will immediately decrement the ref_count and call
+ * the completion routine since the metadata state is clean.
+ * By calling spdk_blob_close, we reduce the number of call
+ * points into code that touches the blob->open_ref count
+ * and the blobstore's blob list.
+ */
+ spdk_blob_close(blob, bs_delete_close_cpl, seq);
+}
+
+struct delete_snapshot_ctx {
+ struct spdk_blob_list *parent_snapshot_entry;
+ struct spdk_blob *snapshot;
+ bool snapshot_md_ro;
+ struct spdk_blob *clone;
+ bool clone_md_ro;
+ spdk_blob_op_with_handle_complete cb_fn;
+ void *cb_arg;
+ int bserrno;
+};
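+/* Deleting a snapshot that has exactly one clone is handled by the callback
+ * chain below (defined bottom-up): open the clone and freeze its I/O, mark the
+ * snapshot with SNAPSHOT_PENDING_REMOVAL and sync it, hand the snapshot's
+ * cluster and extent-page map over to the clone and re-parent the clone (onto
+ * the snapshot's own parent, or onto the zeroes device), sync the clone, clear
+ * the transferred entries from the snapshot and sync it again, then unfreeze
+ * and close the clone before finally deleting the snapshot itself. */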
+
+static void
+delete_blob_cleanup_finish(void *cb_arg, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno);
+ }
+
+ assert(ctx != NULL);
+
+ if (bserrno != 0 && ctx->bserrno == 0) {
+ ctx->bserrno = bserrno;
+ }
+
+ ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno);
+ free(ctx);
+}
+
+static void
+delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+
+ if (bserrno != 0) {
+ ctx->bserrno = bserrno;
+ SPDK_ERRLOG("Clone cleanup error %d\n", bserrno);
+ }
+
+ if (ctx->bserrno != 0) {
+ assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL);
+ TAILQ_INSERT_HEAD(&ctx->snapshot->bs->blobs, ctx->snapshot, link);
+ spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id);
+ }
+
+ ctx->snapshot->locked_operation_in_progress = false;
+ ctx->snapshot->md_ro = ctx->snapshot_md_ro;
+
+ spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx);
+}
+
+static void
+delete_snapshot_cleanup_clone(void *cb_arg, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+
+ ctx->clone->locked_operation_in_progress = false;
+ ctx->clone->md_ro = ctx->clone_md_ro;
+
+ spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx);
+}
+
+static void
+delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+
+ if (bserrno) {
+ ctx->bserrno = bserrno;
+ delete_snapshot_cleanup_clone(ctx, 0);
+ return;
+ }
+
+ ctx->clone->locked_operation_in_progress = false;
+ spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx);
+}
+
+static void
+delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+ struct spdk_blob_list *parent_snapshot_entry = NULL;
+ struct spdk_blob_list *snapshot_entry = NULL;
+ struct spdk_blob_list *clone_entry = NULL;
+ struct spdk_blob_list *snapshot_clone_entry = NULL;
+
+ if (bserrno) {
+ SPDK_ERRLOG("Failed to sync MD on blob\n");
+ ctx->bserrno = bserrno;
+ delete_snapshot_cleanup_clone(ctx, 0);
+ return;
+ }
+
+ /* Get snapshot entry for the snapshot we want to remove */
+ snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id);
+
+ assert(snapshot_entry != NULL);
+
+ /* Remove clone entry in this snapshot (at this point there can be only one clone) */
+ clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
+ assert(clone_entry != NULL);
+ TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
+ snapshot_entry->clone_count--;
+ assert(TAILQ_EMPTY(&snapshot_entry->clones));
+
+ if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) {
+ /* This snapshot is at the same time a clone of another snapshot - we need to
+ * update parent snapshot (remove current clone, add new one inherited from
+ * the snapshot that is being removed) */
+
+ /* Get snapshot entry for parent snapshot and clone entry within that snapshot for
+ * snapshot that we are removing */
+ blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry,
+ &snapshot_clone_entry);
+
+ /* Switch clone entry in parent snapshot */
+ TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link);
+ TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link);
+ free(snapshot_clone_entry);
+ } else {
+ /* No parent snapshot - just remove clone entry */
+ free(clone_entry);
+ }
+
+ /* Restore md_ro flags */
+ ctx->clone->md_ro = ctx->clone_md_ro;
+ ctx->snapshot->md_ro = ctx->snapshot_md_ro;
+
+ blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx);
+}
+
+static void
+delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+ uint64_t i;
+
+ ctx->snapshot->md_ro = false;
+
+ if (bserrno) {
+ SPDK_ERRLOG("Failed to sync MD on clone\n");
+ ctx->bserrno = bserrno;
+
+ /* Restore snapshot to previous state */
+ bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true);
+ if (bserrno != 0) {
+ delete_snapshot_cleanup_clone(ctx, bserrno);
+ return;
+ }
+
+ spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx);
+ return;
+ }
+
+ /* Clear cluster map entries for snapshot */
+ for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
+ if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) {
+ ctx->snapshot->active.clusters[i] = 0;
+ }
+ }
+ for (i = 0; i < ctx->snapshot->active.num_extent_pages &&
+ i < ctx->clone->active.num_extent_pages; i++) {
+ if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) {
+ ctx->snapshot->active.extent_pages[i] = 0;
+ }
+ }
+
+ blob_set_thin_provision(ctx->snapshot);
+ ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY;
+
+ if (ctx->parent_snapshot_entry != NULL) {
+ ctx->snapshot->back_bs_dev = NULL;
+ }
+
+ spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx);
+}
+
+static void
+delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+ uint64_t i;
+
+ /* Temporarily override md_ro flag for clone for MD modification */
+ ctx->clone_md_ro = ctx->clone->md_ro;
+ ctx->clone->md_ro = false;
+
+ if (bserrno) {
+ SPDK_ERRLOG("Failed to sync MD with xattr on blob\n");
+ ctx->bserrno = bserrno;
+ delete_snapshot_cleanup_clone(ctx, 0);
+ return;
+ }
+
+ /* Copy snapshot map to clone map (only unallocated clusters in clone) */
+ for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
+ if (ctx->clone->active.clusters[i] == 0) {
+ ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i];
+ }
+ }
+ for (i = 0; i < ctx->snapshot->active.num_extent_pages &&
+ i < ctx->clone->active.num_extent_pages; i++) {
+ if (ctx->clone->active.extent_pages[i] == 0) {
+ ctx->clone->active.extent_pages[i] = ctx->snapshot->active.extent_pages[i];
+ }
+ }
+
+ /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */
+ ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev);
+
+ /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */
+ if (ctx->parent_snapshot_entry != NULL) {
+ /* ...to parent snapshot */
+ ctx->clone->parent_id = ctx->parent_snapshot_entry->id;
+ ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev;
+ blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id,
+ sizeof(spdk_blob_id),
+ true);
+ } else {
+ /* ...to blobid invalid and zeroes dev */
+ ctx->clone->parent_id = SPDK_BLOBID_INVALID;
+ ctx->clone->back_bs_dev = bs_create_zeroes_dev();
+ blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true);
+ }
+
+ spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx);
+}
+
+static void
+delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+
+ if (bserrno) {
+ SPDK_ERRLOG("Failed to freeze I/O on clone\n");
+ ctx->bserrno = bserrno;
+ delete_snapshot_cleanup_clone(ctx, 0);
+ return;
+ }
+
+ /* Temporarily override md_ro flag for snapshot for MD modification */
+ ctx->snapshot_md_ro = ctx->snapshot->md_ro;
+ ctx->snapshot->md_ro = false;
+
+ /* Mark blob as pending for removal for power failure safety, use clone id for recovery */
+ ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id,
+ sizeof(spdk_blob_id), true);
+ if (ctx->bserrno != 0) {
+ delete_snapshot_cleanup_clone(ctx, 0);
+ return;
+ }
+
+ spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx);
+}
+
+static void
+delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno)
+{
+ struct delete_snapshot_ctx *ctx = cb_arg;
+
+ if (bserrno) {
+ SPDK_ERRLOG("Failed to open clone\n");
+ ctx->bserrno = bserrno;
+ delete_snapshot_cleanup_snapshot(ctx, 0);
+ return;
+ }
+
+ ctx->clone = clone;
+
+ if (clone->locked_operation_in_progress) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot remove blob - another operation in progress on its clone\n");
+ ctx->bserrno = -EBUSY;
+ spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx);
+ return;
+ }
+
+ clone->locked_operation_in_progress = true;
+
+ blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx);
+}
+
+static void
+update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx)
+{
+ struct spdk_blob_list *snapshot_entry = NULL;
+ struct spdk_blob_list *clone_entry = NULL;
+ struct spdk_blob_list *snapshot_clone_entry = NULL;
+
+ /* Get snapshot entry for the snapshot we want to remove */
+ snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id);
+
+ assert(snapshot_entry != NULL);
+
+ /* Get clone of the snapshot (at this point there can be only one clone) */
+ clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
+ assert(snapshot_entry->clone_count == 1);
+ assert(clone_entry != NULL);
+
+ /* Get snapshot entry for parent snapshot and clone entry within that snapshot for
+ * snapshot that we are removing */
+ blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry,
+ &snapshot_clone_entry);
+
+ spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx);
+}
+
+static void
+bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno)
+{
+ spdk_bs_sequence_t *seq = cb_arg;
+ struct spdk_blob_list *snapshot_entry = NULL;
+ uint32_t page_num;
+
+ if (bserrno) {
+ SPDK_ERRLOG("Failed to remove blob\n");
+ bs_sequence_finish(seq, bserrno);
+ return;
+ }
+
+ /* Remove snapshot from the list */
+ snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
+ if (snapshot_entry != NULL) {
+ TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link);
+ free(snapshot_entry);
+ }
+
+ page_num = bs_blobid_to_page(blob->id);
+ spdk_bit_array_clear(blob->bs->used_blobids, page_num);
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+ blob->active.num_pages = 0;
+ blob_resize(blob, 0);
+
+ blob_persist(seq, blob, bs_delete_persist_cpl, blob);
+}
+
+static int
+bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone)
+{
+ struct spdk_blob_list *snapshot_entry = NULL;
+ struct spdk_blob_list *clone_entry = NULL;
+ struct spdk_blob *clone = NULL;
+ bool has_one_clone = false;
+
+ /* Check if this is a snapshot with clones */
+ snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
+ if (snapshot_entry != NULL) {
+ if (snapshot_entry->clone_count > 1) {
+ SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n");
+ return -EBUSY;
+ } else if (snapshot_entry->clone_count == 1) {
+ has_one_clone = true;
+ }
+ }
+
+ /* Check if someone has this blob open (besides this delete context):
+ * - open_ref = 1 - only this context opened blob, so it is ok to remove it
+ * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot
+ * and that is ok, because we will update it accordingly */
+ if (blob->open_ref <= 2 && has_one_clone) {
+ clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
+ assert(clone_entry != NULL);
+ clone = blob_lookup(blob->bs, clone_entry->id);
+
+ if (blob->open_ref == 2 && clone == NULL) {
+ /* Clone is closed and someone else opened this blob */
+ SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
+ return -EBUSY;
+ }
+
+ *update_clone = true;
+ return 0;
+ }
+
+ if (blob->open_ref > 1) {
+ SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
+ return -EBUSY;
+ }
+
+ assert(has_one_clone == false);
+ *update_clone = false;
+ return 0;
+}
+
+static void
+bs_delete_enomem_close_cpl(void *cb_arg, int bserrno)
+{
+ spdk_bs_sequence_t *seq = cb_arg;
+
+ bs_sequence_finish(seq, -ENOMEM);
+}
+
+static void
+bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno)
+{
+ spdk_bs_sequence_t *seq = cb_arg;
+ struct delete_snapshot_ctx *ctx;
+ bool update_clone = false;
+
+ if (bserrno != 0) {
+ bs_sequence_finish(seq, bserrno);
+ return;
+ }
+
+ blob_verify_md_op(blob);
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq);
+ return;
+ }
+
+ ctx->snapshot = blob;
+ ctx->cb_fn = bs_delete_blob_finish;
+ ctx->cb_arg = seq;
+
+ /* Check if blob can be removed and if it is a snapshot with clone on top of it */
+ ctx->bserrno = bs_is_blob_deletable(blob, &update_clone);
+ if (ctx->bserrno) {
+ spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
+ return;
+ }
+
+ if (blob->locked_operation_in_progress) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot remove blob - another operation in progress\n");
+ ctx->bserrno = -EBUSY;
+ spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
+ return;
+ }
+
+ blob->locked_operation_in_progress = true;
+
+ /*
+ * Remove the blob from the blob_store list now, to ensure it does not
+ * get returned after this point by blob_lookup().
+ */
+ spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
+ TAILQ_REMOVE(&blob->bs->blobs, blob, link);
+
+ if (update_clone) {
+ /* This blob is a snapshot with active clone - update clone first */
+ update_clone_on_snapshot_deletion(blob, ctx);
+ } else {
+ /* This blob does not have any clones - just remove it */
+ bs_blob_list_remove(blob);
+ bs_delete_blob_finish(seq, blob, 0);
+ free(ctx);
+ }
+}
+
+void
+spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
+ spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_cpl cpl;
+ spdk_bs_sequence_t *seq;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Deleting blob %lu\n", blobid);
+
+ assert(spdk_get_thread() == bs->md_thread);
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ cpl.u.blob_basic.cb_fn = cb_fn;
+ cpl.u.blob_basic.cb_arg = cb_arg;
+
+ seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!seq) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq);
+}
+
+/* END spdk_bs_delete_blob */
+
+/* START spdk_bs_open_blob */
+
+static void
+bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob *blob = cb_arg;
+
+ if (bserrno != 0) {
+ blob_free(blob);
+ seq->cpl.u.blob_handle.blob = NULL;
+ bs_sequence_finish(seq, bserrno);
+ return;
+ }
+
+ blob->open_ref++;
+
+ spdk_bit_array_set(blob->bs->open_blobids, blob->id);
+ TAILQ_INSERT_HEAD(&blob->bs->blobs, blob, link);
+
+ bs_sequence_finish(seq, bserrno);
+}
+
+static void
+bs_open_blob(struct spdk_blob_store *bs,
+ spdk_blob_id blobid,
+ struct spdk_blob_open_opts *opts,
+ spdk_blob_op_with_handle_complete cb_fn,
+ void *cb_arg)
+{
+ struct spdk_blob *blob;
+ struct spdk_bs_cpl cpl;
+ struct spdk_blob_open_opts opts_default;
+ spdk_bs_sequence_t *seq;
+ uint32_t page_num;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Opening blob %lu\n", blobid);
+ assert(spdk_get_thread() == bs->md_thread);
+
+ page_num = bs_blobid_to_page(blobid);
+ if (spdk_bit_array_get(bs->used_blobids, page_num) == false) {
+ /* Invalid blobid */
+ cb_fn(cb_arg, NULL, -ENOENT);
+ return;
+ }
+
+ blob = blob_lookup(bs, blobid);
+ if (blob) {
+ blob->open_ref++;
+ cb_fn(cb_arg, blob, 0);
+ return;
+ }
+
+ blob = blob_alloc(bs, blobid);
+ if (!blob) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ if (!opts) {
+ spdk_blob_open_opts_init(&opts_default);
+ opts = &opts_default;
+ }
+
+ blob->clear_method = opts->clear_method;
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE;
+ cpl.u.blob_handle.cb_fn = cb_fn;
+ cpl.u.blob_handle.cb_arg = cb_arg;
+ cpl.u.blob_handle.blob = blob;
+
+ seq = bs_sequence_start(bs->md_channel, &cpl);
+ if (!seq) {
+ blob_free(blob);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ blob_load(seq, blob, bs_open_blob_cpl, blob);
+}
+
+void spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
+ spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg);
+}
+
+void spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid,
+ struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ bs_open_blob(bs, blobid, opts, cb_fn, cb_arg);
+}
+
+/* END spdk_bs_open_blob */
+
+/* START spdk_blob_set_read_only */
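+/* Note: this only marks the metadata dirty with the READ_ONLY flag set; the
+ * blob actually becomes data_ro/md_ro when the metadata is next persisted
+ * (see blob_sync_md_cpl() below). */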
+int spdk_blob_set_read_only(struct spdk_blob *blob)
+{
+ blob_verify_md_op(blob);
+
+ blob->data_ro_flags |= SPDK_BLOB_READ_ONLY;
+
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+ return 0;
+}
+/* END spdk_blob_set_read_only */
+
+/* START spdk_blob_sync_md */
+
+static void
+blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob *blob = cb_arg;
+
+ if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
+ blob->data_ro = true;
+ blob->md_ro = true;
+ }
+
+ bs_sequence_finish(seq, bserrno);
+}
+
+static void
+blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_cpl cpl;
+ spdk_bs_sequence_t *seq;
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ cpl.u.blob_basic.cb_fn = cb_fn;
+ cpl.u.blob_basic.cb_arg = cb_arg;
+
+ seq = bs_sequence_start(blob->bs->md_channel, &cpl);
+ if (!seq) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ blob_persist(seq, blob, blob_sync_md_cpl, blob);
+}
+
+void
+spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ blob_verify_md_op(blob);
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blob %lu\n", blob->id);
+
+ if (blob->md_ro) {
+ assert(blob->state == SPDK_BLOB_STATE_CLEAN);
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ blob_sync_md(blob, cb_fn, cb_arg);
+}
+
+/* END spdk_blob_sync_md */
+
+struct spdk_blob_insert_cluster_ctx {
+ struct spdk_thread *thread;
+ struct spdk_blob *blob;
+ uint32_t cluster_num; /* cluster index in blob */
+ uint32_t cluster; /* cluster on disk */
+ uint32_t extent_page; /* extent page on disk */
+ int rc;
+ spdk_blob_op_complete cb_fn;
+ void *cb_arg;
+};
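+/* Cluster map updates are serialized on the blobstore metadata thread: the
+ * I/O thread sends blob_insert_cluster_msg() to bs->md_thread, and the result
+ * is bounced back to the originating thread via blob_insert_cluster_msg_cpl()
+ * so the caller's completion runs where the request was issued. */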
+
+static void
+blob_insert_cluster_msg_cpl(void *arg)
+{
+ struct spdk_blob_insert_cluster_ctx *ctx = arg;
+
+ ctx->cb_fn(ctx->cb_arg, ctx->rc);
+ free(ctx);
+}
+
+static void
+blob_insert_cluster_msg_cb(void *arg, int bserrno)
+{
+ struct spdk_blob_insert_cluster_ctx *ctx = arg;
+
+ ctx->rc = bserrno;
+ spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx);
+}
+
+static void
+blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob_md_page *page = cb_arg;
+
+ bs_sequence_finish(seq, bserrno);
+ spdk_free(page);
+}
+
+static void
+blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
+ spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ spdk_bs_sequence_t *seq;
+ struct spdk_bs_cpl cpl;
+ struct spdk_blob_md_page *page = NULL;
+ uint32_t page_count = 0;
+ int rc;
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ cpl.u.blob_basic.cb_fn = cb_fn;
+ cpl.u.blob_basic.cb_arg = cb_arg;
+
+ seq = bs_sequence_start(blob->bs->md_channel, &cpl);
+ if (!seq) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ rc = blob_serialize_add_page(blob, &page, &page_count, &page);
+ if (rc < 0) {
+ bs_sequence_finish(seq, rc);
+ return;
+ }
+
+ blob_serialize_extent_page(blob, cluster_num, page);
+
+ page->crc = blob_md_page_calc_crc(page);
+
+ assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true);
+
+ bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent),
+ bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
+ blob_persist_extent_page_cpl, page);
+}
+
+static void
+blob_insert_cluster_msg(void *arg)
+{
+ struct spdk_blob_insert_cluster_ctx *ctx = arg;
+ uint32_t *extent_page;
+
+ ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster);
+ if (ctx->rc != 0) {
+ spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx);
+ return;
+ }
+
+ if (ctx->blob->use_extent_table == false) {
+ /* Extent table is not used, proceed with sync of md that will only use extents_rle. */
+ ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
+ blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx);
+ return;
+ }
+
+ extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);
+ if (*extent_page == 0) {
+ /* Extent page requires allocation.
+ * It was already claimed in the used_md_pages map and placed in ctx.
+ * Blob persist will take care of writing out new extent page on disk. */
+ assert(ctx->extent_page != 0);
+ assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
+ *extent_page = ctx->extent_page;
+ ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
+ blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx);
+ } else {
+ /* It is possible for the original thread to have allocated an extent page
+ * for a different cluster within the same extent page. In that case proceed
+ * with updating the existing extent page, but release the additional one. */
+ if (ctx->extent_page != 0) {
+ assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
+ bs_release_md_page(ctx->blob->bs, ctx->extent_page);
+ ctx->extent_page = 0;
+ }
+ /* Extent page already allocated.
+ * Every cluster allocation then requires just an update of a single extent page. */
+ blob_insert_extent(ctx->blob, *extent_page, ctx->cluster_num,
+ blob_insert_cluster_msg_cb, ctx);
+ }
+}
+
+static void
+blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
+ uint64_t cluster, uint32_t extent_page, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_blob_insert_cluster_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->thread = spdk_get_thread();
+ ctx->blob = blob;
+ ctx->cluster_num = cluster_num;
+ ctx->cluster = cluster;
+ ctx->extent_page = extent_page;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx);
+}
+
+/* START spdk_blob_close */
+
+static void
+blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
+{
+ struct spdk_blob *blob = cb_arg;
+
+ if (bserrno == 0) {
+ blob->open_ref--;
+ if (blob->open_ref == 0) {
+ /*
+ * Blobs with active.num_pages == 0 are deleted blobs.
+ * These blobs are removed from the blob_store list
+ * when the deletion process starts - so don't try to
+ * remove them again.
+ */
+ if (blob->active.num_pages > 0) {
+ spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
+ TAILQ_REMOVE(&blob->bs->blobs, blob, link);
+ }
+ blob_free(blob);
+ }
+ }
+
+ bs_sequence_finish(seq, bserrno);
+}
+
+void spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_cpl cpl;
+ spdk_bs_sequence_t *seq;
+
+ blob_verify_md_op(blob);
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Closing blob %lu\n", blob->id);
+
+ if (blob->open_ref == 0) {
+ cb_fn(cb_arg, -EBADF);
+ return;
+ }
+
+ cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
+ cpl.u.blob_basic.cb_fn = cb_fn;
+ cpl.u.blob_basic.cb_arg = cb_arg;
+
+ seq = bs_sequence_start(blob->bs->md_channel, &cpl);
+ if (!seq) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ /* Sync metadata */
+ blob_persist(seq, blob, blob_close_cpl, blob);
+}
+
+/* END spdk_blob_close */
+
+struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs)
+{
+ return spdk_get_io_channel(bs);
+}
+
+void spdk_bs_free_io_channel(struct spdk_io_channel *channel)
+{
+ spdk_put_io_channel(channel);
+}
+
+void spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel,
+ uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
+ SPDK_BLOB_UNMAP);
+}
+
+void spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel,
+ uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
+ SPDK_BLOB_WRITE_ZEROES);
+}
+
+void spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel,
+ void *payload, uint64_t offset, uint64_t length,
+ spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
+ SPDK_BLOB_WRITE);
+}
+
+void spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel,
+ void *payload, uint64_t offset, uint64_t length,
+ spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
+ SPDK_BLOB_READ);
+}
+
+void spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel,
+ struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
+ spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false);
+}
+
+void spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel,
+ struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
+ spdk_blob_op_complete cb_fn, void *cb_arg)
+{
+ blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true);
+}
+
+struct spdk_bs_iter_ctx {
+ int64_t page_num;
+ struct spdk_blob_store *bs;
+
+ spdk_blob_op_with_handle_complete cb_fn;
+ void *cb_arg;
+};
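+/* Blob iteration walks the used_blobids bit array: each step finds the next
+ * set bit past ctx->page_num and opens that blob; spdk_bs_iter_next() first
+ * closes the blob returned by the previous step. Iteration ends with -ENOENT
+ * once the bit array is exhausted. */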
+
+static void
+bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
+{
+ struct spdk_bs_iter_ctx *ctx = cb_arg;
+ struct spdk_blob_store *bs = ctx->bs;
+ spdk_blob_id id;
+
+ if (bserrno == 0) {
+ ctx->cb_fn(ctx->cb_arg, _blob, bserrno);
+ free(ctx);
+ return;
+ }
+
+ ctx->page_num++;
+ ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num);
+ if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) {
+ ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT);
+ free(ctx);
+ return;
+ }
+
+ id = bs_page_to_blobid(ctx->page_num);
+
+ spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx);
+}
+
+void
+spdk_bs_iter_first(struct spdk_blob_store *bs,
+ spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_iter_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ ctx->page_num = -1;
+ ctx->bs = bs;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ bs_iter_cpl(ctx, NULL, -1);
+}
+
+static void
+bs_iter_close_cpl(void *cb_arg, int bserrno)
+{
+ struct spdk_bs_iter_ctx *ctx = cb_arg;
+
+ bs_iter_cpl(ctx, NULL, -1);
+}
+
+void
+spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob,
+ spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_bs_iter_ctx *ctx;
+
+ assert(blob != NULL);
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ ctx->page_num = bs_blobid_to_page(blob->id);
+ ctx->bs = bs;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ /* Close the existing blob */
+ spdk_blob_close(blob, bs_iter_close_cpl, ctx);
+}
+
+static int
+blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
+ uint16_t value_len, bool internal)
+{
+ struct spdk_xattr_tailq *xattrs;
+ struct spdk_xattr *xattr;
+ size_t desc_size;
+ void *tmp;
+
+ blob_verify_md_op(blob);
+
+ if (blob->md_ro) {
+ return -EPERM;
+ }
+
+ desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len;
+ if (desc_size > SPDK_BS_MAX_DESC_SIZE) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Xattr '%s' of size %ld does not fix into single page %ld\n", name,
+ desc_size, SPDK_BS_MAX_DESC_SIZE);
+ return -ENOMEM;
+ }
+
+ if (internal) {
+ xattrs = &blob->xattrs_internal;
+ blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
+ } else {
+ xattrs = &blob->xattrs;
+ }
+
+ TAILQ_FOREACH(xattr, xattrs, link) {
+ if (!strcmp(name, xattr->name)) {
+ tmp = malloc(value_len);
+ if (!tmp) {
+ return -ENOMEM;
+ }
+
+ free(xattr->value);
+ xattr->value_len = value_len;
+ xattr->value = tmp;
+ memcpy(xattr->value, value, value_len);
+
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+
+ return 0;
+ }
+ }
+
+ xattr = calloc(1, sizeof(*xattr));
+ if (!xattr) {
+ return -ENOMEM;
+ }
+
+ xattr->name = strdup(name);
+ if (!xattr->name) {
+ free(xattr);
+ return -ENOMEM;
+ }
+
+ xattr->value_len = value_len;
+ xattr->value = malloc(value_len);
+ if (!xattr->value) {
+ free(xattr->name);
+ free(xattr);
+ return -ENOMEM;
+ }
+ memcpy(xattr->value, value, value_len);
+ TAILQ_INSERT_TAIL(xattrs, xattr, link);
+
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+
+ return 0;
+}
+
+int
+spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
+ uint16_t value_len)
+{
+ return blob_set_xattr(blob, name, value, value_len, false);
+}
+
+static int
+blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal)
+{
+ struct spdk_xattr_tailq *xattrs;
+ struct spdk_xattr *xattr;
+
+ blob_verify_md_op(blob);
+
+ if (blob->md_ro) {
+ return -EPERM;
+ }
+ xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
+
+ TAILQ_FOREACH(xattr, xattrs, link) {
+ if (!strcmp(name, xattr->name)) {
+ TAILQ_REMOVE(xattrs, xattr, link);
+ free(xattr->value);
+ free(xattr->name);
+ free(xattr);
+
+ if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) {
+ blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR;
+ }
+ blob->state = SPDK_BLOB_STATE_DIRTY;
+
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+int
+spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name)
+{
+ return blob_remove_xattr(blob, name, false);
+}
+
+static int
+blob_get_xattr_value(struct spdk_blob *blob, const char *name,
+ const void **value, size_t *value_len, bool internal)
+{
+ struct spdk_xattr *xattr;
+ struct spdk_xattr_tailq *xattrs;
+
+ xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
+
+ TAILQ_FOREACH(xattr, xattrs, link) {
+ if (!strcmp(name, xattr->name)) {
+ *value = xattr->value;
+ *value_len = xattr->value_len;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+int
+spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
+ const void **value, size_t *value_len)
+{
+ blob_verify_md_op(blob);
+
+ return blob_get_xattr_value(blob, name, value, value_len, false);
+}
+
+struct spdk_xattr_names {
+ uint32_t count;
+ const char *names[0];
+};
+
+static int
+blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names)
+{
+ struct spdk_xattr *xattr;
+ int count = 0;
+
+ TAILQ_FOREACH(xattr, xattrs, link) {
+ count++;
+ }
+
+ *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *));
+ if (*names == NULL) {
+ return -ENOMEM;
+ }
+
+ TAILQ_FOREACH(xattr, xattrs, link) {
+ (*names)->names[(*names)->count++] = xattr->name;
+ }
+
+ return 0;
+}
+
+int
+spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names)
+{
+ blob_verify_md_op(blob);
+
+ return blob_get_xattr_names(&blob->xattrs, names);
+}
+
+uint32_t
+spdk_xattr_names_get_count(struct spdk_xattr_names *names)
+{
+ assert(names != NULL);
+
+ return names->count;
+}
+
+const char *
+spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index)
+{
+ if (index >= names->count) {
+ return NULL;
+ }
+
+ return names->names[index];
+}
+
+void
+spdk_xattr_names_free(struct spdk_xattr_names *names)
+{
+ free(names);
+}
+
+struct spdk_bs_type
+spdk_bs_get_bstype(struct spdk_blob_store *bs)
+{
+ return bs->bstype;
+}
+
+void
+spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype)
+{
+ memcpy(&bs->bstype, &bstype, sizeof(bstype));
+}
+
+bool
+spdk_blob_is_read_only(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+ return (blob->data_ro || blob->md_ro);
+}
+
+bool
+spdk_blob_is_snapshot(struct spdk_blob *blob)
+{
+ struct spdk_blob_list *snapshot_entry;
+
+ assert(blob != NULL);
+
+ snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
+ if (snapshot_entry == NULL) {
+ return false;
+ }
+
+ return true;
+}
+
+bool
+spdk_blob_is_clone(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+
+ if (blob->parent_id != SPDK_BLOBID_INVALID) {
+ assert(spdk_blob_is_thin_provisioned(blob));
+ return true;
+ }
+
+ return false;
+}
+
+bool
+spdk_blob_is_thin_provisioned(struct spdk_blob *blob)
+{
+ assert(blob != NULL);
+ return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV);
+}
+
+static void
+blob_update_clear_method(struct spdk_blob *blob)
+{
+ enum blob_clear_method stored_cm;
+
+ assert(blob != NULL);
+
+ /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored
+ * in metadata previously. If something other than the default was
+ * specified, ignore the stored value and use what was passed in.
+ */
+ stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT);
+
+ if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) {
+ blob->clear_method = stored_cm;
+ } else if (blob->clear_method != stored_cm) {
+ SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n",
+ blob->clear_method, stored_cm);
+ }
+}
+
+spdk_blob_id
+spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id)
+{
+ struct spdk_blob_list *snapshot_entry = NULL;
+ struct spdk_blob_list *clone_entry = NULL;
+
+ TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
+ TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
+ if (clone_entry->id == blob_id) {
+ return snapshot_entry->id;
+ }
+ }
+ }
+
+ return SPDK_BLOBID_INVALID;
+}
+
+int
+spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids,
+ size_t *count)
+{
+ struct spdk_blob_list *snapshot_entry, *clone_entry;
+ size_t n;
+
+ snapshot_entry = bs_get_snapshot_entry(bs, blobid);
+ if (snapshot_entry == NULL) {
+ *count = 0;
+ return 0;
+ }
+
+ if (ids == NULL || *count < snapshot_entry->clone_count) {
+ *count = snapshot_entry->clone_count;
+ return -ENOMEM;
+ }
+ *count = snapshot_entry->clone_count;
+
+ n = 0;
+ TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
+ ids[n++] = clone_entry->id;
+ }
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("blob", SPDK_LOG_BLOB)
diff --git a/src/spdk/lib/blob/blobstore.h b/src/spdk/lib/blob/blobstore.h
new file mode 100644
index 000000000..5e93bd6ad
--- /dev/null
+++ b/src/spdk/lib/blob/blobstore.h
@@ -0,0 +1,702 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BLOBSTORE_H
+#define SPDK_BLOBSTORE_H
+
+#include "spdk/assert.h"
+#include "spdk/blob.h"
+#include "spdk/queue.h"
+#include "spdk/util.h"
+
+#include "request.h"
+
+/* In Memory Data Structures
+ *
+ * The following data structures exist only in memory.
+ */
+
+#define SPDK_BLOB_OPTS_CLUSTER_SZ (1024 * 1024)
+#define SPDK_BLOB_OPTS_NUM_MD_PAGES UINT32_MAX
+#define SPDK_BLOB_OPTS_MAX_MD_OPS 32
+#define SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS 512
+#define SPDK_BLOB_BLOBID_HIGH_BIT (1ULL << 32)
+
+struct spdk_xattr {
+ uint32_t index;
+ uint16_t value_len;
+ char *name;
+ void *value;
+ TAILQ_ENTRY(spdk_xattr) link;
+};
+
+/* The mutable part of the blob data that is sync'd to
+ * disk. The data in here is both mutable and persistent.
+ */
+struct spdk_blob_mut_data {
+ /* Number of data clusters in the blob */
+ uint64_t num_clusters;
+
+ /* Array of LBAs that are the beginning of a cluster, in
+ * the order they appear in the blob.
+ */
+ uint64_t *clusters;
+
+ /* The size of the clusters array. This is greater than or
+ * equal to 'num_clusters'.
+ */
+ size_t cluster_array_size;
+
+ /* Number of extent pages */
+ uint64_t num_extent_pages;
+
+ /* Array of page offsets into the metadata region,
+ * containing extents. Can contain entries for not yet
+ * allocated pages. */
+ uint32_t *extent_pages;
+
+ /* The size of the extent page array. This is greater than or
+ * equal to 'num_extent_pages'. */
+ size_t extent_pages_array_size;
+
+ /* Number of metadata pages */
+ uint32_t num_pages;
+
+ /* Array of page offsets into the metadata region, in
+ * the order of the metadata page sequence.
+ */
+ uint32_t *pages;
+};
+
+enum spdk_blob_state {
+ /* The blob in-memory version does not match the on-disk
+ * version.
+ */
+ SPDK_BLOB_STATE_DIRTY,
+
+ /* The in-memory version of the blob matches the on-disk
+ * version.
+ */
+ SPDK_BLOB_STATE_CLEAN,
+
+ /* The in-memory state is being synchronized with the on-disk
+ * blob state. */
+ SPDK_BLOB_STATE_LOADING,
+};
+
+TAILQ_HEAD(spdk_xattr_tailq, spdk_xattr);
+
+struct spdk_blob_list {
+ spdk_blob_id id;
+ size_t clone_count;
+ TAILQ_HEAD(, spdk_blob_list) clones;
+ TAILQ_ENTRY(spdk_blob_list) link;
+};
+
+struct spdk_blob {
+ struct spdk_blob_store *bs;
+
+ uint32_t open_ref;
+
+ spdk_blob_id id;
+ spdk_blob_id parent_id;
+
+ enum spdk_blob_state state;
+
+ /* Two copies of the mutable data. One is a version
+ * that matches the last known data on disk (clean).
+ * The other (active) is the current data. Syncing
+ * a blob makes the clean match the active.
+ */
+ struct spdk_blob_mut_data clean;
+ struct spdk_blob_mut_data active;
+
+ bool invalid;
+ bool data_ro;
+ bool md_ro;
+
+ uint64_t invalid_flags;
+ uint64_t data_ro_flags;
+ uint64_t md_ro_flags;
+
+ struct spdk_bs_dev *back_bs_dev;
+
+ /* TODO: The xattrs are mutable, but we don't want to be
+ * copying them unnecessarily. Figure this out.
+ */
+ struct spdk_xattr_tailq xattrs;
+ struct spdk_xattr_tailq xattrs_internal;
+
+ TAILQ_ENTRY(spdk_blob) link;
+
+ uint32_t frozen_refcnt;
+ bool locked_operation_in_progress;
+ enum blob_clear_method clear_method;
+ bool extent_rle_found;
+ bool extent_table_found;
+ bool use_extent_table;
+
+ /* A list of pending metadata persist operations (pending_persists) */
+ TAILQ_HEAD(, spdk_blob_persist_ctx) pending_persists;
+
+ /* Number of data clusters retrieved from the extent table;
+ * that many still have to be read from extent pages. */
+ uint64_t remaining_clusters_in_et;
+};
+
+struct spdk_blob_store {
+ uint64_t md_start; /* Offset from beginning of disk, in pages */
+ uint32_t md_len; /* Count, in pages */
+
+ struct spdk_io_channel *md_channel;
+ uint32_t max_channel_ops;
+
+ struct spdk_thread *md_thread;
+
+ struct spdk_bs_dev *dev;
+
+ struct spdk_bit_array *used_md_pages;
+ struct spdk_bit_array *used_clusters;
+ struct spdk_bit_array *used_blobids;
+ struct spdk_bit_array *open_blobids;
+
+ pthread_mutex_t used_clusters_mutex;
+
+ uint32_t cluster_sz;
+ uint64_t total_clusters;
+ uint64_t total_data_clusters;
+ uint64_t num_free_clusters;
+ uint64_t pages_per_cluster;
+ uint8_t pages_per_cluster_shift;
+ uint32_t io_unit_size;
+
+ spdk_blob_id super_blob;
+ struct spdk_bs_type bstype;
+
+ struct spdk_bs_cpl unload_cpl;
+ int unload_err;
+
+ TAILQ_HEAD(, spdk_blob) blobs;
+ TAILQ_HEAD(, spdk_blob_list) snapshots;
+
+ bool clean;
+};
+
+struct spdk_bs_channel {
+ struct spdk_bs_request_set *req_mem;
+ TAILQ_HEAD(, spdk_bs_request_set) reqs;
+
+ struct spdk_blob_store *bs;
+
+ struct spdk_bs_dev *dev;
+ struct spdk_io_channel *dev_channel;
+
+ TAILQ_HEAD(, spdk_bs_request_set) need_cluster_alloc;
+ TAILQ_HEAD(, spdk_bs_request_set) queued_io;
+};
+
+/** operation type */
+enum spdk_blob_op_type {
+ SPDK_BLOB_WRITE,
+ SPDK_BLOB_READ,
+ SPDK_BLOB_UNMAP,
+ SPDK_BLOB_WRITE_ZEROES,
+ SPDK_BLOB_WRITEV,
+ SPDK_BLOB_READV,
+};
+
+/* back bs_dev */
+
+#define BLOB_SNAPSHOT "SNAP"
+#define SNAPSHOT_IN_PROGRESS "SNAPTMP"
+#define SNAPSHOT_PENDING_REMOVAL "SNAPRM"
+
+struct spdk_blob_bs_dev {
+ struct spdk_bs_dev bs_dev;
+ struct spdk_blob *blob;
+};
+
+/* On-Disk Data Structures
+ *
+ * The following data structures exist on disk.
+ */
+#define SPDK_BS_INITIAL_VERSION 1
+#define SPDK_BS_VERSION 3 /* current version */
+
+#pragma pack(push, 1)
+
+#define SPDK_MD_MASK_TYPE_USED_PAGES 0
+#define SPDK_MD_MASK_TYPE_USED_CLUSTERS 1
+#define SPDK_MD_MASK_TYPE_USED_BLOBIDS 2
+
+struct spdk_bs_md_mask {
+ uint8_t type;
+ uint32_t length; /* In bits */
+ uint8_t mask[0];
+};
+
+#define SPDK_MD_DESCRIPTOR_TYPE_PADDING 0
+#define SPDK_MD_DESCRIPTOR_TYPE_XATTR 2
+#define SPDK_MD_DESCRIPTOR_TYPE_FLAGS 3
+#define SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL 4
+
+/* Following descriptors define cluster layout in a blob.
+ * EXTENT_RLE cannot be present in blobs metadata,
+ * at the same time as EXTENT_TABLE and EXTENT_PAGE descriptors. */
+
+/* EXTENT_RLE descriptor holds an array of LBAs that point to the
+ * beginnings of allocated clusters. The array is run-length encoded,
+ * with 0's marking unallocated clusters. It is part of the serialized
+ * metadata chain for a blob. */
+#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE 1
+/* EXTENT_TABLE descriptor holds an array of md page offsets that
+ * point to pages containing EXTENT_PAGE descriptors. The 0's in the
+ * array, which mark unallocated extent pages, are run-length encoded.
+ * It is part of the serialized metadata chain for a blob. */
+#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE 5
+/* EXTENT_PAGE descriptor holds an array of LBAs that point to the
+ * beginnings of allocated clusters. The array is run-length encoded,
+ * with 0's being unallocated clusters. It is NOT part of the
+ * serialized metadata chain for a blob. */
+#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE 6
+
+struct spdk_blob_md_descriptor_xattr {
+ uint8_t type;
+ uint32_t length;
+
+ uint16_t name_length;
+ uint16_t value_length;
+
+ char name[0];
+ /* String name immediately followed by string value. */
+};
+
+struct spdk_blob_md_descriptor_extent_rle {
+ uint8_t type;
+ uint32_t length;
+
+ struct {
+ uint32_t cluster_idx;
+ uint32_t length; /* In units of clusters */
+ } extents[0];
+};
+
+struct spdk_blob_md_descriptor_extent_table {
+ uint8_t type;
+ uint32_t length;
+
+ /* Number of data clusters in the blob */
+ uint64_t num_clusters;
+
+ struct {
+ uint32_t page_idx;
+ uint32_t num_pages; /* In units of pages */
+ } extent_page[0];
+};
+
+struct spdk_blob_md_descriptor_extent_page {
+ uint8_t type;
+ uint32_t length;
+
+ /* First cluster index in this extent page */
+ uint32_t start_cluster_idx;
+
+ uint32_t cluster_idx[0];
+};
+
+#define SPDK_BLOB_THIN_PROV (1ULL << 0)
+#define SPDK_BLOB_INTERNAL_XATTR (1ULL << 1)
+#define SPDK_BLOB_EXTENT_TABLE (1ULL << 2)
+#define SPDK_BLOB_INVALID_FLAGS_MASK (SPDK_BLOB_THIN_PROV | SPDK_BLOB_INTERNAL_XATTR | SPDK_BLOB_EXTENT_TABLE)
+
+#define SPDK_BLOB_READ_ONLY (1ULL << 0)
+#define SPDK_BLOB_DATA_RO_FLAGS_MASK SPDK_BLOB_READ_ONLY
+
+#define SPDK_BLOB_CLEAR_METHOD_SHIFT 0
+#define SPDK_BLOB_CLEAR_METHOD (3ULL << SPDK_BLOB_CLEAR_METHOD_SHIFT)
+#define SPDK_BLOB_MD_RO_FLAGS_MASK SPDK_BLOB_CLEAR_METHOD
+
+struct spdk_blob_md_descriptor_flags {
+ uint8_t type;
+ uint32_t length;
+
+ /*
+ * If a flag in invalid_flags is set that the application is not aware of,
+ * the blob will not be allowed to be opened.
+ */
+ uint64_t invalid_flags;
+
+ /*
+ * If a flag in data_ro_flags is set that the application is not aware of,
+ * allow the blob to be opened in data_read_only and md_read_only mode.
+ */
+ uint64_t data_ro_flags;
+
+ /*
+ * If a flag in md_ro_flags is set that the application is not aware of,
+ * allow the blob to be opened in md_read_only mode.
+ */
+ uint64_t md_ro_flags;
+};
+
+struct spdk_blob_md_descriptor {
+ uint8_t type;
+ uint32_t length;
+};
+
+#define SPDK_INVALID_MD_PAGE UINT32_MAX
+
+struct spdk_blob_md_page {
+ spdk_blob_id id;
+
+ uint32_t sequence_num;
+ uint32_t reserved0;
+
+ /* Descriptors here */
+ uint8_t descriptors[4072];
+
+ uint32_t next;
+ uint32_t crc;
+};
+#define SPDK_BS_PAGE_SIZE 0x1000
+SPDK_STATIC_ASSERT(SPDK_BS_PAGE_SIZE == sizeof(struct spdk_blob_md_page), "Invalid md page size");
+
+#define SPDK_BS_MAX_DESC_SIZE sizeof(((struct spdk_blob_md_page*)0)->descriptors)
+
+/* Maximum number of extents a single Extent Page can fit.
+ * For an SPDK_BS_PAGE_SIZE of 4K SPDK_EXTENTS_PER_EP would be 512. */
+#define SPDK_EXTENTS_PER_EP_MAX ((SPDK_BS_MAX_DESC_SIZE - sizeof(struct spdk_blob_md_descriptor_extent_page)) / sizeof(uint32_t))
+#define SPDK_EXTENTS_PER_EP (spdk_align64pow2(SPDK_EXTENTS_PER_EP_MAX + 1) >> 1u)
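+/* SPDK_EXTENTS_PER_EP rounds SPDK_EXTENTS_PER_EP_MAX down to the largest power
+ * of two that does not exceed it (round up to the next power of two, then
+ * halve); presumably this keeps the cluster-to-extent-page division cheap. */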
+
+#define SPDK_BS_SUPER_BLOCK_SIG "SPDKBLOB"
+
+struct spdk_bs_super_block {
+ uint8_t signature[8];
+ uint32_t version;
+ uint32_t length;
+ uint32_t clean; /* If there was a clean shutdown, this is 1. */
+ spdk_blob_id super_blob;
+
+ uint32_t cluster_size; /* In bytes */
+
+ uint32_t used_page_mask_start; /* Offset from beginning of disk, in pages */
+ uint32_t used_page_mask_len; /* Count, in pages */
+
+ uint32_t used_cluster_mask_start; /* Offset from beginning of disk, in pages */
+ uint32_t used_cluster_mask_len; /* Count, in pages */
+
+ uint32_t md_start; /* Offset from beginning of disk, in pages */
+ uint32_t md_len; /* Count, in pages */
+
+ struct spdk_bs_type bstype; /* blobstore type */
+
+ uint32_t used_blobid_mask_start; /* Offset from beginning of disk, in pages */
+ uint32_t used_blobid_mask_len; /* Count, in pages */
+
+ uint64_t size; /* size of blobstore in bytes */
+ uint32_t io_unit_size; /* Size of io unit in bytes */
+
+ uint8_t reserved[4000];
+ uint32_t crc;
+};
+SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size");
+
+#pragma pack(pop)
+
+struct spdk_bs_dev *bs_create_zeroes_dev(void);
+struct spdk_bs_dev *bs_create_blob_bs_dev(struct spdk_blob *blob);
+
+/* Unit Conversions
+ *
+ * The blobstore works with several different units:
+ * - Byte: Self explanatory
+ * - LBA: The logical blocks on the backing storage device.
+ * - Page: The read/write units of blobs and metadata. This is
+ * an offset into a blob in units of 4KiB.
+ * - Cluster Index: The disk is broken into a sequential list of
+ * clusters. This is the offset from the beginning.
+ *
+ * NOTE: These conversions all act on simple magnitudes, not with any sort
+ * of knowledge about the blobs themselves. For instance, converting
+ * a page to an lba with the conversion function below simply converts
+ * a number of pages to an equivalent number of lbas, but that
+ * lba certainly isn't the right lba that corresponds to a page offset
+ * for a particular blob.
+ */
+static inline uint64_t
+bs_byte_to_lba(struct spdk_blob_store *bs, uint64_t length)
+{
+ assert(length % bs->dev->blocklen == 0);
+
+ return length / bs->dev->blocklen;
+}
+
+static inline uint64_t
+bs_dev_byte_to_lba(struct spdk_bs_dev *bs_dev, uint64_t length)
+{
+ assert(length % bs_dev->blocklen == 0);
+
+ return length / bs_dev->blocklen;
+}
+
+static inline uint64_t
+bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page)
+{
+ return page * SPDK_BS_PAGE_SIZE / bs->dev->blocklen;
+}
+
+static inline uint64_t
+bs_md_page_to_lba(struct spdk_blob_store *bs, uint32_t page)
+{
+ assert(page < bs->md_len);
+ return bs_page_to_lba(bs, page + bs->md_start);
+}
+
+static inline uint64_t
+bs_dev_page_to_lba(struct spdk_bs_dev *bs_dev, uint64_t page)
+{
+ return page * SPDK_BS_PAGE_SIZE / bs_dev->blocklen;
+}
+
+static inline uint64_t
+bs_io_unit_per_page(struct spdk_blob_store *bs)
+{
+ return SPDK_BS_PAGE_SIZE / bs->io_unit_size;
+}
+
+static inline uint64_t
+bs_io_unit_to_page(struct spdk_blob_store *bs, uint64_t io_unit)
+{
+ return io_unit / bs_io_unit_per_page(bs);
+}
+
+static inline uint64_t
+bs_cluster_to_page(struct spdk_blob_store *bs, uint32_t cluster)
+{
+ return (uint64_t)cluster * bs->pages_per_cluster;
+}
+
+static inline uint32_t
+bs_page_to_cluster(struct spdk_blob_store *bs, uint64_t page)
+{
+ assert(page % bs->pages_per_cluster == 0);
+
+ return page / bs->pages_per_cluster;
+}
+
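+/* Example (illustrative numbers): with the default 1 MiB cluster size and a
+ * 512-byte block device, cluster 3 starts at LBA 3 * (1048576 / 512) = 6144. */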
+static inline uint64_t
+bs_cluster_to_lba(struct spdk_blob_store *bs, uint32_t cluster)
+{
+ return (uint64_t)cluster * (bs->cluster_sz / bs->dev->blocklen);
+}
+
+static inline uint32_t
+bs_lba_to_cluster(struct spdk_blob_store *bs, uint64_t lba)
+{
+ assert(lba % (bs->cluster_sz / bs->dev->blocklen) == 0);
+
+ return lba / (bs->cluster_sz / bs->dev->blocklen);
+}
+
+static inline uint64_t
+bs_io_unit_to_back_dev_lba(struct spdk_blob *blob, uint64_t io_unit)
+{
+ return io_unit * (blob->bs->io_unit_size / blob->back_bs_dev->blocklen);
+}
+
+static inline uint64_t
+bs_back_dev_lba_to_io_unit(struct spdk_blob *blob, uint64_t lba)
+{
+ return lba * (blob->back_bs_dev->blocklen / blob->bs->io_unit_size);
+}
+
+static inline uint64_t
+bs_cluster_to_extent_table_id(uint64_t cluster_num)
+{
+ return cluster_num / SPDK_EXTENTS_PER_EP;
+}
+
+static inline uint32_t *
+bs_cluster_to_extent_page(struct spdk_blob *blob, uint64_t cluster_num)
+{
+ uint64_t extent_table_id = bs_cluster_to_extent_table_id(cluster_num);
+
+ assert(blob->use_extent_table);
+ assert(extent_table_id < blob->active.extent_pages_array_size);
+
+ return &blob->active.extent_pages[extent_table_id];
+}
+
+/* End basic conversions */
+
+static inline uint64_t
+bs_blobid_to_page(spdk_blob_id id)
+{
+ return id & 0xFFFFFFFF;
+}
+
+/* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper
+ * 32 bits are not currently used. Stick a 1 there just to catch bugs where the
+ * code assumes blob id == page_idx.
+ */
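+/* For example, page_idx 5 becomes blob id 0x100000005, and bs_blobid_to_page()
+ * recovers 5 by masking off the upper 32 bits. */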
+static inline spdk_blob_id
+bs_page_to_blobid(uint64_t page_idx)
+{
+ if (page_idx > UINT32_MAX) {
+ return SPDK_BLOBID_INVALID;
+ }
+ return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx;
+}
+
+/* Given an io unit offset into a blob, look up the LBA for the
+ * start of that io unit.
+ */
+static inline uint64_t
+bs_blob_io_unit_to_lba(struct spdk_blob *blob, uint64_t io_unit)
+{
+ uint64_t lba;
+ uint64_t pages_per_cluster;
+ uint8_t shift;
+ uint64_t io_units_per_cluster;
+ uint64_t io_units_per_page;
+ uint64_t page;
+
+ page = bs_io_unit_to_page(blob->bs, io_unit);
+
+ pages_per_cluster = blob->bs->pages_per_cluster;
+ shift = blob->bs->pages_per_cluster_shift;
+ io_units_per_page = bs_io_unit_per_page(blob->bs);
+
+ assert(page < blob->active.num_clusters * pages_per_cluster);
+
+ if (shift != 0) {
+ io_units_per_cluster = io_units_per_page << shift;
+ lba = blob->active.clusters[page >> shift];
+ } else {
+ io_units_per_cluster = io_units_per_page * pages_per_cluster;
+ lba = blob->active.clusters[page / pages_per_cluster];
+ }
+ lba += io_unit % io_units_per_cluster;
+ return lba;
+}
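+
+/* Worked example for the conversion above (hypothetical geometry, illustration
+ * only): with 512-byte io units, 4 KiB pages (8 io units per page) and 256
+ * pages per cluster (shift == 8), io_unit 2050 lands on page 2050 / 8 = 256,
+ * i.e. cluster 256 >> 8 = 1, so the result is active.clusters[1] plus
+ * 2050 % 2048 = 2.
+ */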
+
+/* Given an io_unit offset into a blob, look up the number of io_units until the
+ * next cluster boundary.
+ */
+static inline uint32_t
+bs_num_io_units_to_cluster_boundary(struct spdk_blob *blob, uint64_t io_unit)
+{
+ uint64_t io_units_per_cluster;
+ uint8_t shift = blob->bs->pages_per_cluster_shift;
+
+ if (shift != 0) {
+ io_units_per_cluster = bs_io_unit_per_page(blob->bs) << shift;
+ } else {
+ io_units_per_cluster = bs_io_unit_per_page(blob->bs) * blob->bs->pages_per_cluster;
+ }
+
+ return io_units_per_cluster - (io_unit % io_units_per_cluster);
+}
+
+/* Given a page offset into a blob, look up the number of pages until the
+ * next cluster boundary.
+ */
+static inline uint32_t
+bs_num_pages_to_cluster_boundary(struct spdk_blob *blob, uint64_t page)
+{
+ uint64_t pages_per_cluster;
+
+ pages_per_cluster = blob->bs->pages_per_cluster;
+
+ return pages_per_cluster - (page % pages_per_cluster);
+}
+
+/* Given an io_unit offset into a blob, look up the page offset, from the start of the blob, of the beginning of its cluster */
+static inline uint32_t
+bs_io_unit_to_cluster_start(struct spdk_blob *blob, uint64_t io_unit)
+{
+ uint64_t pages_per_cluster;
+ uint64_t page;
+
+ pages_per_cluster = blob->bs->pages_per_cluster;
+ page = bs_io_unit_to_page(blob->bs, io_unit);
+
+ return page - (page % pages_per_cluster);
+}
+
+/* Given an io_unit offset into a blob, look up the number of the cluster that contains it */
+static inline uint32_t
+bs_io_unit_to_cluster_number(struct spdk_blob *blob, uint64_t io_unit)
+{
+ uint64_t pages_per_cluster = blob->bs->pages_per_cluster;
+ uint8_t shift = blob->bs->pages_per_cluster_shift;
+ uint32_t page_offset;
+
+ page_offset = io_unit / bs_io_unit_per_page(blob->bs);
+ if (shift != 0) {
+ return page_offset >> shift;
+ } else {
+ return page_offset / pages_per_cluster;
+ }
+}
+
+/* Given an io unit offset into a blob, look up whether it falls within an allocated cluster. */
+static inline bool
+bs_io_unit_is_allocated(struct spdk_blob *blob, uint64_t io_unit)
+{
+ uint64_t lba;
+ uint64_t page;
+ uint64_t pages_per_cluster;
+ uint8_t shift;
+
+ shift = blob->bs->pages_per_cluster_shift;
+ pages_per_cluster = blob->bs->pages_per_cluster;
+ page = bs_io_unit_to_page(blob->bs, io_unit);
+
+ assert(page < blob->active.num_clusters * pages_per_cluster);
+
+ if (shift != 0) {
+ lba = blob->active.clusters[page >> shift];
+ } else {
+ lba = blob->active.clusters[page / pages_per_cluster];
+ }
+
+ if (lba == 0) {
+ assert(spdk_blob_is_thin_provisioned(blob));
+ return false;
+ } else {
+ return true;
+ }
+}
+
+#endif
diff --git a/src/spdk/lib/blob/request.c b/src/spdk/lib/blob/request.c
new file mode 100644
index 000000000..0975bcf24
--- /dev/null
+++ b/src/spdk/lib/blob/request.c
@@ -0,0 +1,521 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "blobstore.h"
+#include "request.h"
+
+#include "spdk/thread.h"
+#include "spdk/queue.h"
+
+#include "spdk_internal/log.h"
+
+void
+bs_call_cpl(struct spdk_bs_cpl *cpl, int bserrno)
+{
+ switch (cpl->type) {
+ case SPDK_BS_CPL_TYPE_BS_BASIC:
+ cpl->u.bs_basic.cb_fn(cpl->u.bs_basic.cb_arg,
+ bserrno);
+ break;
+ case SPDK_BS_CPL_TYPE_BS_HANDLE:
+ cpl->u.bs_handle.cb_fn(cpl->u.bs_handle.cb_arg,
+ bserrno == 0 ? cpl->u.bs_handle.bs : NULL,
+ bserrno);
+ break;
+ case SPDK_BS_CPL_TYPE_BLOB_BASIC:
+ cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg,
+ bserrno);
+ break;
+ case SPDK_BS_CPL_TYPE_BLOBID:
+ cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg,
+ bserrno == 0 ? cpl->u.blobid.blobid : SPDK_BLOBID_INVALID,
+ bserrno);
+ break;
+ case SPDK_BS_CPL_TYPE_BLOB_HANDLE:
+ cpl->u.blob_handle.cb_fn(cpl->u.blob_handle.cb_arg,
+ bserrno == 0 ? cpl->u.blob_handle.blob : NULL,
+ bserrno);
+ break;
+ case SPDK_BS_CPL_TYPE_NESTED_SEQUENCE:
+ cpl->u.nested_seq.cb_fn(cpl->u.nested_seq.cb_arg,
+ cpl->u.nested_seq.parent,
+ bserrno);
+ break;
+ case SPDK_BS_CPL_TYPE_NONE:
+ /* this completion's callback is handled elsewhere */
+ break;
+ }
+}
+
+static void
+bs_request_set_complete(struct spdk_bs_request_set *set)
+{
+ struct spdk_bs_cpl cpl = set->cpl;
+ int bserrno = set->bserrno;
+
+ TAILQ_INSERT_TAIL(&set->channel->reqs, set, link);
+
+ bs_call_cpl(&cpl, bserrno);
+}
+
+static void
+bs_sequence_completion(struct spdk_io_channel *channel, void *cb_arg, int bserrno)
+{
+ struct spdk_bs_request_set *set = cb_arg;
+
+ set->bserrno = bserrno;
+ set->u.sequence.cb_fn((spdk_bs_sequence_t *)set, set->u.sequence.cb_arg, bserrno);
+}
+
+spdk_bs_sequence_t *
+bs_sequence_start(struct spdk_io_channel *_channel,
+ struct spdk_bs_cpl *cpl)
+{
+ struct spdk_bs_channel *channel;
+ struct spdk_bs_request_set *set;
+
+ channel = spdk_io_channel_get_ctx(_channel);
+ assert(channel != NULL);
+ set = TAILQ_FIRST(&channel->reqs);
+ if (!set) {
+ return NULL;
+ }
+ TAILQ_REMOVE(&channel->reqs, set, link);
+
+ set->cpl = *cpl;
+ set->bserrno = 0;
+ set->channel = channel;
+
+ set->cb_args.cb_fn = bs_sequence_completion;
+ set->cb_args.cb_arg = set;
+ set->cb_args.channel = channel->dev_channel;
+
+ return (spdk_bs_sequence_t *)set;
+}
+
+void
+bs_sequence_read_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev,
+ void *payload, uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.sequence.cb_fn = cb_fn;
+ set->u.sequence.cb_arg = cb_arg;
+
+ bs_dev->read(bs_dev, spdk_io_channel_from_ctx(channel), payload, lba, lba_count, &set->cb_args);
+}
+
+void
+bs_sequence_read_dev(spdk_bs_sequence_t *seq, void *payload,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.sequence.cb_fn = cb_fn;
+ set->u.sequence.cb_arg = cb_arg;
+
+ channel->dev->read(channel->dev, channel->dev_channel, payload, lba, lba_count, &set->cb_args);
+}
+
+void
+bs_sequence_write_dev(spdk_bs_sequence_t *seq, void *payload,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.sequence.cb_fn = cb_fn;
+ set->u.sequence.cb_arg = cb_arg;
+
+ channel->dev->write(channel->dev, channel->dev_channel, payload, lba, lba_count,
+ &set->cb_args);
+}
+
+void
+bs_sequence_readv_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev,
+ struct iovec *iov, int iovcnt, uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.sequence.cb_fn = cb_fn;
+ set->u.sequence.cb_arg = cb_arg;
+
+ bs_dev->readv(bs_dev, spdk_io_channel_from_ctx(channel), iov, iovcnt, lba, lba_count,
+ &set->cb_args);
+}
+
+void
+bs_sequence_readv_dev(spdk_bs_sequence_t *seq, struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.sequence.cb_fn = cb_fn;
+ set->u.sequence.cb_arg = cb_arg;
+ channel->dev->readv(channel->dev, channel->dev_channel, iov, iovcnt, lba, lba_count,
+ &set->cb_args);
+}
+
+void
+bs_sequence_writev_dev(spdk_bs_sequence_t *seq, struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.sequence.cb_fn = cb_fn;
+ set->u.sequence.cb_arg = cb_arg;
+
+ channel->dev->writev(channel->dev, channel->dev_channel, iov, iovcnt, lba, lba_count,
+ &set->cb_args);
+}
+
+void
+bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "writing zeroes to %" PRIu32 " blocks at LBA %" PRIu64 "\n",
+ lba_count, lba);
+
+ set->u.sequence.cb_fn = cb_fn;
+ set->u.sequence.cb_arg = cb_arg;
+
+ channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count,
+ &set->cb_args);
+}
+
+void
+bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno)
+{
+ if (bserrno != 0) {
+ seq->bserrno = bserrno;
+ }
+ bs_request_set_complete((struct spdk_bs_request_set *)seq);
+}
+
+void
+bs_user_op_sequence_finish(void *cb_arg, int bserrno)
+{
+ spdk_bs_sequence_t *seq = cb_arg;
+
+ bs_sequence_finish(seq, bserrno);
+}
+
+static void
+bs_batch_completion(struct spdk_io_channel *_channel,
+ void *cb_arg, int bserrno)
+{
+ struct spdk_bs_request_set *set = cb_arg;
+
+ set->u.batch.outstanding_ops--;
+ if (bserrno != 0) {
+ set->bserrno = bserrno;
+ }
+
+ if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) {
+ if (set->u.batch.cb_fn) {
+ set->cb_args.cb_fn = bs_sequence_completion;
+ set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, bserrno);
+ } else {
+ bs_request_set_complete(set);
+ }
+ }
+}
+
+spdk_bs_batch_t *
+bs_batch_open(struct spdk_io_channel *_channel,
+ struct spdk_bs_cpl *cpl)
+{
+ struct spdk_bs_channel *channel;
+ struct spdk_bs_request_set *set;
+
+ channel = spdk_io_channel_get_ctx(_channel);
+ assert(channel != NULL);
+ set = TAILQ_FIRST(&channel->reqs);
+ if (!set) {
+ return NULL;
+ }
+ TAILQ_REMOVE(&channel->reqs, set, link);
+
+ set->cpl = *cpl;
+ set->bserrno = 0;
+ set->channel = channel;
+
+ set->u.batch.cb_fn = NULL;
+ set->u.batch.cb_arg = NULL;
+ set->u.batch.outstanding_ops = 0;
+ set->u.batch.batch_closed = 0;
+
+ set->cb_args.cb_fn = bs_batch_completion;
+ set->cb_args.cb_arg = set;
+ set->cb_args.channel = channel->dev_channel;
+
+ return (spdk_bs_batch_t *)set;
+}
+
+void
+bs_batch_read_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev,
+ void *payload, uint64_t lba, uint32_t lba_count)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.batch.outstanding_ops++;
+ bs_dev->read(bs_dev, spdk_io_channel_from_ctx(channel), payload, lba, lba_count, &set->cb_args);
+}
+
+void
+bs_batch_read_dev(spdk_bs_batch_t *batch, void *payload,
+ uint64_t lba, uint32_t lba_count)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.batch.outstanding_ops++;
+ channel->dev->read(channel->dev, channel->dev_channel, payload, lba, lba_count, &set->cb_args);
+}
+
+void
+bs_batch_write_dev(spdk_bs_batch_t *batch, void *payload,
+ uint64_t lba, uint32_t lba_count)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks to LBA %" PRIu64 "\n", lba_count, lba);
+
+ set->u.batch.outstanding_ops++;
+ channel->dev->write(channel->dev, channel->dev_channel, payload, lba, lba_count,
+ &set->cb_args);
+}
+
+void
+bs_batch_unmap_dev(spdk_bs_batch_t *batch,
+ uint64_t lba, uint32_t lba_count)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Unmapping %" PRIu32 " blocks at LBA %" PRIu64 "\n", lba_count,
+ lba);
+
+ set->u.batch.outstanding_ops++;
+ channel->dev->unmap(channel->dev, channel->dev_channel, lba, lba_count,
+ &set->cb_args);
+}
+
+void
+bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch,
+ uint64_t lba, uint32_t lba_count)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;
+ struct spdk_bs_channel *channel = set->channel;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Zeroing %" PRIu32 " blocks at LBA %" PRIu64 "\n", lba_count, lba);
+
+ set->u.batch.outstanding_ops++;
+ channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count,
+ &set->cb_args);
+}
+
+void
+bs_batch_close(spdk_bs_batch_t *batch)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch;
+
+ set->u.batch.batch_closed = 1;
+
+ if (set->u.batch.outstanding_ops == 0) {
+ if (set->u.batch.cb_fn) {
+ set->cb_args.cb_fn = bs_sequence_completion;
+ set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, set->bserrno);
+ } else {
+ bs_request_set_complete(set);
+ }
+ }
+}
+
+spdk_bs_batch_t *
+bs_sequence_to_batch(spdk_bs_sequence_t *seq, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq;
+
+ set->u.batch.cb_fn = cb_fn;
+ set->u.batch.cb_arg = cb_arg;
+ set->u.batch.outstanding_ops = 0;
+ set->u.batch.batch_closed = 0;
+
+ set->cb_args.cb_fn = bs_batch_completion;
+
+ return set;
+}
+
+spdk_bs_user_op_t *
+bs_user_op_alloc(struct spdk_io_channel *_channel, struct spdk_bs_cpl *cpl,
+ enum spdk_blob_op_type op_type, struct spdk_blob *blob,
+ void *payload, int iovcnt, uint64_t offset, uint64_t length)
+{
+ struct spdk_bs_channel *channel;
+ struct spdk_bs_request_set *set;
+ struct spdk_bs_user_op_args *args;
+
+ channel = spdk_io_channel_get_ctx(_channel);
+ assert(channel != NULL);
+ set = TAILQ_FIRST(&channel->reqs);
+ if (!set) {
+ return NULL;
+ }
+ TAILQ_REMOVE(&channel->reqs, set, link);
+
+ set->cpl = *cpl;
+ set->channel = channel;
+
+ args = &set->u.user_op;
+
+ args->type = op_type;
+ args->iovcnt = iovcnt;
+ args->blob = blob;
+ args->offset = offset;
+ args->length = length;
+ args->payload = payload;
+
+ return (spdk_bs_user_op_t *)set;
+}
+
+void
+bs_user_op_execute(spdk_bs_user_op_t *op)
+{
+ struct spdk_bs_request_set *set;
+ struct spdk_bs_user_op_args *args;
+ struct spdk_io_channel *ch;
+
+ set = (struct spdk_bs_request_set *)op;
+ args = &set->u.user_op;
+ ch = spdk_io_channel_from_ctx(set->channel);
+
+ switch (args->type) {
+ case SPDK_BLOB_READ:
+ spdk_blob_io_read(args->blob, ch, args->payload, args->offset, args->length,
+ set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg);
+ break;
+ case SPDK_BLOB_WRITE:
+ spdk_blob_io_write(args->blob, ch, args->payload, args->offset, args->length,
+ set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg);
+ break;
+ case SPDK_BLOB_UNMAP:
+ spdk_blob_io_unmap(args->blob, ch, args->offset, args->length,
+ set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg);
+ break;
+ case SPDK_BLOB_WRITE_ZEROES:
+ spdk_blob_io_write_zeroes(args->blob, ch, args->offset, args->length,
+ set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg);
+ break;
+ case SPDK_BLOB_READV:
+ spdk_blob_io_readv(args->blob, ch, args->payload, args->iovcnt,
+ args->offset, args->length,
+ set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg);
+ break;
+ case SPDK_BLOB_WRITEV:
+ spdk_blob_io_writev(args->blob, ch, args->payload, args->iovcnt,
+ args->offset, args->length,
+ set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg);
+ break;
+ }
+ TAILQ_INSERT_TAIL(&set->channel->reqs, set, link);
+}
+
+void
+bs_user_op_abort(spdk_bs_user_op_t *op)
+{
+ struct spdk_bs_request_set *set;
+
+ set = (struct spdk_bs_request_set *)op;
+
+ set->cpl.u.blob_basic.cb_fn(set->cpl.u.blob_basic.cb_arg, -EIO);
+ TAILQ_INSERT_TAIL(&set->channel->reqs, set, link);
+}
+
+void
+bs_sequence_to_batch_completion(void *cb_arg, int bserrno)
+{
+ struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)cb_arg;
+
+ set->u.batch.outstanding_ops--;
+
+ if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) {
+ if (set->cb_args.cb_fn) {
+ set->cb_args.cb_fn(set->cb_args.channel, set->cb_args.cb_arg, bserrno);
+ }
+ }
+}
+
+SPDK_LOG_REGISTER_COMPONENT("blob_rw", SPDK_LOG_BLOB_RW)
diff --git a/src/spdk/lib/blob/request.h b/src/spdk/lib/blob/request.h
new file mode 100644
index 000000000..81dc161db
--- /dev/null
+++ b/src/spdk/lib/blob/request.h
@@ -0,0 +1,217 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BS_REQUEST_H
+#define SPDK_BS_REQUEST_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/blob.h"
+
+enum spdk_bs_cpl_type {
+ SPDK_BS_CPL_TYPE_NONE,
+ SPDK_BS_CPL_TYPE_BS_BASIC,
+ SPDK_BS_CPL_TYPE_BS_HANDLE,
+ SPDK_BS_CPL_TYPE_BLOB_BASIC,
+ SPDK_BS_CPL_TYPE_BLOBID,
+ SPDK_BS_CPL_TYPE_BLOB_HANDLE,
+ SPDK_BS_CPL_TYPE_NESTED_SEQUENCE,
+};
+
+enum spdk_blob_op_type;
+
+struct spdk_bs_request_set;
+
+/* Use a sequence to submit a set of requests serially */
+typedef struct spdk_bs_request_set spdk_bs_sequence_t;
+
+/* Use a batch to submit a set of requests in parallel */
+typedef struct spdk_bs_request_set spdk_bs_batch_t;
+
+/* Use a user_op to queue a user operation for later execution */
+typedef struct spdk_bs_request_set spdk_bs_user_op_t;
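+
+/* Minimal usage sketch for a sequence (hypothetical caller code; ch, buf, lba,
+ * lba_count, my_ctx, my_done and read_done are placeholders, not part of this
+ * header):
+ *
+ *     struct spdk_bs_cpl cpl = {
+ *             .type = SPDK_BS_CPL_TYPE_BLOB_BASIC,
+ *             .u.blob_basic.cb_fn = my_done,
+ *             .u.blob_basic.cb_arg = my_ctx,
+ *     };
+ *     spdk_bs_sequence_t *seq = bs_sequence_start(ch, &cpl);
+ *
+ *     if (seq != NULL) {
+ *             bs_sequence_read_dev(seq, buf, lba, lba_count, read_done, my_ctx);
+ *     }
+ *
+ * read_done() (an spdk_bs_sequence_cpl) would then either issue the next step
+ * on the same sequence or call bs_sequence_finish(), which invokes the cpl
+ * callback. A batch works the same way but submits its I/O in parallel and is
+ * completed with bs_batch_close().
+ */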
+
+typedef void (*spdk_bs_nested_seq_complete)(void *cb_arg, spdk_bs_sequence_t *parent, int bserrno);
+
+struct spdk_bs_cpl {
+ enum spdk_bs_cpl_type type;
+ union {
+ struct {
+ spdk_bs_op_complete cb_fn;
+ void *cb_arg;
+ } bs_basic;
+
+ struct {
+ spdk_bs_op_with_handle_complete cb_fn;
+ void *cb_arg;
+ struct spdk_blob_store *bs;
+ } bs_handle;
+
+ struct {
+ spdk_blob_op_complete cb_fn;
+ void *cb_arg;
+ } blob_basic;
+
+ struct {
+ spdk_blob_op_with_id_complete cb_fn;
+ void *cb_arg;
+ spdk_blob_id blobid;
+ } blobid;
+
+ struct {
+ spdk_blob_op_with_handle_complete cb_fn;
+ void *cb_arg;
+ struct spdk_blob *blob;
+ } blob_handle;
+
+ struct {
+ spdk_bs_nested_seq_complete cb_fn;
+ void *cb_arg;
+ spdk_bs_sequence_t *parent;
+ } nested_seq;
+ } u;
+};
+
+typedef void (*spdk_bs_sequence_cpl)(spdk_bs_sequence_t *sequence,
+ void *cb_arg, int bserrno);
+
+/* A generic request set. Can be a sequence, batch or a user_op. */
+struct spdk_bs_request_set {
+ struct spdk_bs_cpl cpl;
+
+ int bserrno;
+
+ struct spdk_bs_channel *channel;
+
+ struct spdk_bs_dev_cb_args cb_args;
+
+ union {
+ struct {
+ spdk_bs_sequence_cpl cb_fn;
+ void *cb_arg;
+ } sequence;
+
+ struct {
+ uint32_t outstanding_ops;
+ uint32_t batch_closed;
+ spdk_bs_sequence_cpl cb_fn;
+ void *cb_arg;
+ } batch;
+
+ struct spdk_bs_user_op_args {
+ int type;
+ int iovcnt;
+ struct spdk_blob *blob;
+ uint64_t offset;
+ uint64_t length;
+ spdk_blob_op_complete cb_fn;
+ void *cb_arg;
+ void *payload; /* cast to iov for readv/writev */
+ } user_op;
+ } u;
+
+ TAILQ_ENTRY(spdk_bs_request_set) link;
+};
+
+void bs_call_cpl(struct spdk_bs_cpl *cpl, int bserrno);
+
+spdk_bs_sequence_t *bs_sequence_start(struct spdk_io_channel *channel,
+ struct spdk_bs_cpl *cpl);
+
+void bs_sequence_read_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev,
+ void *payload, uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg);
+
+void bs_sequence_read_dev(spdk_bs_sequence_t *seq, void *payload,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg);
+
+void bs_sequence_write_dev(spdk_bs_sequence_t *seq, void *payload,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg);
+
+void bs_sequence_readv_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev,
+ struct iovec *iov, int iovcnt, uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg);
+
+void bs_sequence_readv_dev(spdk_bs_batch_t *batch, struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg);
+
+void bs_sequence_writev_dev(spdk_bs_batch_t *batch, struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg);
+
+void bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq,
+ uint64_t lba, uint32_t lba_count,
+ spdk_bs_sequence_cpl cb_fn, void *cb_arg);
+
+void bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno);
+
+void bs_user_op_sequence_finish(void *cb_arg, int bserrno);
+
+spdk_bs_batch_t *bs_batch_open(struct spdk_io_channel *channel,
+ struct spdk_bs_cpl *cpl);
+
+void bs_batch_read_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev,
+ void *payload, uint64_t lba, uint32_t lba_count);
+
+void bs_batch_read_dev(spdk_bs_batch_t *batch, void *payload,
+ uint64_t lba, uint32_t lba_count);
+
+void bs_batch_write_dev(spdk_bs_batch_t *batch, void *payload,
+ uint64_t lba, uint32_t lba_count);
+
+void bs_batch_unmap_dev(spdk_bs_batch_t *batch,
+ uint64_t lba, uint32_t lba_count);
+
+void bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch,
+ uint64_t lba, uint32_t lba_count);
+
+void bs_batch_close(spdk_bs_batch_t *batch);
+
+spdk_bs_batch_t *bs_sequence_to_batch(spdk_bs_sequence_t *seq,
+ spdk_bs_sequence_cpl cb_fn,
+ void *cb_arg);
+
+spdk_bs_user_op_t *bs_user_op_alloc(struct spdk_io_channel *channel, struct spdk_bs_cpl *cpl,
+ enum spdk_blob_op_type op_type, struct spdk_blob *blob,
+ void *payload, int iovcnt, uint64_t offset, uint64_t length);
+
+void bs_user_op_execute(spdk_bs_user_op_t *op);
+
+void bs_user_op_abort(spdk_bs_user_op_t *op);
+
+void bs_sequence_to_batch_completion(void *cb_arg, int bserrno);
+
+#endif
diff --git a/src/spdk/lib/blob/spdk_blob.map b/src/spdk/lib/blob/spdk_blob.map
new file mode 100644
index 000000000..7c1bc473f
--- /dev/null
+++ b/src/spdk/lib/blob/spdk_blob.map
@@ -0,0 +1,64 @@
+{
+ global:
+
+ # Public functions
+ spdk_bs_opts_init;
+ spdk_bs_load;
+ spdk_bs_init;
+ spdk_bs_dump;
+ spdk_bs_destroy;
+ spdk_bs_unload;
+ spdk_bs_set_super;
+ spdk_bs_get_super;
+ spdk_bs_get_cluster_size;
+ spdk_bs_get_page_size;
+ spdk_bs_get_io_unit_size;
+ spdk_bs_free_cluster_count;
+ spdk_bs_total_data_cluster_count;
+ spdk_blob_get_id;
+ spdk_blob_get_num_pages;
+ spdk_blob_get_num_io_units;
+ spdk_blob_get_num_clusters;
+ spdk_blob_opts_init;
+ spdk_bs_create_blob_ext;
+ spdk_bs_create_blob;
+ spdk_bs_create_snapshot;
+ spdk_bs_create_clone;
+ spdk_blob_get_clones;
+ spdk_blob_get_parent_snapshot;
+ spdk_blob_is_read_only;
+ spdk_blob_is_snapshot;
+ spdk_blob_is_clone;
+ spdk_blob_is_thin_provisioned;
+ spdk_bs_delete_blob;
+ spdk_bs_inflate_blob;
+ spdk_bs_blob_decouple_parent;
+ spdk_blob_open_opts_init;
+ spdk_bs_open_blob;
+ spdk_bs_open_blob_ext;
+ spdk_blob_resize;
+ spdk_blob_set_read_only;
+ spdk_blob_sync_md;
+ spdk_blob_close;
+ spdk_bs_alloc_io_channel;
+ spdk_bs_free_io_channel;
+ spdk_blob_io_write;
+ spdk_blob_io_read;
+ spdk_blob_io_writev;
+ spdk_blob_io_readv;
+ spdk_blob_io_unmap;
+ spdk_blob_io_write_zeroes;
+ spdk_bs_iter_first;
+ spdk_bs_iter_next;
+ spdk_blob_set_xattr;
+ spdk_blob_remove_xattr;
+ spdk_blob_get_xattr_value;
+ spdk_blob_get_xattr_names;
+ spdk_xattr_names_get_count;
+ spdk_xattr_names_get_name;
+ spdk_xattr_names_free;
+ spdk_bs_get_bstype;
+ spdk_bs_set_bstype;
+
+ local: *;
+};
diff --git a/src/spdk/lib/blob/zeroes.c b/src/spdk/lib/blob/zeroes.c
new file mode 100644
index 000000000..5e8d70545
--- /dev/null
+++ b/src/spdk/lib/blob/zeroes.c
@@ -0,0 +1,122 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/blob.h"
+
+#include "blobstore.h"
+
+static void
+zeroes_destroy(struct spdk_bs_dev *bs_dev)
+{
+ return;
+}
+
+static void
+zeroes_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
+ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ memset(payload, 0, dev->blocklen * lba_count);
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0);
+}
+
+static void
+zeroes_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload,
+ uint64_t lba, uint32_t lba_count,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM);
+ assert(false);
+}
+
+static void
+zeroes_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args)
+{
+ int i;
+
+ for (i = 0; i < iovcnt; i++) {
+ memset(iov[i].iov_base, 0, iov[i].iov_len);
+ }
+
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0);
+}
+
+static void
+zeroes_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ struct iovec *iov, int iovcnt,
+ uint64_t lba, uint32_t lba_count,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM);
+ assert(false);
+}
+
+static void
+zeroes_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ uint64_t lba, uint32_t lba_count,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM);
+ assert(false);
+}
+
+static void
+zeroes_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel,
+ uint64_t lba, uint32_t lba_count,
+ struct spdk_bs_dev_cb_args *cb_args)
+{
+ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM);
+ assert(false);
+}
+
+static struct spdk_bs_dev g_zeroes_bs_dev = {
+ .blockcnt = UINT64_MAX,
+ .blocklen = 512,
+ .create_channel = NULL,
+ .destroy_channel = NULL,
+ .destroy = zeroes_destroy,
+ .read = zeroes_read,
+ .write = zeroes_write,
+ .readv = zeroes_readv,
+ .writev = zeroes_writev,
+ .write_zeroes = zeroes_write_zeroes,
+ .unmap = zeroes_unmap,
+};
+
+struct spdk_bs_dev *
+bs_create_zeroes_dev(void)
+{
+ return &g_zeroes_bs_dev;
+}
diff --git a/src/spdk/lib/blobfs/Makefile b/src/spdk/lib/blobfs/Makefile
new file mode 100644
index 000000000..d0c46de02
--- /dev/null
+++ b/src/spdk/lib/blobfs/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+C_SRCS = blobfs.c tree.c
+LIBNAME = blobfs
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blobfs.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/blobfs/blobfs.c b/src/spdk/lib/blobfs/blobfs.c
new file mode 100644
index 000000000..3af6b0639
--- /dev/null
+++ b/src/spdk/lib/blobfs/blobfs.c
@@ -0,0 +1,2980 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/blobfs.h"
+#include "spdk/conf.h"
+#include "tree.h"
+
+#include "spdk/queue.h"
+#include "spdk/thread.h"
+#include "spdk/assert.h"
+#include "spdk/env.h"
+#include "spdk/util.h"
+#include "spdk_internal/log.h"
+#include "spdk/trace.h"
+
+#define BLOBFS_TRACE(file, str, args...) \
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s " str, file->name, ##args)
+
+#define BLOBFS_TRACE_RW(file, str, args...) \
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS_RW, "file=%s " str, file->name, ##args)
+
+#define BLOBFS_DEFAULT_CACHE_SIZE (4ULL * 1024 * 1024 * 1024)
+#define SPDK_BLOBFS_DEFAULT_OPTS_CLUSTER_SZ (1024 * 1024)
+
+#define SPDK_BLOBFS_SIGNATURE "BLOBFS"
+
+static uint64_t g_fs_cache_size = BLOBFS_DEFAULT_CACHE_SIZE;
+static struct spdk_mempool *g_cache_pool;
+static TAILQ_HEAD(, spdk_file) g_caches;
+static struct spdk_poller *g_cache_pool_mgmt_poller;
+static struct spdk_thread *g_cache_pool_thread;
+#define BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US 1000ULL
+static int g_fs_count = 0;
+static pthread_mutex_t g_cache_init_lock = PTHREAD_MUTEX_INITIALIZER;
+
+#define TRACE_GROUP_BLOBFS 0x7
+#define TRACE_BLOBFS_XATTR_START SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x0)
+#define TRACE_BLOBFS_XATTR_END SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x1)
+#define TRACE_BLOBFS_OPEN SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x2)
+#define TRACE_BLOBFS_CLOSE SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x3)
+#define TRACE_BLOBFS_DELETE_START SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x4)
+#define TRACE_BLOBFS_DELETE_DONE SPDK_TPOINT_ID(TRACE_GROUP_BLOBFS, 0x5)
+
+SPDK_TRACE_REGISTER_FN(blobfs_trace, "blobfs", TRACE_GROUP_BLOBFS)
+{
+ spdk_trace_register_description("BLOBFS_XATTR_START",
+ TRACE_BLOBFS_XATTR_START,
+ OWNER_NONE, OBJECT_NONE, 0,
+ SPDK_TRACE_ARG_TYPE_STR,
+ "file: ");
+ spdk_trace_register_description("BLOBFS_XATTR_END",
+ TRACE_BLOBFS_XATTR_END,
+ OWNER_NONE, OBJECT_NONE, 0,
+ SPDK_TRACE_ARG_TYPE_STR,
+ "file: ");
+ spdk_trace_register_description("BLOBFS_OPEN",
+ TRACE_BLOBFS_OPEN,
+ OWNER_NONE, OBJECT_NONE, 0,
+ SPDK_TRACE_ARG_TYPE_STR,
+ "file: ");
+ spdk_trace_register_description("BLOBFS_CLOSE",
+ TRACE_BLOBFS_CLOSE,
+ OWNER_NONE, OBJECT_NONE, 0,
+ SPDK_TRACE_ARG_TYPE_STR,
+ "file: ");
+ spdk_trace_register_description("BLOBFS_DELETE_START",
+ TRACE_BLOBFS_DELETE_START,
+ OWNER_NONE, OBJECT_NONE, 0,
+ SPDK_TRACE_ARG_TYPE_STR,
+ "file: ");
+ spdk_trace_register_description("BLOBFS_DELETE_DONE",
+ TRACE_BLOBFS_DELETE_DONE,
+ OWNER_NONE, OBJECT_NONE, 0,
+ SPDK_TRACE_ARG_TYPE_STR,
+ "file: ");
+}
+
+void
+cache_buffer_free(struct cache_buffer *cache_buffer)
+{
+ spdk_mempool_put(g_cache_pool, cache_buffer->buf);
+ free(cache_buffer);
+}
+
+#define CACHE_READAHEAD_THRESHOLD (128 * 1024)
+
+struct spdk_file {
+ struct spdk_filesystem *fs;
+ struct spdk_blob *blob;
+ char *name;
+ uint64_t trace_arg_name;
+ uint64_t length;
+ bool is_deleted;
+ bool open_for_writing;
+ uint64_t length_flushed;
+ uint64_t length_xattr;
+ uint64_t append_pos;
+ uint64_t seq_byte_count;
+ uint64_t next_seq_offset;
+ uint32_t priority;
+ TAILQ_ENTRY(spdk_file) tailq;
+ spdk_blob_id blobid;
+ uint32_t ref_count;
+ pthread_spinlock_t lock;
+ struct cache_buffer *last;
+ struct cache_tree *tree;
+ TAILQ_HEAD(open_requests_head, spdk_fs_request) open_requests;
+ TAILQ_HEAD(sync_requests_head, spdk_fs_request) sync_requests;
+ TAILQ_ENTRY(spdk_file) cache_tailq;
+};
+
+struct spdk_deleted_file {
+ spdk_blob_id id;
+ TAILQ_ENTRY(spdk_deleted_file) tailq;
+};
+
+struct spdk_filesystem {
+ struct spdk_blob_store *bs;
+ TAILQ_HEAD(, spdk_file) files;
+ struct spdk_bs_opts bs_opts;
+ struct spdk_bs_dev *bdev;
+ fs_send_request_fn send_request;
+
+ struct {
+ uint32_t max_ops;
+ struct spdk_io_channel *sync_io_channel;
+ struct spdk_fs_channel *sync_fs_channel;
+ } sync_target;
+
+ struct {
+ uint32_t max_ops;
+ struct spdk_io_channel *md_io_channel;
+ struct spdk_fs_channel *md_fs_channel;
+ } md_target;
+
+ struct {
+ uint32_t max_ops;
+ } io_target;
+};
+
+struct spdk_fs_cb_args {
+ union {
+ spdk_fs_op_with_handle_complete fs_op_with_handle;
+ spdk_fs_op_complete fs_op;
+ spdk_file_op_with_handle_complete file_op_with_handle;
+ spdk_file_op_complete file_op;
+ spdk_file_stat_op_complete stat_op;
+ } fn;
+ void *arg;
+ sem_t *sem;
+ struct spdk_filesystem *fs;
+ struct spdk_file *file;
+ int rc;
+ struct iovec *iovs;
+ uint32_t iovcnt;
+ struct iovec iov;
+ union {
+ struct {
+ TAILQ_HEAD(, spdk_deleted_file) deleted_files;
+ } fs_load;
+ struct {
+ uint64_t length;
+ } truncate;
+ struct {
+ struct spdk_io_channel *channel;
+ void *pin_buf;
+ int is_read;
+ off_t offset;
+ size_t length;
+ uint64_t start_lba;
+ uint64_t num_lba;
+ uint32_t blocklen;
+ } rw;
+ struct {
+ const char *old_name;
+ const char *new_name;
+ } rename;
+ struct {
+ struct cache_buffer *cache_buffer;
+ uint64_t length;
+ } flush;
+ struct {
+ struct cache_buffer *cache_buffer;
+ uint64_t length;
+ uint64_t offset;
+ } readahead;
+ struct {
+ /* offset of the file when the sync request was made */
+ uint64_t offset;
+ TAILQ_ENTRY(spdk_fs_request) tailq;
+ bool xattr_in_progress;
+ /* length written to the xattr for this file - this should
+ * always be the same as the offset if only one thread is
+ * writing to the file, but could differ if multiple threads
+ * are appending
+ */
+ uint64_t length;
+ } sync;
+ struct {
+ uint32_t num_clusters;
+ } resize;
+ struct {
+ const char *name;
+ uint32_t flags;
+ TAILQ_ENTRY(spdk_fs_request) tailq;
+ } open;
+ struct {
+ const char *name;
+ struct spdk_blob *blob;
+ } create;
+ struct {
+ const char *name;
+ } delete;
+ struct {
+ const char *name;
+ } stat;
+ } op;
+};
+
+static void file_free(struct spdk_file *file);
+static void fs_io_device_unregister(struct spdk_filesystem *fs);
+static void fs_free_io_channels(struct spdk_filesystem *fs);
+
+void
+spdk_fs_opts_init(struct spdk_blobfs_opts *opts)
+{
+ opts->cluster_sz = SPDK_BLOBFS_DEFAULT_OPTS_CLUSTER_SZ;
+}
+
+static int _blobfs_cache_pool_reclaim(void *arg);
+
+static bool
+blobfs_cache_pool_need_reclaim(void)
+{
+ size_t count;
+
+ count = spdk_mempool_count(g_cache_pool);
+ /* We use an aggressive policy here because requests from db_bench arrive in batches; start the poller
+ * when the number of available cache buffers is less than 1/5 of the total buffers.
+ */
+ if (count > (size_t)g_fs_cache_size / CACHE_BUFFER_SIZE / 5) {
+ return false;
+ }
+
+ return true;
+}
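+
+/* Illustration with hypothetical numbers (CACHE_BUFFER_SIZE is defined
+ * elsewhere): with the default 4 GiB cache and, say, 256 KiB cache buffers,
+ * g_cache_pool would hold 16384 buffers, and reclaim starts once the number of
+ * available buffers drops to 16384 / 5 = 3276 or below.
+ */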
+
+static void
+__start_cache_pool_mgmt(void *ctx)
+{
+ assert(g_cache_pool == NULL);
+
+ g_cache_pool = spdk_mempool_create("spdk_fs_cache",
+ g_fs_cache_size / CACHE_BUFFER_SIZE,
+ CACHE_BUFFER_SIZE,
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (!g_cache_pool) {
+ SPDK_ERRLOG("Create mempool failed, you may "
+ "increase the memory and try again\n");
+ assert(false);
+ }
+ TAILQ_INIT(&g_caches);
+
+ assert(g_cache_pool_mgmt_poller == NULL);
+ g_cache_pool_mgmt_poller = SPDK_POLLER_REGISTER(_blobfs_cache_pool_reclaim, NULL,
+ BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US);
+}
+
+static void
+__stop_cache_pool_mgmt(void *ctx)
+{
+ spdk_poller_unregister(&g_cache_pool_mgmt_poller);
+
+ assert(g_cache_pool != NULL);
+ assert(spdk_mempool_count(g_cache_pool) == g_fs_cache_size / CACHE_BUFFER_SIZE);
+ spdk_mempool_free(g_cache_pool);
+ g_cache_pool = NULL;
+
+ spdk_thread_exit(g_cache_pool_thread);
+}
+
+static void
+initialize_global_cache(void)
+{
+ pthread_mutex_lock(&g_cache_init_lock);
+ if (g_fs_count == 0) {
+ g_cache_pool_thread = spdk_thread_create("cache_pool_mgmt", NULL);
+ assert(g_cache_pool_thread != NULL);
+ spdk_thread_send_msg(g_cache_pool_thread, __start_cache_pool_mgmt, NULL);
+ }
+ g_fs_count++;
+ pthread_mutex_unlock(&g_cache_init_lock);
+}
+
+static void
+free_global_cache(void)
+{
+ pthread_mutex_lock(&g_cache_init_lock);
+ g_fs_count--;
+ if (g_fs_count == 0) {
+ spdk_thread_send_msg(g_cache_pool_thread, __stop_cache_pool_mgmt, NULL);
+ }
+ pthread_mutex_unlock(&g_cache_init_lock);
+}
+
+static uint64_t
+__file_get_blob_size(struct spdk_file *file)
+{
+ uint64_t cluster_sz;
+
+ cluster_sz = file->fs->bs_opts.cluster_sz;
+ return cluster_sz * spdk_blob_get_num_clusters(file->blob);
+}
+
+struct spdk_fs_request {
+ struct spdk_fs_cb_args args;
+ TAILQ_ENTRY(spdk_fs_request) link;
+ struct spdk_fs_channel *channel;
+};
+
+struct spdk_fs_channel {
+ struct spdk_fs_request *req_mem;
+ TAILQ_HEAD(, spdk_fs_request) reqs;
+ sem_t sem;
+ struct spdk_filesystem *fs;
+ struct spdk_io_channel *bs_channel;
+ fs_send_request_fn send_request;
+ bool sync;
+ uint32_t outstanding_reqs;
+ pthread_spinlock_t lock;
+};
+
+/* For now, this is effectively an alias. But eventually we'll shift
+ * some data members over. */
+struct spdk_fs_thread_ctx {
+ struct spdk_fs_channel ch;
+};
+
+static struct spdk_fs_request *
+alloc_fs_request_with_iov(struct spdk_fs_channel *channel, uint32_t iovcnt)
+{
+ struct spdk_fs_request *req;
+ struct iovec *iovs = NULL;
+
+ if (iovcnt > 1) {
+ iovs = calloc(iovcnt, sizeof(struct iovec));
+ if (!iovs) {
+ return NULL;
+ }
+ }
+
+ if (channel->sync) {
+ pthread_spin_lock(&channel->lock);
+ }
+
+ req = TAILQ_FIRST(&channel->reqs);
+ if (req) {
+ channel->outstanding_reqs++;
+ TAILQ_REMOVE(&channel->reqs, req, link);
+ }
+
+ if (channel->sync) {
+ pthread_spin_unlock(&channel->lock);
+ }
+
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate req on spdk_fs_channel =%p\n", channel);
+ free(iovs);
+ return NULL;
+ }
+ memset(req, 0, sizeof(*req));
+ req->channel = channel;
+ if (iovcnt > 1) {
+ req->args.iovs = iovs;
+ } else {
+ req->args.iovs = &req->args.iov;
+ }
+ req->args.iovcnt = iovcnt;
+
+ return req;
+}
+
+static struct spdk_fs_request *
+alloc_fs_request(struct spdk_fs_channel *channel)
+{
+ return alloc_fs_request_with_iov(channel, 0);
+}
+
+static void
+free_fs_request(struct spdk_fs_request *req)
+{
+ struct spdk_fs_channel *channel = req->channel;
+
+ if (req->args.iovcnt > 1) {
+ free(req->args.iovs);
+ }
+
+ if (channel->sync) {
+ pthread_spin_lock(&channel->lock);
+ }
+
+ TAILQ_INSERT_HEAD(&req->channel->reqs, req, link);
+ channel->outstanding_reqs--;
+
+ if (channel->sync) {
+ pthread_spin_unlock(&channel->lock);
+ }
+}
+
+static int
+fs_channel_create(struct spdk_filesystem *fs, struct spdk_fs_channel *channel,
+ uint32_t max_ops)
+{
+ uint32_t i;
+
+ channel->req_mem = calloc(max_ops, sizeof(struct spdk_fs_request));
+ if (!channel->req_mem) {
+ return -1;
+ }
+
+ channel->outstanding_reqs = 0;
+ TAILQ_INIT(&channel->reqs);
+ sem_init(&channel->sem, 0, 0);
+
+ for (i = 0; i < max_ops; i++) {
+ TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link);
+ }
+
+ channel->fs = fs;
+
+ return 0;
+}
+
+static int
+fs_md_channel_create(void *io_device, void *ctx_buf)
+{
+ struct spdk_filesystem *fs;
+ struct spdk_fs_channel *channel = ctx_buf;
+
+ fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, md_target);
+
+ return fs_channel_create(fs, channel, fs->md_target.max_ops);
+}
+
+static int
+fs_sync_channel_create(void *io_device, void *ctx_buf)
+{
+ struct spdk_filesystem *fs;
+ struct spdk_fs_channel *channel = ctx_buf;
+
+ fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, sync_target);
+
+ return fs_channel_create(fs, channel, fs->sync_target.max_ops);
+}
+
+static int
+fs_io_channel_create(void *io_device, void *ctx_buf)
+{
+ struct spdk_filesystem *fs;
+ struct spdk_fs_channel *channel = ctx_buf;
+
+ fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, io_target);
+
+ return fs_channel_create(fs, channel, fs->io_target.max_ops);
+}
+
+static void
+fs_channel_destroy(void *io_device, void *ctx_buf)
+{
+ struct spdk_fs_channel *channel = ctx_buf;
+
+ if (channel->outstanding_reqs > 0) {
+ SPDK_ERRLOG("channel freed with %" PRIu32 " outstanding requests!\n",
+ channel->outstanding_reqs);
+ }
+
+ free(channel->req_mem);
+ if (channel->bs_channel != NULL) {
+ spdk_bs_free_io_channel(channel->bs_channel);
+ }
+}
+
+static void
+__send_request_direct(fs_request_fn fn, void *arg)
+{
+ fn(arg);
+}
+
+static void
+common_fs_bs_init(struct spdk_filesystem *fs, struct spdk_blob_store *bs)
+{
+ fs->bs = bs;
+ fs->bs_opts.cluster_sz = spdk_bs_get_cluster_size(bs);
+ fs->md_target.md_fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs);
+ fs->md_target.md_fs_channel->send_request = __send_request_direct;
+ fs->sync_target.sync_fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs);
+ fs->sync_target.sync_fs_channel->send_request = __send_request_direct;
+
+ initialize_global_cache();
+}
+
+static void
+init_cb(void *ctx, struct spdk_blob_store *bs, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_filesystem *fs = args->fs;
+
+ if (bserrno == 0) {
+ common_fs_bs_init(fs, bs);
+ } else {
+ free(fs);
+ fs = NULL;
+ }
+
+ args->fn.fs_op_with_handle(args->arg, fs, bserrno);
+ free_fs_request(req);
+}
+
+static void
+fs_conf_parse(void)
+{
+ struct spdk_conf_section *sp;
+ int cache_buffer_shift;
+
+ sp = spdk_conf_find_section(NULL, "Blobfs");
+ if (sp == NULL) {
+ g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT;
+ return;
+ }
+
+ cache_buffer_shift = spdk_conf_section_get_intval(sp, "CacheBufferShift");
+ if (cache_buffer_shift <= 0) {
+ g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT;
+ } else {
+ g_fs_cache_buffer_shift = cache_buffer_shift;
+ }
+}
+
+static struct spdk_filesystem *
+fs_alloc(struct spdk_bs_dev *dev, fs_send_request_fn send_request_fn)
+{
+ struct spdk_filesystem *fs;
+
+ fs = calloc(1, sizeof(*fs));
+ if (fs == NULL) {
+ return NULL;
+ }
+
+ fs->bdev = dev;
+ fs->send_request = send_request_fn;
+ TAILQ_INIT(&fs->files);
+
+ fs->md_target.max_ops = 512;
+ spdk_io_device_register(&fs->md_target, fs_md_channel_create, fs_channel_destroy,
+ sizeof(struct spdk_fs_channel), "blobfs_md");
+ fs->md_target.md_io_channel = spdk_get_io_channel(&fs->md_target);
+ fs->md_target.md_fs_channel = spdk_io_channel_get_ctx(fs->md_target.md_io_channel);
+
+ fs->sync_target.max_ops = 512;
+ spdk_io_device_register(&fs->sync_target, fs_sync_channel_create, fs_channel_destroy,
+ sizeof(struct spdk_fs_channel), "blobfs_sync");
+ fs->sync_target.sync_io_channel = spdk_get_io_channel(&fs->sync_target);
+ fs->sync_target.sync_fs_channel = spdk_io_channel_get_ctx(fs->sync_target.sync_io_channel);
+
+ fs->io_target.max_ops = 512;
+ spdk_io_device_register(&fs->io_target, fs_io_channel_create, fs_channel_destroy,
+ sizeof(struct spdk_fs_channel), "blobfs_io");
+
+ return fs;
+}
+
+static void
+__wake_caller(void *arg, int fserrno)
+{
+ struct spdk_fs_cb_args *args = arg;
+
+ args->rc = fserrno;
+ sem_post(args->sem);
+}
+
+void
+spdk_fs_init(struct spdk_bs_dev *dev, struct spdk_blobfs_opts *opt,
+ fs_send_request_fn send_request_fn,
+ spdk_fs_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_filesystem *fs;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+ struct spdk_bs_opts opts = {};
+
+ fs = fs_alloc(dev, send_request_fn);
+ if (fs == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ fs_conf_parse();
+
+ req = alloc_fs_request(fs->md_target.md_fs_channel);
+ if (req == NULL) {
+ fs_free_io_channels(fs);
+ fs_io_device_unregister(fs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->fn.fs_op_with_handle = cb_fn;
+ args->arg = cb_arg;
+ args->fs = fs;
+
+ spdk_bs_opts_init(&opts);
+ snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), SPDK_BLOBFS_SIGNATURE);
+ if (opt) {
+ opts.cluster_sz = opt->cluster_sz;
+ }
+ spdk_bs_init(dev, &opts, init_cb, req);
+}
+
+static struct spdk_file *
+file_alloc(struct spdk_filesystem *fs)
+{
+ struct spdk_file *file;
+
+ file = calloc(1, sizeof(*file));
+ if (file == NULL) {
+ return NULL;
+ }
+
+ file->tree = calloc(1, sizeof(*file->tree));
+ if (file->tree == NULL) {
+ free(file);
+ return NULL;
+ }
+
+ if (pthread_spin_init(&file->lock, 0)) {
+ free(file->tree);
+ free(file);
+ return NULL;
+ }
+
+ file->fs = fs;
+ TAILQ_INIT(&file->open_requests);
+ TAILQ_INIT(&file->sync_requests);
+ TAILQ_INSERT_TAIL(&fs->files, file, tailq);
+ file->priority = SPDK_FILE_PRIORITY_LOW;
+ return file;
+}
+
+static void fs_load_done(void *ctx, int bserrno);
+
+static int
+_handle_deleted_files(struct spdk_fs_request *req)
+{
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_filesystem *fs = args->fs;
+
+ if (!TAILQ_EMPTY(&args->op.fs_load.deleted_files)) {
+ struct spdk_deleted_file *deleted_file;
+
+ deleted_file = TAILQ_FIRST(&args->op.fs_load.deleted_files);
+ TAILQ_REMOVE(&args->op.fs_load.deleted_files, deleted_file, tailq);
+ spdk_bs_delete_blob(fs->bs, deleted_file->id, fs_load_done, req);
+ free(deleted_file);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void
+fs_load_done(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_filesystem *fs = args->fs;
+
+ /* The filesystem has been loaded. Now check if there are any files that
+ * were marked for deletion before last unload. Do not complete the
+ * fs_load callback until all of them have been deleted on disk.
+ */
+ if (_handle_deleted_files(req) == 0) {
+ /* We found a file that's been marked for deleting but not actually
+ * deleted yet. This function will get called again once the delete
+ * operation is completed.
+ */
+ return;
+ }
+
+ args->fn.fs_op_with_handle(args->arg, fs, 0);
+ free_fs_request(req);
+
+}
+
+static void
+_file_build_trace_arg_name(struct spdk_file *f)
+{
+ f->trace_arg_name = 0;
+ memcpy(&f->trace_arg_name, f->name,
+ spdk_min(sizeof(f->trace_arg_name), strlen(f->name)));
+}
+
+static void
+iter_cb(void *ctx, struct spdk_blob *blob, int rc)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_filesystem *fs = args->fs;
+ uint64_t *length;
+ const char *name;
+ uint32_t *is_deleted;
+ size_t value_len;
+
+ if (rc < 0) {
+ args->fn.fs_op_with_handle(args->arg, fs, rc);
+ free_fs_request(req);
+ return;
+ }
+
+ rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&name, &value_len);
+ if (rc < 0) {
+ args->fn.fs_op_with_handle(args->arg, fs, rc);
+ free_fs_request(req);
+ return;
+ }
+
+ rc = spdk_blob_get_xattr_value(blob, "length", (const void **)&length, &value_len);
+ if (rc < 0) {
+ args->fn.fs_op_with_handle(args->arg, fs, rc);
+ free_fs_request(req);
+ return;
+ }
+
+ assert(value_len == 8);
+
+ /* The file may have been deleted previously without being closed (e.g. the app crashed), so delete it now */
+ rc = spdk_blob_get_xattr_value(blob, "is_deleted", (const void **)&is_deleted, &value_len);
+ if (rc < 0) {
+ struct spdk_file *f;
+
+ f = file_alloc(fs);
+ if (f == NULL) {
+ SPDK_ERRLOG("Cannot allocate file to handle deleted file on disk\n");
+ args->fn.fs_op_with_handle(args->arg, fs, -ENOMEM);
+ free_fs_request(req);
+ return;
+ }
+
+ f->name = strdup(name);
+ _file_build_trace_arg_name(f);
+ f->blobid = spdk_blob_get_id(blob);
+ f->length = *length;
+ f->length_flushed = *length;
+ f->length_xattr = *length;
+ f->append_pos = *length;
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "added file %s length=%ju\n", f->name, f->length);
+ } else {
+ struct spdk_deleted_file *deleted_file;
+
+ deleted_file = calloc(1, sizeof(*deleted_file));
+ if (deleted_file == NULL) {
+ args->fn.fs_op_with_handle(args->arg, fs, -ENOMEM);
+ free_fs_request(req);
+ return;
+ }
+ deleted_file->id = spdk_blob_get_id(blob);
+ TAILQ_INSERT_TAIL(&args->op.fs_load.deleted_files, deleted_file, tailq);
+ }
+}
+
+static void
+load_cb(void *ctx, struct spdk_blob_store *bs, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_filesystem *fs = args->fs;
+ struct spdk_bs_type bstype;
+ static const struct spdk_bs_type blobfs_type = {SPDK_BLOBFS_SIGNATURE};
+ static const struct spdk_bs_type zeros;
+
+ if (bserrno != 0) {
+ args->fn.fs_op_with_handle(args->arg, NULL, bserrno);
+ free_fs_request(req);
+ fs_free_io_channels(fs);
+ fs_io_device_unregister(fs);
+ return;
+ }
+
+ bstype = spdk_bs_get_bstype(bs);
+
+ if (!memcmp(&bstype, &zeros, sizeof(bstype))) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "assigning bstype\n");
+ spdk_bs_set_bstype(bs, blobfs_type);
+ } else if (memcmp(&bstype, &blobfs_type, sizeof(bstype))) {
+ SPDK_ERRLOG("not blobfs\n");
+ SPDK_LOGDUMP(SPDK_LOG_BLOBFS, "bstype", &bstype, sizeof(bstype));
+ args->fn.fs_op_with_handle(args->arg, NULL, -EINVAL);
+ free_fs_request(req);
+ fs_free_io_channels(fs);
+ fs_io_device_unregister(fs);
+ return;
+ }
+
+ common_fs_bs_init(fs, bs);
+ fs_load_done(req, 0);
+}
+
+static void
+fs_io_device_unregister(struct spdk_filesystem *fs)
+{
+ assert(fs != NULL);
+ spdk_io_device_unregister(&fs->md_target, NULL);
+ spdk_io_device_unregister(&fs->sync_target, NULL);
+ spdk_io_device_unregister(&fs->io_target, NULL);
+ free(fs);
+}
+
+static void
+fs_free_io_channels(struct spdk_filesystem *fs)
+{
+ assert(fs != NULL);
+ spdk_fs_free_io_channel(fs->md_target.md_io_channel);
+ spdk_fs_free_io_channel(fs->sync_target.sync_io_channel);
+}
+
+void
+spdk_fs_load(struct spdk_bs_dev *dev, fs_send_request_fn send_request_fn,
+ spdk_fs_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_filesystem *fs;
+ struct spdk_fs_cb_args *args;
+ struct spdk_fs_request *req;
+ struct spdk_bs_opts bs_opts;
+
+ fs = fs_alloc(dev, send_request_fn);
+ if (fs == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ fs_conf_parse();
+
+ req = alloc_fs_request(fs->md_target.md_fs_channel);
+ if (req == NULL) {
+ fs_free_io_channels(fs);
+ fs_io_device_unregister(fs);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->fn.fs_op_with_handle = cb_fn;
+ args->arg = cb_arg;
+ args->fs = fs;
+ TAILQ_INIT(&args->op.fs_load.deleted_files);
+ spdk_bs_opts_init(&bs_opts);
+ bs_opts.iter_cb_fn = iter_cb;
+ bs_opts.iter_cb_arg = req;
+ spdk_bs_load(dev, &bs_opts, load_cb, req);
+}
+
+static void
+unload_cb(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_filesystem *fs = args->fs;
+ struct spdk_file *file, *tmp;
+
+ TAILQ_FOREACH_SAFE(file, &fs->files, tailq, tmp) {
+ TAILQ_REMOVE(&fs->files, file, tailq);
+ file_free(file);
+ }
+
+ free_global_cache();
+
+ args->fn.fs_op(args->arg, bserrno);
+ free(req);
+
+ fs_io_device_unregister(fs);
+}
+
+void
+spdk_fs_unload(struct spdk_filesystem *fs, spdk_fs_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ /*
+ * We must free the md_channel before unloading the blobstore, so just
+ * allocate this request from the general heap.
+ */
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->fn.fs_op = cb_fn;
+ args->arg = cb_arg;
+ args->fs = fs;
+
+ fs_free_io_channels(fs);
+ spdk_bs_unload(fs->bs, unload_cb, req);
+}
+
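+/* Look up a file by name with a linear scan of the filesystem's file list. */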
+static struct spdk_file *
+fs_find_file(struct spdk_filesystem *fs, const char *name)
+{
+ struct spdk_file *file;
+
+ TAILQ_FOREACH(file, &fs->files, tailq) {
+ if (!strncmp(name, file->name, SPDK_FILE_NAME_MAX)) {
+ return file;
+ }
+ }
+
+ return NULL;
+}
+
+void
+spdk_fs_file_stat_async(struct spdk_filesystem *fs, const char *name,
+ spdk_file_stat_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_file_stat stat;
+ struct spdk_file *f = NULL;
+
+ if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) {
+ cb_fn(cb_arg, NULL, -ENAMETOOLONG);
+ return;
+ }
+
+ f = fs_find_file(fs, name);
+ if (f != NULL) {
+ stat.blobid = f->blobid;
+ stat.size = f->append_pos >= f->length ? f->append_pos : f->length;
+ cb_fn(cb_arg, &stat, 0);
+ return;
+ }
+
+ cb_fn(cb_arg, NULL, -ENOENT);
+}
+
+static void
+__copy_stat(void *arg, struct spdk_file_stat *stat, int fserrno)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ args->rc = fserrno;
+ if (fserrno == 0) {
+ memcpy(args->arg, stat, sizeof(*stat));
+ }
+ sem_post(args->sem);
+}
+
+static void
+__file_stat(void *arg)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ spdk_fs_file_stat_async(args->fs, args->op.stat.name,
+ args->fn.stat_op, req);
+}
+
+int
+spdk_fs_file_stat(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx,
+ const char *name, struct spdk_file_stat *stat)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_request *req;
+ int rc;
+
+ req = alloc_fs_request(channel);
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate stat req on file=%s\n", name);
+ return -ENOMEM;
+ }
+
+ req->args.fs = fs;
+ req->args.op.stat.name = name;
+ req->args.fn.stat_op = __copy_stat;
+ req->args.arg = stat;
+ req->args.sem = &channel->sem;
+ channel->send_request(__file_stat, req);
+ sem_wait(&channel->sem);
+
+ rc = req->args.rc;
+ free_fs_request(req);
+
+ return rc;
+}
+
+static void
+fs_create_blob_close_cb(void *ctx, int bserrno)
+{
+ int rc;
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ rc = args->rc ? args->rc : bserrno;
+ args->fn.file_op(args->arg, rc);
+ free_fs_request(req);
+}
+
+static void
+fs_create_blob_resize_cb(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *f = args->file;
+ struct spdk_blob *blob = args->op.create.blob;
+ uint64_t length = 0;
+
+ args->rc = bserrno;
+ if (bserrno) {
+ spdk_blob_close(blob, fs_create_blob_close_cb, args);
+ return;
+ }
+
+ spdk_blob_set_xattr(blob, "name", f->name, strlen(f->name) + 1);
+ spdk_blob_set_xattr(blob, "length", &length, sizeof(length));
+
+ spdk_blob_close(blob, fs_create_blob_close_cb, args);
+}
+
+static void
+fs_create_blob_open_cb(void *ctx, struct spdk_blob *blob, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ if (bserrno) {
+ args->fn.file_op(args->arg, bserrno);
+ free_fs_request(req);
+ return;
+ }
+
+ args->op.create.blob = blob;
+ spdk_blob_resize(blob, 1, fs_create_blob_resize_cb, req);
+}
+
+static void
+fs_create_blob_create_cb(void *ctx, spdk_blob_id blobid, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *f = args->file;
+
+ if (bserrno) {
+ args->fn.file_op(args->arg, bserrno);
+ free_fs_request(req);
+ return;
+ }
+
+ f->blobid = blobid;
+ spdk_bs_open_blob(f->fs->bs, blobid, fs_create_blob_open_cb, req);
+}
+
+void
+spdk_fs_create_file_async(struct spdk_filesystem *fs, const char *name,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_file *file;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) {
+ cb_fn(cb_arg, -ENAMETOOLONG);
+ return;
+ }
+
+ file = fs_find_file(fs, name);
+ if (file != NULL) {
+ cb_fn(cb_arg, -EEXIST);
+ return;
+ }
+
+ file = file_alloc(fs);
+ if (file == NULL) {
+ SPDK_ERRLOG("Cannot allocate new file for creation\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ req = alloc_fs_request(fs->md_target.md_fs_channel);
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate create async req for file=%s\n", name);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->file = file;
+ args->fn.file_op = cb_fn;
+ args->arg = cb_arg;
+
+ file->name = strdup(name);
+ _file_build_trace_arg_name(file);
+ spdk_bs_create_blob(fs->bs, fs_create_blob_create_cb, args);
+}
+
+static void
+__fs_create_file_done(void *arg, int fserrno)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ __wake_caller(args, fserrno);
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.create.name);
+}
+
+static void
+__fs_create_file(void *arg)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.create.name);
+ spdk_fs_create_file_async(args->fs, args->op.create.name, __fs_create_file_done, req);
+}
+
+int
+spdk_fs_create_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx, const char *name)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name);
+
+ req = alloc_fs_request(channel);
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate req to create file=%s\n", name);
+ return -ENOMEM;
+ }
+
+ args = &req->args;
+ args->fs = fs;
+ args->op.create.name = name;
+ args->sem = &channel->sem;
+ fs->send_request(__fs_create_file, req);
+ sem_wait(&channel->sem);
+ rc = args->rc;
+ free_fs_request(req);
+
+ return rc;
+}
+
+static void
+fs_open_blob_done(void *ctx, struct spdk_blob *blob, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *f = args->file;
+
+ f->blob = blob;
+ while (!TAILQ_EMPTY(&f->open_requests)) {
+ req = TAILQ_FIRST(&f->open_requests);
+ args = &req->args;
+ TAILQ_REMOVE(&f->open_requests, req, args.op.open.tailq);
+ spdk_trace_record(TRACE_BLOBFS_OPEN, 0, 0, 0, f->trace_arg_name);
+ args->fn.file_op_with_handle(args->arg, f, bserrno);
+ free_fs_request(req);
+ }
+}
+
+static void
+fs_open_blob_create_cb(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *file = args->file;
+ struct spdk_filesystem *fs = args->fs;
+
+ if (file == NULL) {
+ /*
+ * This is from an open with CREATE flag - the file
+ * is now created so look it up in the file list for this
+ * filesystem.
+ */
+ file = fs_find_file(fs, args->op.open.name);
+ assert(file != NULL);
+ args->file = file;
+ }
+
+ file->ref_count++;
+ TAILQ_INSERT_TAIL(&file->open_requests, req, args.op.open.tailq);
+ if (file->ref_count == 1) {
+ assert(file->blob == NULL);
+ spdk_bs_open_blob(fs->bs, file->blobid, fs_open_blob_done, req);
+ } else if (file->blob != NULL) {
+ fs_open_blob_done(req, file->blob, 0);
+ } else {
+ /*
+ * The blob open for this file is in progress due to a previous
+ * open request. When that open completes, it will invoke the
+ * open callback for this request.
+ */
+ }
+}
+
+void
+spdk_fs_open_file_async(struct spdk_filesystem *fs, const char *name, uint32_t flags,
+ spdk_file_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_file *f = NULL;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) {
+ cb_fn(cb_arg, NULL, -ENAMETOOLONG);
+ return;
+ }
+
+ f = fs_find_file(fs, name);
+ if (f == NULL && !(flags & SPDK_BLOBFS_OPEN_CREATE)) {
+ cb_fn(cb_arg, NULL, -ENOENT);
+ return;
+ }
+
+ if (f != NULL && f->is_deleted == true) {
+ cb_fn(cb_arg, NULL, -ENOENT);
+ return;
+ }
+
+ req = alloc_fs_request(fs->md_target.md_fs_channel);
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate async open req for file=%s\n", name);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->fn.file_op_with_handle = cb_fn;
+ args->arg = cb_arg;
+ args->file = f;
+ args->fs = fs;
+ args->op.open.name = name;
+
+ if (f == NULL) {
+ spdk_fs_create_file_async(fs, name, fs_open_blob_create_cb, req);
+ } else {
+ fs_open_blob_create_cb(req, 0);
+ }
+}
+
+static void
+__fs_open_file_done(void *arg, struct spdk_file *file, int bserrno)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ args->file = file;
+ __wake_caller(args, bserrno);
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.open.name);
+}
+
+static void
+__fs_open_file(void *arg)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.open.name);
+ spdk_fs_open_file_async(args->fs, args->op.open.name, args->op.open.flags,
+ __fs_open_file_done, req);
+}
+
+int
+spdk_fs_open_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx,
+ const char *name, uint32_t flags, struct spdk_file **file)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name);
+
+ req = alloc_fs_request(channel);
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate req for opening file=%s\n", name);
+ return -ENOMEM;
+ }
+
+ args = &req->args;
+ args->fs = fs;
+ args->op.open.name = name;
+ args->op.open.flags = flags;
+ args->sem = &channel->sem;
+ fs->send_request(__fs_open_file, req);
+ sem_wait(&channel->sem);
+ rc = args->rc;
+ if (rc == 0) {
+ *file = args->file;
+ } else {
+ *file = NULL;
+ }
+ free_fs_request(req);
+
+ return rc;
+}
+
+static void
+fs_rename_blob_close_cb(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ args->fn.fs_op(args->arg, bserrno);
+ free_fs_request(req);
+}
+
+static void
+fs_rename_blob_open_cb(void *ctx, struct spdk_blob *blob, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ const char *new_name = args->op.rename.new_name;
+
+ spdk_blob_set_xattr(blob, "name", new_name, strlen(new_name) + 1);
+ spdk_blob_close(blob, fs_rename_blob_close_cb, req);
+}
+
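+/* Perform the rename on the metadata path: update the in-memory name, then
+ * open the blob and rewrite its "name" xattr.
+ */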
+static void
+_fs_md_rename_file(struct spdk_fs_request *req)
+{
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *f;
+
+ f = fs_find_file(args->fs, args->op.rename.old_name);
+ if (f == NULL) {
+ args->fn.fs_op(args->arg, -ENOENT);
+ free_fs_request(req);
+ return;
+ }
+
+ free(f->name);
+ f->name = strdup(args->op.rename.new_name);
+ _file_build_trace_arg_name(f);
+ args->file = f;
+ spdk_bs_open_blob(args->fs->bs, f->blobid, fs_rename_blob_open_cb, req);
+}
+
+static void
+fs_rename_delete_done(void *arg, int fserrno)
+{
+ _fs_md_rename_file(arg);
+}
+
+void
+spdk_fs_rename_file_async(struct spdk_filesystem *fs,
+ const char *old_name, const char *new_name,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_file *f;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "old=%s new=%s\n", old_name, new_name);
+ if (strnlen(new_name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) {
+ cb_fn(cb_arg, -ENAMETOOLONG);
+ return;
+ }
+
+ req = alloc_fs_request(fs->md_target.md_fs_channel);
+ if (req == NULL) {
+		SPDK_ERRLOG("Cannot allocate async req to rename file from %s to %s\n",
+			    old_name, new_name);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->fn.fs_op = cb_fn;
+ args->fs = fs;
+ args->arg = cb_arg;
+ args->op.rename.old_name = old_name;
+ args->op.rename.new_name = new_name;
+
+ f = fs_find_file(fs, new_name);
+ if (f == NULL) {
+ _fs_md_rename_file(req);
+ return;
+ }
+
+ /*
+ * The rename overwrites an existing file. So delete the existing file, then
+ * do the actual rename.
+ */
+ spdk_fs_delete_file_async(fs, new_name, fs_rename_delete_done, req);
+}
+
+static void
+__fs_rename_file_done(void *arg, int fserrno)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ __wake_caller(args, fserrno);
+}
+
+static void
+__fs_rename_file(void *arg)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ spdk_fs_rename_file_async(args->fs, args->op.rename.old_name, args->op.rename.new_name,
+ __fs_rename_file_done, req);
+}
+
+int
+spdk_fs_rename_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx,
+ const char *old_name, const char *new_name)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+ int rc;
+
+ req = alloc_fs_request(channel);
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate rename req for file=%s\n", old_name);
+ return -ENOMEM;
+ }
+
+ args = &req->args;
+
+ args->fs = fs;
+ args->op.rename.old_name = old_name;
+ args->op.rename.new_name = new_name;
+ args->sem = &channel->sem;
+ fs->send_request(__fs_rename_file, req);
+ sem_wait(&channel->sem);
+ rc = args->rc;
+ free_fs_request(req);
+ return rc;
+}
+
+static void
+blob_delete_cb(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ args->fn.file_op(args->arg, bserrno);
+ free_fs_request(req);
+}
+
+void
+spdk_fs_delete_file_async(struct spdk_filesystem *fs, const char *name,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_file *f;
+ spdk_blob_id blobid;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name);
+
+ if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) {
+ cb_fn(cb_arg, -ENAMETOOLONG);
+ return;
+ }
+
+ f = fs_find_file(fs, name);
+ if (f == NULL) {
+		SPDK_ERRLOG("Cannot find file=%s to delete\n", name);
+ cb_fn(cb_arg, -ENOENT);
+ return;
+ }
+
+ req = alloc_fs_request(fs->md_target.md_fs_channel);
+ if (req == NULL) {
+		SPDK_ERRLOG("Cannot allocate req to delete file=%s\n", name);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->fn.file_op = cb_fn;
+ args->arg = cb_arg;
+
+ if (f->ref_count > 0) {
+ /* If the ref > 0, we mark the file as deleted and delete it when we close it. */
+ f->is_deleted = true;
+ spdk_blob_set_xattr(f->blob, "is_deleted", &f->is_deleted, sizeof(bool));
+ spdk_blob_sync_md(f->blob, blob_delete_cb, req);
+ return;
+ }
+
+ blobid = f->blobid;
+ TAILQ_REMOVE(&fs->files, f, tailq);
+
+ file_free(f);
+
+ spdk_bs_delete_blob(fs->bs, blobid, blob_delete_cb, req);
+}
+
+static uint64_t
+fs_name_to_uint64(const char *name)
+{
+ uint64_t result = 0;
+ memcpy(&result, name, spdk_min(sizeof(result), strlen(name)));
+ return result;
+}
+
+static void
+__fs_delete_file_done(void *arg, int fserrno)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ spdk_trace_record(TRACE_BLOBFS_DELETE_DONE, 0, 0, 0, fs_name_to_uint64(args->op.delete.name));
+ __wake_caller(args, fserrno);
+}
+
+static void
+__fs_delete_file(void *arg)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ spdk_trace_record(TRACE_BLOBFS_DELETE_START, 0, 0, 0, fs_name_to_uint64(args->op.delete.name));
+ spdk_fs_delete_file_async(args->fs, args->op.delete.name, __fs_delete_file_done, req);
+}
+
+int
+spdk_fs_delete_file(struct spdk_filesystem *fs, struct spdk_fs_thread_ctx *ctx,
+ const char *name)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+ int rc;
+
+ req = alloc_fs_request(channel);
+ if (req == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "Cannot allocate req to delete file=%s\n", name);
+ return -ENOMEM;
+ }
+
+ args = &req->args;
+ args->fs = fs;
+ args->op.delete.name = name;
+ args->sem = &channel->sem;
+ fs->send_request(__fs_delete_file, req);
+ sem_wait(&channel->sem);
+ rc = args->rc;
+ free_fs_request(req);
+
+ return rc;
+}
+
+spdk_fs_iter
+spdk_fs_iter_first(struct spdk_filesystem *fs)
+{
+ struct spdk_file *f;
+
+ f = TAILQ_FIRST(&fs->files);
+ return f;
+}
+
+spdk_fs_iter
+spdk_fs_iter_next(spdk_fs_iter iter)
+{
+ struct spdk_file *f = iter;
+
+ if (f == NULL) {
+ return NULL;
+ }
+
+ f = TAILQ_NEXT(f, tailq);
+ return f;
+}
+
+const char *
+spdk_file_get_name(struct spdk_file *file)
+{
+ return file->name;
+}
+
+uint64_t
+spdk_file_get_length(struct spdk_file *file)
+{
+ uint64_t length;
+
+ assert(file != NULL);
+
+ length = file->append_pos >= file->length ? file->append_pos : file->length;
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s length=0x%jx\n", file->name, length);
+ return length;
+}
+
+static void
+fs_truncate_complete_cb(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ args->fn.file_op(args->arg, bserrno);
+ free_fs_request(req);
+}
+
+static void
+fs_truncate_resize_cb(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *file = args->file;
+ uint64_t *length = &args->op.truncate.length;
+
+ if (bserrno) {
+ args->fn.file_op(args->arg, bserrno);
+ free_fs_request(req);
+ return;
+ }
+
+ spdk_blob_set_xattr(file->blob, "length", length, sizeof(*length));
+
+ file->length = *length;
+ if (file->append_pos > file->length) {
+ file->append_pos = file->length;
+ }
+
+ spdk_blob_sync_md(file->blob, fs_truncate_complete_cb, req);
+}
+
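+/* Round a byte length up to a whole number of clusters. */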
+static uint64_t
+__bytes_to_clusters(uint64_t length, uint64_t cluster_sz)
+{
+ return (length + cluster_sz - 1) / cluster_sz;
+}
+
+void
+spdk_file_truncate_async(struct spdk_file *file, uint64_t length,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_filesystem *fs;
+ size_t num_clusters;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s old=0x%jx new=0x%jx\n", file->name, file->length, length);
+ if (length == file->length) {
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ req = alloc_fs_request(file->fs->md_target.md_fs_channel);
+ if (req == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->fn.file_op = cb_fn;
+ args->arg = cb_arg;
+ args->file = file;
+ args->op.truncate.length = length;
+ fs = file->fs;
+
+ num_clusters = __bytes_to_clusters(length, fs->bs_opts.cluster_sz);
+
+ spdk_blob_resize(file->blob, num_clusters, fs_truncate_resize_cb, req);
+}
+
+static void
+__truncate(void *arg)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ spdk_file_truncate_async(args->file, args->op.truncate.length,
+ args->fn.file_op, args);
+}
+
+int
+spdk_file_truncate(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx,
+ uint64_t length)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+ int rc;
+
+ req = alloc_fs_request(channel);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ args = &req->args;
+
+ args->file = file;
+ args->op.truncate.length = length;
+ args->fn.file_op = __wake_caller;
+ args->sem = &channel->sem;
+
+ channel->send_request(__truncate, req);
+ sem_wait(&channel->sem);
+ rc = args->rc;
+ free_fs_request(req);
+
+ return rc;
+}
+
+static void
+__rw_done(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ spdk_free(args->op.rw.pin_buf);
+ args->fn.file_op(args->arg, bserrno);
+ free_fs_request(req);
+}
+
+static void
+_copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt)
+{
+ int i;
+ size_t len;
+
+ for (i = 0; i < iovcnt; i++) {
+ len = spdk_min(iovs[i].iov_len, buf_len);
+ memcpy(buf, iovs[i].iov_base, len);
+ buf += len;
+ assert(buf_len >= len);
+ buf_len -= len;
+ }
+}
+
+static void
+_copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len)
+{
+ int i;
+ size_t len;
+
+ for (i = 0; i < iovcnt; i++) {
+ len = spdk_min(iovs[i].iov_len, buf_len);
+ memcpy(iovs[i].iov_base, buf, len);
+ buf += len;
+ assert(buf_len >= len);
+ buf_len -= len;
+ }
+}
+
+static void
+__read_done(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ void *buf;
+
+ assert(req != NULL);
+ buf = (void *)((uintptr_t)args->op.rw.pin_buf + (args->op.rw.offset & (args->op.rw.blocklen - 1)));
+ if (args->op.rw.is_read) {
+ _copy_buf_to_iovs(args->iovs, args->iovcnt, buf, args->op.rw.length);
+ __rw_done(req, 0);
+ } else {
+ _copy_iovs_to_buf(buf, args->op.rw.length, args->iovs, args->iovcnt);
+ spdk_blob_io_write(args->file->blob, args->op.rw.channel,
+ args->op.rw.pin_buf,
+ args->op.rw.start_lba, args->op.rw.num_lba,
+ __rw_done, req);
+ }
+}
+
+static void
+__do_blob_read(void *ctx, int fserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ if (fserrno) {
+ __rw_done(req, fserrno);
+ return;
+ }
+ spdk_blob_io_read(args->file->blob, args->op.rw.channel,
+ args->op.rw.pin_buf,
+ args->op.rw.start_lba, args->op.rw.num_lba,
+ __read_done, req);
+}
+
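+/* Translate a byte offset/length into blobstore io_unit (LBA) terms: the
+ * starting LBA, the io_unit size, and the number of LBAs covering the range.
+ */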
+static void
+__get_page_parameters(struct spdk_file *file, uint64_t offset, uint64_t length,
+ uint64_t *start_lba, uint32_t *lba_size, uint64_t *num_lba)
+{
+ uint64_t end_lba;
+
+ *lba_size = spdk_bs_get_io_unit_size(file->fs->bs);
+ *start_lba = offset / *lba_size;
+ end_lba = (offset + length - 1) / *lba_size;
+ *num_lba = (end_lba - *start_lba + 1);
+}
+
+static bool
+__is_lba_aligned(struct spdk_file *file, uint64_t offset, uint64_t length)
+{
+ uint32_t lba_size = spdk_bs_get_io_unit_size(file->fs->bs);
+
+ if ((offset % lba_size == 0) && (length % lba_size == 0)) {
+ return true;
+ }
+
+ return false;
+}
+
+static void
+_fs_request_setup_iovs(struct spdk_fs_request *req, struct iovec *iovs, uint32_t iovcnt)
+{
+ uint32_t i;
+
+ for (i = 0; i < iovcnt; i++) {
+ req->args.iovs[i].iov_base = iovs[i].iov_base;
+ req->args.iovs[i].iov_len = iovs[i].iov_len;
+ }
+}
+
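+/* Common path for uncached reads and writes. A DMA-able bounce buffer covering
+ * the LBA-aligned range is allocated; reads copy from it into the caller's
+ * iovecs. LBA-aligned writes within the current file length are written
+ * directly, while all other writes read the covered LBAs first (extending the
+ * blob if needed) and then write back the merged data.
+ */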
+static void
+__readvwritev(struct spdk_file *file, struct spdk_io_channel *_channel,
+ struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length,
+ spdk_file_op_complete cb_fn, void *cb_arg, int is_read)
+{
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+ struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel);
+ uint64_t start_lba, num_lba, pin_buf_length;
+ uint32_t lba_size;
+
+ if (is_read && offset + length > file->length) {
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ req = alloc_fs_request_with_iov(channel, iovcnt);
+ if (req == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba);
+
+ args = &req->args;
+ args->fn.file_op = cb_fn;
+ args->arg = cb_arg;
+ args->file = file;
+ args->op.rw.channel = channel->bs_channel;
+ _fs_request_setup_iovs(req, iovs, iovcnt);
+ args->op.rw.is_read = is_read;
+ args->op.rw.offset = offset;
+ args->op.rw.blocklen = lba_size;
+
+ pin_buf_length = num_lba * lba_size;
+ args->op.rw.length = pin_buf_length;
+ args->op.rw.pin_buf = spdk_malloc(pin_buf_length, lba_size, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (args->op.rw.pin_buf == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "Failed to allocate buf for: file=%s offset=%jx length=%jx\n",
+ file->name, offset, length);
+ free_fs_request(req);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ args->op.rw.start_lba = start_lba;
+ args->op.rw.num_lba = num_lba;
+
+ if (!is_read && file->length < offset + length) {
+ spdk_file_truncate_async(file, offset + length, __do_blob_read, req);
+ } else if (!is_read && __is_lba_aligned(file, offset, length)) {
+ _copy_iovs_to_buf(args->op.rw.pin_buf, args->op.rw.length, args->iovs, args->iovcnt);
+ spdk_blob_io_write(args->file->blob, args->op.rw.channel,
+ args->op.rw.pin_buf,
+ args->op.rw.start_lba, args->op.rw.num_lba,
+ __rw_done, req);
+ } else {
+ __do_blob_read(req, 0);
+ }
+}
+
+static void
+__readwrite(struct spdk_file *file, struct spdk_io_channel *channel,
+ void *payload, uint64_t offset, uint64_t length,
+ spdk_file_op_complete cb_fn, void *cb_arg, int is_read)
+{
+ struct iovec iov;
+
+ iov.iov_base = payload;
+ iov.iov_len = (size_t)length;
+
+ __readvwritev(file, channel, &iov, 1, offset, length, cb_fn, cb_arg, is_read);
+}
+
+void
+spdk_file_write_async(struct spdk_file *file, struct spdk_io_channel *channel,
+ void *payload, uint64_t offset, uint64_t length,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ __readwrite(file, channel, payload, offset, length, cb_fn, cb_arg, 0);
+}
+
+void
+spdk_file_writev_async(struct spdk_file *file, struct spdk_io_channel *channel,
+ struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n",
+ file->name, offset, length);
+
+ __readvwritev(file, channel, iovs, iovcnt, offset, length, cb_fn, cb_arg, 0);
+}
+
+void
+spdk_file_read_async(struct spdk_file *file, struct spdk_io_channel *channel,
+ void *payload, uint64_t offset, uint64_t length,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n",
+ file->name, offset, length);
+ __readwrite(file, channel, payload, offset, length, cb_fn, cb_arg, 1);
+}
+
+void
+spdk_file_readv_async(struct spdk_file *file, struct spdk_io_channel *channel,
+ struct iovec *iovs, uint32_t iovcnt, uint64_t offset, uint64_t length,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n",
+ file->name, offset, length);
+
+ __readvwritev(file, channel, iovs, iovcnt, offset, length, cb_fn, cb_arg, 1);
+}
+
+struct spdk_io_channel *
+spdk_fs_alloc_io_channel(struct spdk_filesystem *fs)
+{
+ struct spdk_io_channel *io_channel;
+ struct spdk_fs_channel *fs_channel;
+
+ io_channel = spdk_get_io_channel(&fs->io_target);
+ fs_channel = spdk_io_channel_get_ctx(io_channel);
+ fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs);
+ fs_channel->send_request = __send_request_direct;
+
+ return io_channel;
+}
+
+void
+spdk_fs_free_io_channel(struct spdk_io_channel *channel)
+{
+ spdk_put_io_channel(channel);
+}
+
+struct spdk_fs_thread_ctx *
+spdk_fs_alloc_thread_ctx(struct spdk_filesystem *fs)
+{
+ struct spdk_fs_thread_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ return NULL;
+ }
+
+ if (pthread_spin_init(&ctx->ch.lock, 0)) {
+ free(ctx);
+ return NULL;
+ }
+
+ fs_channel_create(fs, &ctx->ch, 512);
+
+ ctx->ch.send_request = fs->send_request;
+ ctx->ch.sync = 1;
+
+ return ctx;
+}
+
+void
+spdk_fs_free_thread_ctx(struct spdk_fs_thread_ctx *ctx)
+{
+ assert(ctx->ch.sync == 1);
+
+ while (true) {
+ pthread_spin_lock(&ctx->ch.lock);
+ if (ctx->ch.outstanding_reqs == 0) {
+ pthread_spin_unlock(&ctx->ch.lock);
+ break;
+ }
+ pthread_spin_unlock(&ctx->ch.lock);
+ usleep(1000);
+ }
+
+ fs_channel_destroy(NULL, &ctx->ch);
+ free(ctx);
+}
+
+int
+spdk_fs_set_cache_size(uint64_t size_in_mb)
+{
+	/* Setting g_fs_cache_size is only permitted while the cache pool
+	 * is uninitialized or has already been freed.
+	 */
+ if (g_cache_pool != NULL) {
+ return -EPERM;
+ }
+
+ g_fs_cache_size = size_in_mb * 1024 * 1024;
+
+ return 0;
+}
+
+uint64_t
+spdk_fs_get_cache_size(void)
+{
+ return g_fs_cache_size / (1024 * 1024);
+}
+
+static void __file_flush(void *ctx);
+
+/* Try to free some cache buffers from this file.
+ */
+static int
+reclaim_cache_buffers(struct spdk_file *file)
+{
+ int rc;
+
+ BLOBFS_TRACE(file, "free=%s\n", file->name);
+
+	/* This function may be called from any thread. The file lock may
+	 * currently be held by another thread, so only try to take it here
+	 * and back off if it is contended.
+	 */
+ rc = pthread_spin_trylock(&file->lock);
+ if (rc != 0) {
+ return -1;
+ }
+
+ if (file->tree->present_mask == 0) {
+ pthread_spin_unlock(&file->lock);
+ return -1;
+ }
+ tree_free_buffers(file->tree);
+
+ TAILQ_REMOVE(&g_caches, file, cache_tailq);
+	/* If some buffers could not be freed, put the file back at the end of the queue */
+ if (file->tree->present_mask != 0) {
+ TAILQ_INSERT_TAIL(&g_caches, file, cache_tailq);
+ } else {
+ file->last = NULL;
+ }
+ pthread_spin_unlock(&file->lock);
+
+ return 0;
+}
+
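+/* Cache pool reclaim poller. Buffers are reclaimed in order of preference:
+ * low-priority files not open for writing first, then any file not open for
+ * writing, and finally any cached file.
+ */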
+static int
+_blobfs_cache_pool_reclaim(void *arg)
+{
+ struct spdk_file *file, *tmp;
+ int rc;
+
+ if (!blobfs_cache_pool_need_reclaim()) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) {
+ if (!file->open_for_writing &&
+ file->priority == SPDK_FILE_PRIORITY_LOW) {
+ rc = reclaim_cache_buffers(file);
+ if (rc < 0) {
+ continue;
+ }
+ if (!blobfs_cache_pool_need_reclaim()) {
+ return SPDK_POLLER_BUSY;
+ }
+ break;
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) {
+ if (!file->open_for_writing) {
+ rc = reclaim_cache_buffers(file);
+ if (rc < 0) {
+ continue;
+ }
+ if (!blobfs_cache_pool_need_reclaim()) {
+ return SPDK_POLLER_BUSY;
+ }
+ break;
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(file, &g_caches, cache_tailq, tmp) {
+ rc = reclaim_cache_buffers(file);
+ if (rc < 0) {
+ continue;
+ }
+ break;
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+_add_file_to_cache_pool(void *ctx)
+{
+ struct spdk_file *file = ctx;
+
+ TAILQ_INSERT_TAIL(&g_caches, file, cache_tailq);
+}
+
+static void
+_remove_file_from_cache_pool(void *ctx)
+{
+ struct spdk_file *file = ctx;
+
+ TAILQ_REMOVE(&g_caches, file, cache_tailq);
+}
+
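+/* Allocate a cache buffer for the given file offset from the global cache pool,
+ * retrying for a bounded time if the pool is exhausted, and insert it into the
+ * file's cache tree. The file's first buffer also registers it with the cache
+ * pool thread so its buffers become candidates for reclaim.
+ */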
+static struct cache_buffer *
+cache_insert_buffer(struct spdk_file *file, uint64_t offset)
+{
+ struct cache_buffer *buf;
+ int count = 0;
+ bool need_update = false;
+
+ buf = calloc(1, sizeof(*buf));
+ if (buf == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "calloc failed\n");
+ return NULL;
+ }
+
+ do {
+ buf->buf = spdk_mempool_get(g_cache_pool);
+ if (buf->buf) {
+ break;
+ }
+ if (count++ == 100) {
+ SPDK_ERRLOG("Could not allocate cache buffer for file=%p on offset=%jx\n",
+ file, offset);
+ free(buf);
+ return NULL;
+ }
+ usleep(BLOBFS_CACHE_POOL_POLL_PERIOD_IN_US);
+ } while (true);
+
+ buf->buf_size = CACHE_BUFFER_SIZE;
+ buf->offset = offset;
+
+ if (file->tree->present_mask == 0) {
+ need_update = true;
+ }
+ file->tree = tree_insert_buffer(file->tree, buf);
+
+ if (need_update) {
+ spdk_thread_send_msg(g_cache_pool_thread, _add_file_to_cache_pool, file);
+ }
+
+ return buf;
+}
+
+static struct cache_buffer *
+cache_append_buffer(struct spdk_file *file)
+{
+ struct cache_buffer *last;
+
+ assert(file->last == NULL || file->last->bytes_filled == file->last->buf_size);
+ assert((file->append_pos % CACHE_BUFFER_SIZE) == 0);
+
+ last = cache_insert_buffer(file, file->append_pos);
+ if (last == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "cache_insert_buffer failed\n");
+ return NULL;
+ }
+
+ file->last = last;
+
+ return last;
+}
+
+static void __check_sync_reqs(struct spdk_file *file);
+
+static void
+__file_cache_finish_sync(void *ctx, int bserrno)
+{
+ struct spdk_file *file;
+ struct spdk_fs_request *sync_req = ctx;
+ struct spdk_fs_cb_args *sync_args;
+
+ sync_args = &sync_req->args;
+ file = sync_args->file;
+ pthread_spin_lock(&file->lock);
+ file->length_xattr = sync_args->op.sync.length;
+ assert(sync_args->op.sync.offset <= file->length_flushed);
+ spdk_trace_record(TRACE_BLOBFS_XATTR_END, 0, sync_args->op.sync.offset,
+ 0, file->trace_arg_name);
+ BLOBFS_TRACE(file, "sync done offset=%jx\n", sync_args->op.sync.offset);
+ TAILQ_REMOVE(&file->sync_requests, sync_req, args.op.sync.tailq);
+ pthread_spin_unlock(&file->lock);
+
+ sync_args->fn.file_op(sync_args->arg, bserrno);
+
+ free_fs_request(sync_req);
+ __check_sync_reqs(file);
+}
+
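+/* Find the first pending sync request whose offset has already been flushed
+ * and, if its xattr update is not yet in progress, persist the flushed length
+ * to the blob's "length" xattr and sync the blob metadata.
+ */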
+static void
+__check_sync_reqs(struct spdk_file *file)
+{
+ struct spdk_fs_request *sync_req;
+
+ pthread_spin_lock(&file->lock);
+
+ TAILQ_FOREACH(sync_req, &file->sync_requests, args.op.sync.tailq) {
+ if (sync_req->args.op.sync.offset <= file->length_flushed) {
+ break;
+ }
+ }
+
+ if (sync_req != NULL && !sync_req->args.op.sync.xattr_in_progress) {
+ BLOBFS_TRACE(file, "set xattr length 0x%jx\n", file->length_flushed);
+ sync_req->args.op.sync.xattr_in_progress = true;
+ sync_req->args.op.sync.length = file->length_flushed;
+ spdk_blob_set_xattr(file->blob, "length", &file->length_flushed,
+ sizeof(file->length_flushed));
+
+ pthread_spin_unlock(&file->lock);
+ spdk_trace_record(TRACE_BLOBFS_XATTR_START, 0, file->length_flushed,
+ 0, file->trace_arg_name);
+ spdk_blob_sync_md(file->blob, __file_cache_finish_sync, sync_req);
+ } else {
+ pthread_spin_unlock(&file->lock);
+ }
+}
+
+static void
+__file_flush_done(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *file = args->file;
+ struct cache_buffer *next = args->op.flush.cache_buffer;
+
+ BLOBFS_TRACE(file, "length=%jx\n", args->op.flush.length);
+
+ pthread_spin_lock(&file->lock);
+ next->in_progress = false;
+ next->bytes_flushed += args->op.flush.length;
+ file->length_flushed += args->op.flush.length;
+ if (file->length_flushed > file->length) {
+ file->length = file->length_flushed;
+ }
+ if (next->bytes_flushed == next->buf_size) {
+ BLOBFS_TRACE(file, "write buffer fully flushed 0x%jx\n", file->length_flushed);
+ next = tree_find_buffer(file->tree, file->length_flushed);
+ }
+
+ /*
+ * Assert that there is no cached data that extends past the end of the underlying
+ * blob.
+ */
+ assert(next == NULL || next->offset < __file_get_blob_size(file) ||
+ next->bytes_filled == 0);
+
+ pthread_spin_unlock(&file->lock);
+
+ __check_sync_reqs(file);
+
+ __file_flush(req);
+}
+
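+/* Flush the next cache buffer at length_flushed to the blob. A partially filled
+ * buffer is only flushed when a sync request is outstanding. Each completed
+ * write re-enters this function via __file_flush_done until nothing is left to
+ * flush, at which point pending sync requests are checked.
+ */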
+static void
+__file_flush(void *ctx)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *file = args->file;
+ struct cache_buffer *next;
+ uint64_t offset, length, start_lba, num_lba;
+ uint32_t lba_size;
+
+ pthread_spin_lock(&file->lock);
+ next = tree_find_buffer(file->tree, file->length_flushed);
+ if (next == NULL || next->in_progress ||
+ ((next->bytes_filled < next->buf_size) && TAILQ_EMPTY(&file->sync_requests))) {
+ /*
+ * There is either no data to flush, a flush I/O is already in
+ * progress, or the next buffer is partially filled but there's no
+ * outstanding request to sync it.
+ * So return immediately - if a flush I/O is in progress we will flush
+ * more data after that is completed, or a partial buffer will get flushed
+ * when it is either filled or the file is synced.
+ */
+ free_fs_request(req);
+ if (next == NULL) {
+ /*
+ * For cases where a file's cache was evicted, and then the
+ * file was later appended, we will write the data directly
+ * to disk and bypass cache. So just update length_flushed
+ * here to reflect that all data was already written to disk.
+ */
+ file->length_flushed = file->append_pos;
+ }
+ pthread_spin_unlock(&file->lock);
+ if (next == NULL) {
+ /*
+ * There is no data to flush, but we still need to check for any
+ * outstanding sync requests to make sure metadata gets updated.
+ */
+ __check_sync_reqs(file);
+ }
+ return;
+ }
+
+ offset = next->offset + next->bytes_flushed;
+ length = next->bytes_filled - next->bytes_flushed;
+ if (length == 0) {
+ free_fs_request(req);
+ pthread_spin_unlock(&file->lock);
+ /*
+ * There is no data to flush, but we still need to check for any
+ * outstanding sync requests to make sure metadata gets updated.
+ */
+ __check_sync_reqs(file);
+ return;
+ }
+ args->op.flush.length = length;
+ args->op.flush.cache_buffer = next;
+
+ __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba);
+
+ next->in_progress = true;
+ BLOBFS_TRACE(file, "offset=0x%jx length=0x%jx page start=0x%jx num=0x%jx\n",
+ offset, length, start_lba, num_lba);
+ pthread_spin_unlock(&file->lock);
+ spdk_blob_io_write(file->blob, file->fs->sync_target.sync_fs_channel->bs_channel,
+ next->buf + (start_lba * lba_size) - next->offset,
+ start_lba, num_lba, __file_flush_done, req);
+}
+
+static void
+__file_extend_done(void *arg, int bserrno)
+{
+ struct spdk_fs_cb_args *args = arg;
+
+ __wake_caller(args, bserrno);
+}
+
+static void
+__file_extend_resize_cb(void *_args, int bserrno)
+{
+ struct spdk_fs_cb_args *args = _args;
+ struct spdk_file *file = args->file;
+
+ if (bserrno) {
+ __wake_caller(args, bserrno);
+ return;
+ }
+
+ spdk_blob_sync_md(file->blob, __file_extend_done, args);
+}
+
+static void
+__file_extend_blob(void *_args)
+{
+ struct spdk_fs_cb_args *args = _args;
+ struct spdk_file *file = args->file;
+
+ spdk_blob_resize(file->blob, args->op.resize.num_clusters, __file_extend_resize_cb, args);
+}
+
+static void
+__rw_from_file_done(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+
+ __wake_caller(&req->args, bserrno);
+ free_fs_request(req);
+}
+
+static void
+__rw_from_file(void *ctx)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *file = args->file;
+
+ if (args->op.rw.is_read) {
+ spdk_file_read_async(file, file->fs->sync_target.sync_io_channel, args->iovs[0].iov_base,
+ args->op.rw.offset, (uint64_t)args->iovs[0].iov_len,
+ __rw_from_file_done, req);
+ } else {
+ spdk_file_write_async(file, file->fs->sync_target.sync_io_channel, args->iovs[0].iov_base,
+ args->op.rw.offset, (uint64_t)args->iovs[0].iov_len,
+ __rw_from_file_done, req);
+ }
+}
+
+static int
+__send_rw_from_file(struct spdk_file *file, void *payload,
+ uint64_t offset, uint64_t length, bool is_read,
+ struct spdk_fs_channel *channel)
+{
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ req = alloc_fs_request_with_iov(channel, 1);
+ if (req == NULL) {
+ sem_post(&channel->sem);
+ return -ENOMEM;
+ }
+
+ args = &req->args;
+ args->file = file;
+ args->sem = &channel->sem;
+ args->iovs[0].iov_base = payload;
+ args->iovs[0].iov_len = (size_t)length;
+ args->op.rw.offset = offset;
+ args->op.rw.is_read = is_read;
+ file->fs->send_request(__rw_from_file, req);
+ return 0;
+}
+
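+/* Synchronous, append-only write. The payload is copied into cache buffers under
+ * the file lock, resizing the blob first if the write extends past its current
+ * size; a flush request is dispatched only once at least one buffer has been
+ * completely filled. If no cache buffer is available, the data is written to the
+ * blob directly via __send_rw_from_file.
+ */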
+int
+spdk_file_write(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx,
+ void *payload, uint64_t offset, uint64_t length)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_request *flush_req;
+ uint64_t rem_length, copy, blob_size, cluster_sz;
+ uint32_t cache_buffers_filled = 0;
+ uint8_t *cur_payload;
+ struct cache_buffer *last;
+
+ BLOBFS_TRACE_RW(file, "offset=%jx length=%jx\n", offset, length);
+
+ if (length == 0) {
+ return 0;
+ }
+
+ if (offset != file->append_pos) {
+ BLOBFS_TRACE(file, " error offset=%jx append_pos=%jx\n", offset, file->append_pos);
+ return -EINVAL;
+ }
+
+ pthread_spin_lock(&file->lock);
+ file->open_for_writing = true;
+
+ if ((file->last == NULL) && (file->append_pos % CACHE_BUFFER_SIZE == 0)) {
+ cache_append_buffer(file);
+ }
+
+ if (file->last == NULL) {
+ int rc;
+
+ file->append_pos += length;
+ pthread_spin_unlock(&file->lock);
+ rc = __send_rw_from_file(file, payload, offset, length, false, channel);
+ sem_wait(&channel->sem);
+ return rc;
+ }
+
+ blob_size = __file_get_blob_size(file);
+
+ if ((offset + length) > blob_size) {
+ struct spdk_fs_cb_args extend_args = {};
+
+ cluster_sz = file->fs->bs_opts.cluster_sz;
+ extend_args.sem = &channel->sem;
+ extend_args.op.resize.num_clusters = __bytes_to_clusters((offset + length), cluster_sz);
+ extend_args.file = file;
+ BLOBFS_TRACE(file, "start resize to %u clusters\n", extend_args.op.resize.num_clusters);
+ pthread_spin_unlock(&file->lock);
+ file->fs->send_request(__file_extend_blob, &extend_args);
+ sem_wait(&channel->sem);
+ if (extend_args.rc) {
+ return extend_args.rc;
+ }
+ }
+
+ flush_req = alloc_fs_request(channel);
+ if (flush_req == NULL) {
+ pthread_spin_unlock(&file->lock);
+ return -ENOMEM;
+ }
+
+ last = file->last;
+ rem_length = length;
+ cur_payload = payload;
+ while (rem_length > 0) {
+ copy = last->buf_size - last->bytes_filled;
+ if (copy > rem_length) {
+ copy = rem_length;
+ }
+ BLOBFS_TRACE_RW(file, " fill offset=%jx length=%jx\n", file->append_pos, copy);
+ memcpy(&last->buf[last->bytes_filled], cur_payload, copy);
+ file->append_pos += copy;
+ if (file->length < file->append_pos) {
+ file->length = file->append_pos;
+ }
+ cur_payload += copy;
+ last->bytes_filled += copy;
+ rem_length -= copy;
+ if (last->bytes_filled == last->buf_size) {
+ cache_buffers_filled++;
+ last = cache_append_buffer(file);
+ if (last == NULL) {
+ BLOBFS_TRACE(file, "nomem\n");
+ free_fs_request(flush_req);
+ pthread_spin_unlock(&file->lock);
+ return -ENOMEM;
+ }
+ }
+ }
+
+ pthread_spin_unlock(&file->lock);
+
+ if (cache_buffers_filled == 0) {
+ free_fs_request(flush_req);
+ return 0;
+ }
+
+ flush_req->args.file = file;
+ file->fs->send_request(__file_flush, flush_req);
+ return 0;
+}
+
+static void
+__readahead_done(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct cache_buffer *cache_buffer = args->op.readahead.cache_buffer;
+ struct spdk_file *file = args->file;
+
+ BLOBFS_TRACE(file, "offset=%jx\n", cache_buffer->offset);
+
+ pthread_spin_lock(&file->lock);
+ cache_buffer->bytes_filled = args->op.readahead.length;
+ cache_buffer->bytes_flushed = args->op.readahead.length;
+ cache_buffer->in_progress = false;
+ pthread_spin_unlock(&file->lock);
+
+ free_fs_request(req);
+}
+
+static void
+__readahead(void *ctx)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *file = args->file;
+ uint64_t offset, length, start_lba, num_lba;
+ uint32_t lba_size;
+
+ offset = args->op.readahead.offset;
+ length = args->op.readahead.length;
+ assert(length > 0);
+
+ __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba);
+
+ BLOBFS_TRACE(file, "offset=%jx length=%jx page start=%jx num=%jx\n",
+ offset, length, start_lba, num_lba);
+ spdk_blob_io_read(file->blob, file->fs->sync_target.sync_fs_channel->bs_channel,
+ args->op.readahead.cache_buffer->buf,
+ start_lba, num_lba, __readahead_done, req);
+}
+
+static uint64_t
+__next_cache_buffer_offset(uint64_t offset)
+{
+ return (offset + CACHE_BUFFER_SIZE) & ~(CACHE_TREE_LEVEL_MASK(0));
+}
+
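+/* Kick off readahead of the next cache-buffer-aligned range, unless it is
+ * already cached or lies past the end of the file.
+ */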
+static void
+check_readahead(struct spdk_file *file, uint64_t offset,
+ struct spdk_fs_channel *channel)
+{
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ offset = __next_cache_buffer_offset(offset);
+ if (tree_find_buffer(file->tree, offset) != NULL || file->length <= offset) {
+ return;
+ }
+
+ req = alloc_fs_request(channel);
+ if (req == NULL) {
+ return;
+ }
+ args = &req->args;
+
+ BLOBFS_TRACE(file, "offset=%jx\n", offset);
+
+ args->file = file;
+ args->op.readahead.offset = offset;
+ args->op.readahead.cache_buffer = cache_insert_buffer(file, offset);
+ if (!args->op.readahead.cache_buffer) {
+ BLOBFS_TRACE(file, "Cannot allocate buf for offset=%jx\n", offset);
+ free_fs_request(req);
+ return;
+ }
+
+ args->op.readahead.cache_buffer->in_progress = true;
+ if (file->length < (offset + CACHE_BUFFER_SIZE)) {
+ args->op.readahead.length = file->length & (CACHE_BUFFER_SIZE - 1);
+ } else {
+ args->op.readahead.length = CACHE_BUFFER_SIZE;
+ }
+ file->fs->send_request(__readahead, req);
+}
+
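+/* Synchronous read. Each cache-buffer-sized chunk is served from the cache when
+ * possible; misses are issued as asynchronous reads and waited for on the
+ * channel semaphore. Sustained sequential access (CACHE_READAHEAD_THRESHOLD)
+ * triggers readahead of the following buffers.
+ */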
+int64_t
+spdk_file_read(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx,
+ void *payload, uint64_t offset, uint64_t length)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ uint64_t final_offset, final_length;
+ uint32_t sub_reads = 0;
+ struct cache_buffer *buf;
+ uint64_t read_len;
+ int rc = 0;
+
+ pthread_spin_lock(&file->lock);
+
+ BLOBFS_TRACE_RW(file, "offset=%ju length=%ju\n", offset, length);
+
+ file->open_for_writing = false;
+
+ if (length == 0 || offset >= file->append_pos) {
+ pthread_spin_unlock(&file->lock);
+ return 0;
+ }
+
+ if (offset + length > file->append_pos) {
+ length = file->append_pos - offset;
+ }
+
+ if (offset != file->next_seq_offset) {
+ file->seq_byte_count = 0;
+ }
+ file->seq_byte_count += length;
+ file->next_seq_offset = offset + length;
+ if (file->seq_byte_count >= CACHE_READAHEAD_THRESHOLD) {
+ check_readahead(file, offset, channel);
+ check_readahead(file, offset + CACHE_BUFFER_SIZE, channel);
+ }
+
+ final_length = 0;
+ final_offset = offset + length;
+ while (offset < final_offset) {
+ length = NEXT_CACHE_BUFFER_OFFSET(offset) - offset;
+ if (length > (final_offset - offset)) {
+ length = final_offset - offset;
+ }
+
+ buf = tree_find_filled_buffer(file->tree, offset);
+ if (buf == NULL) {
+ pthread_spin_unlock(&file->lock);
+ rc = __send_rw_from_file(file, payload, offset, length, true, channel);
+ pthread_spin_lock(&file->lock);
+ if (rc == 0) {
+ sub_reads++;
+ }
+ } else {
+ read_len = length;
+ if ((offset + length) > (buf->offset + buf->bytes_filled)) {
+ read_len = buf->offset + buf->bytes_filled - offset;
+ }
+ BLOBFS_TRACE(file, "read %p offset=%ju length=%ju\n", payload, offset, read_len);
+ memcpy(payload, &buf->buf[offset - buf->offset], read_len);
+ if ((offset + read_len) % CACHE_BUFFER_SIZE == 0) {
+ tree_remove_buffer(file->tree, buf);
+ if (file->tree->present_mask == 0) {
+ spdk_thread_send_msg(g_cache_pool_thread, _remove_file_from_cache_pool, file);
+ }
+ }
+ }
+
+ if (rc == 0) {
+ final_length += length;
+ } else {
+ break;
+ }
+ payload += length;
+ offset += length;
+ }
+ pthread_spin_unlock(&file->lock);
+ while (sub_reads > 0) {
+ sem_wait(&channel->sem);
+ sub_reads--;
+ }
+ if (rc == 0) {
+ return final_length;
+ } else {
+ return rc;
+ }
+}
+
+static void
+_file_sync(struct spdk_file *file, struct spdk_fs_channel *channel,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_fs_request *sync_req;
+ struct spdk_fs_request *flush_req;
+ struct spdk_fs_cb_args *sync_args;
+ struct spdk_fs_cb_args *flush_args;
+
+ BLOBFS_TRACE(file, "offset=%jx\n", file->append_pos);
+
+ pthread_spin_lock(&file->lock);
+ if (file->append_pos <= file->length_xattr) {
+ BLOBFS_TRACE(file, "done - file already synced\n");
+ pthread_spin_unlock(&file->lock);
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ sync_req = alloc_fs_request(channel);
+ if (!sync_req) {
+ SPDK_ERRLOG("Cannot allocate sync req for file=%s\n", file->name);
+ pthread_spin_unlock(&file->lock);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ sync_args = &sync_req->args;
+
+ flush_req = alloc_fs_request(channel);
+ if (!flush_req) {
+ SPDK_ERRLOG("Cannot allocate flush req for file=%s\n", file->name);
+ free_fs_request(sync_req);
+ pthread_spin_unlock(&file->lock);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ flush_args = &flush_req->args;
+
+ sync_args->file = file;
+ sync_args->fn.file_op = cb_fn;
+ sync_args->arg = cb_arg;
+ sync_args->op.sync.offset = file->append_pos;
+ sync_args->op.sync.xattr_in_progress = false;
+ TAILQ_INSERT_TAIL(&file->sync_requests, sync_req, args.op.sync.tailq);
+ pthread_spin_unlock(&file->lock);
+
+ flush_args->file = file;
+ channel->send_request(__file_flush, flush_req);
+}
+
+int
+spdk_file_sync(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_cb_args args = {};
+
+ args.sem = &channel->sem;
+ _file_sync(file, channel, __wake_caller, &args);
+ sem_wait(&channel->sem);
+
+ return args.rc;
+}
+
+void
+spdk_file_sync_async(struct spdk_file *file, struct spdk_io_channel *_channel,
+ spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel);
+
+ _file_sync(file, channel, cb_fn, cb_arg);
+}
+
+void
+spdk_file_set_priority(struct spdk_file *file, uint32_t priority)
+{
+ BLOBFS_TRACE(file, "priority=%u\n", priority);
+	file->priority = priority;
+}
+
+/*
+ * Close routines
+ */
+
+static void
+__file_close_async_done(void *ctx, int bserrno)
+{
+ struct spdk_fs_request *req = ctx;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *file = args->file;
+
+ spdk_trace_record(TRACE_BLOBFS_CLOSE, 0, 0, 0, file->trace_arg_name);
+
+ if (file->is_deleted) {
+ spdk_fs_delete_file_async(file->fs, file->name, blob_delete_cb, ctx);
+ return;
+ }
+
+ args->fn.file_op(args->arg, bserrno);
+ free_fs_request(req);
+}
+
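+/* Drop one reference to the file. Only the final close actually closes the
+ * underlying blob; a file marked is_deleted is then removed from the blobstore
+ * in __file_close_async_done.
+ */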
+static void
+__file_close_async(struct spdk_file *file, struct spdk_fs_request *req)
+{
+ struct spdk_blob *blob;
+
+ pthread_spin_lock(&file->lock);
+ if (file->ref_count == 0) {
+ pthread_spin_unlock(&file->lock);
+ __file_close_async_done(req, -EBADF);
+ return;
+ }
+
+ file->ref_count--;
+ if (file->ref_count > 0) {
+ pthread_spin_unlock(&file->lock);
+ req->args.fn.file_op(req->args.arg, 0);
+ free_fs_request(req);
+ return;
+ }
+
+ pthread_spin_unlock(&file->lock);
+
+ blob = file->blob;
+ file->blob = NULL;
+ spdk_blob_close(blob, __file_close_async_done, req);
+}
+
+static void
+__file_close_async__sync_done(void *arg, int fserrno)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+
+ __file_close_async(args->file, req);
+}
+
+void
+spdk_file_close_async(struct spdk_file *file, spdk_file_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ req = alloc_fs_request(file->fs->md_target.md_fs_channel);
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate close async req for file=%s\n", file->name);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ args = &req->args;
+ args->file = file;
+ args->fn.file_op = cb_fn;
+ args->arg = cb_arg;
+
+ spdk_file_sync_async(file, file->fs->md_target.md_io_channel, __file_close_async__sync_done, req);
+}
+
+static void
+__file_close(void *arg)
+{
+ struct spdk_fs_request *req = arg;
+ struct spdk_fs_cb_args *args = &req->args;
+ struct spdk_file *file = args->file;
+
+ __file_close_async(file, req);
+}
+
+int
+spdk_file_close(struct spdk_file *file, struct spdk_fs_thread_ctx *ctx)
+{
+ struct spdk_fs_channel *channel = (struct spdk_fs_channel *)ctx;
+ struct spdk_fs_request *req;
+ struct spdk_fs_cb_args *args;
+
+ req = alloc_fs_request(channel);
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot allocate close req for file=%s\n", file->name);
+ return -ENOMEM;
+ }
+
+ args = &req->args;
+
+ spdk_file_sync(file, ctx);
+ BLOBFS_TRACE(file, "name=%s\n", file->name);
+ args->file = file;
+ args->sem = &channel->sem;
+ args->fn.file_op = __wake_caller;
+ args->arg = args;
+ channel->send_request(__file_close, req);
+ sem_wait(&channel->sem);
+
+ return args->rc;
+}
+
+int
+spdk_file_get_id(struct spdk_file *file, void *id, size_t size)
+{
+ if (size < sizeof(spdk_blob_id)) {
+ return -EINVAL;
+ }
+
+ memcpy(id, &file->blobid, sizeof(spdk_blob_id));
+
+ return sizeof(spdk_blob_id);
+}
+
+static void
+_file_free(void *ctx)
+{
+ struct spdk_file *file = ctx;
+
+ TAILQ_REMOVE(&g_caches, file, cache_tailq);
+
+ free(file->name);
+ free(file->tree);
+ free(file);
+}
+
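+/* Free a file. If its cache tree still holds buffers, release them and defer
+ * the final free to the cache pool thread so the file can also be removed from
+ * g_caches.
+ */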
+static void
+file_free(struct spdk_file *file)
+{
+ BLOBFS_TRACE(file, "free=%s\n", file->name);
+ pthread_spin_lock(&file->lock);
+ if (file->tree->present_mask == 0) {
+ pthread_spin_unlock(&file->lock);
+ free(file->name);
+ free(file->tree);
+ free(file);
+ return;
+ }
+
+ tree_free_buffers(file->tree);
+ assert(file->tree->present_mask == 0);
+ spdk_thread_send_msg(g_cache_pool_thread, _file_free, file);
+ pthread_spin_unlock(&file->lock);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("blobfs", SPDK_LOG_BLOBFS)
+SPDK_LOG_REGISTER_COMPONENT("blobfs_rw", SPDK_LOG_BLOBFS_RW)
diff --git a/src/spdk/lib/blobfs/spdk_blobfs.map b/src/spdk/lib/blobfs/spdk_blobfs.map
new file mode 100644
index 000000000..91c02f61e
--- /dev/null
+++ b/src/spdk/lib/blobfs/spdk_blobfs.map
@@ -0,0 +1,45 @@
+{
+ global:
+
+ # public functions
+ spdk_fs_opts_init;
+ spdk_fs_init;
+ spdk_fs_load;
+ spdk_fs_unload;
+ spdk_fs_alloc_io_channel;
+ spdk_fs_free_io_channel;
+ spdk_fs_alloc_thread_ctx;
+ spdk_fs_free_thread_ctx;
+ spdk_fs_file_stat;
+ spdk_fs_create_file;
+ spdk_fs_open_file;
+ spdk_file_close;
+ spdk_fs_rename_file;
+ spdk_fs_delete_file;
+ spdk_fs_iter_first;
+ spdk_fs_iter_next;
+ spdk_file_truncate;
+ spdk_file_get_name;
+ spdk_file_get_length;
+ spdk_file_write;
+ spdk_file_read;
+ spdk_fs_set_cache_size;
+ spdk_fs_get_cache_size;
+ spdk_file_set_priority;
+ spdk_file_sync;
+ spdk_file_get_id;
+ spdk_file_readv_async;
+ spdk_file_writev_async;
+ spdk_fs_file_stat_async;
+ spdk_fs_create_file_async;
+ spdk_fs_open_file_async;
+ spdk_file_close_async;
+ spdk_fs_rename_file_async;
+ spdk_fs_delete_file_async;
+ spdk_file_truncate_async;
+ spdk_file_write_async;
+ spdk_file_read_async;
+ spdk_file_sync_async;
+
+ local: *;
+};
diff --git a/src/spdk/lib/blobfs/tree.c b/src/spdk/lib/blobfs/tree.c
new file mode 100644
index 000000000..32779766f
--- /dev/null
+++ b/src/spdk/lib/blobfs/tree.c
@@ -0,0 +1,181 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/blobfs.h"
+#include "tree.h"
+
+#include "spdk/queue.h"
+#include "spdk/assert.h"
+#include "spdk/env.h"
+#include "spdk_internal/log.h"
+
+uint32_t g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT;
+
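+/* Walk down the cache tree one level at a time, indexing by the high-order
+ * offset bits for that level, until the leaf (level 0) buffer slot is reached.
+ */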
+struct cache_buffer *
+tree_find_buffer(struct cache_tree *tree, uint64_t offset)
+{
+ uint64_t index;
+
+ while (tree != NULL) {
+ index = offset / CACHE_TREE_LEVEL_SIZE(tree->level);
+ if (index >= CACHE_TREE_WIDTH) {
+ return NULL;
+ }
+ if (tree->level == 0) {
+ return tree->u.buffer[index];
+ } else {
+ offset &= CACHE_TREE_LEVEL_MASK(tree->level);
+ tree = tree->u.tree[index];
+ }
+ }
+
+ return NULL;
+}
+
+struct cache_buffer *
+tree_find_filled_buffer(struct cache_tree *tree, uint64_t offset)
+{
+ struct cache_buffer *buf;
+
+ buf = tree_find_buffer(tree, offset);
+ if (buf != NULL && buf->bytes_filled > 0) {
+ return buf;
+ } else {
+ return NULL;
+ }
+}
+
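+/* Insert a buffer into the cache tree, adding levels above the root until it
+ * spans buffer->offset and allocating intermediate nodes on the way down.
+ * Returns the (possibly new) root.
+ */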
+struct cache_tree *
+tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer)
+{
+ struct cache_tree *tree;
+ uint64_t index, offset;
+
+ offset = buffer->offset;
+ while (offset >= CACHE_TREE_LEVEL_SIZE(root->level + 1)) {
+ if (root->present_mask != 0) {
+ tree = calloc(1, sizeof(*tree));
+ tree->level = root->level + 1;
+ tree->u.tree[0] = root;
+ root = tree;
+ root->present_mask = 0x1ULL;
+ } else {
+ root->level++;
+ }
+ }
+
+ tree = root;
+ while (tree->level > 0) {
+ index = offset / CACHE_TREE_LEVEL_SIZE(tree->level);
+ assert(index < CACHE_TREE_WIDTH);
+ offset &= CACHE_TREE_LEVEL_MASK(tree->level);
+ if (tree->u.tree[index] == NULL) {
+ tree->u.tree[index] = calloc(1, sizeof(*tree));
+ tree->u.tree[index]->level = tree->level - 1;
+ tree->present_mask |= (1ULL << index);
+ }
+ tree = tree->u.tree[index];
+ }
+
+ index = offset / CACHE_BUFFER_SIZE;
+ assert(index < CACHE_TREE_WIDTH);
+ assert(tree->u.buffer[index] == NULL);
+ tree->u.buffer[index] = buffer;
+ tree->present_mask |= (1ULL << index);
+ return root;
+}
+
+void
+tree_remove_buffer(struct cache_tree *tree, struct cache_buffer *buffer)
+{
+ struct cache_tree *child;
+ uint64_t index;
+
+ index = CACHE_TREE_INDEX(tree->level, buffer->offset);
+
+ if (tree->level == 0) {
+ assert(tree->u.buffer[index] != NULL);
+ assert(buffer == tree->u.buffer[index]);
+ tree->present_mask &= ~(1ULL << index);
+ tree->u.buffer[index] = NULL;
+ cache_buffer_free(buffer);
+ return;
+ }
+
+ child = tree->u.tree[index];
+ assert(child != NULL);
+ tree_remove_buffer(child, buffer);
+ if (child->present_mask == 0) {
+ tree->present_mask &= ~(1ULL << index);
+ tree->u.tree[index] = NULL;
+ free(child);
+ }
+}
+
+void
+tree_free_buffers(struct cache_tree *tree)
+{
+ struct cache_buffer *buffer;
+ struct cache_tree *child;
+ uint32_t i;
+
+ if (tree->present_mask == 0) {
+ return;
+ }
+
+ if (tree->level == 0) {
+ for (i = 0; i < CACHE_TREE_WIDTH; i++) {
+ buffer = tree->u.buffer[i];
+ if (buffer != NULL && buffer->in_progress == false &&
+ buffer->bytes_filled == buffer->bytes_flushed) {
+ cache_buffer_free(buffer);
+ tree->u.buffer[i] = NULL;
+ tree->present_mask &= ~(1ULL << i);
+ }
+ }
+ } else {
+ for (i = 0; i < CACHE_TREE_WIDTH; i++) {
+ child = tree->u.tree[i];
+ if (child != NULL) {
+ tree_free_buffers(child);
+ if (child->present_mask == 0) {
+ free(child);
+ tree->u.tree[i] = NULL;
+ tree->present_mask &= ~(1ULL << i);
+ }
+ }
+ }
+ }
+}
diff --git a/src/spdk/lib/blobfs/tree.h b/src/spdk/lib/blobfs/tree.h
new file mode 100644
index 000000000..71df71090
--- /dev/null
+++ b/src/spdk/lib/blobfs/tree.h
@@ -0,0 +1,77 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_TREE_H_
+#define SPDK_TREE_H_
+
+struct cache_buffer {
+ uint8_t *buf;
+ uint64_t offset;
+ uint32_t buf_size;
+ uint32_t bytes_filled;
+ uint32_t bytes_flushed;
+ bool in_progress;
+};
+
+extern uint32_t g_fs_cache_buffer_shift;
+
+#define CACHE_BUFFER_SHIFT_DEFAULT 18
+#define CACHE_BUFFER_SIZE (1U << g_fs_cache_buffer_shift)
+#define NEXT_CACHE_BUFFER_OFFSET(offset) \
+	((((offset) + CACHE_BUFFER_SIZE) >> g_fs_cache_buffer_shift) << g_fs_cache_buffer_shift)
+
+#define CACHE_TREE_SHIFT 6
+#define CACHE_TREE_WIDTH (1U << CACHE_TREE_SHIFT)
+#define CACHE_TREE_LEVEL_SHIFT(level) (g_fs_cache_buffer_shift + (level) * CACHE_TREE_SHIFT)
+#define CACHE_TREE_LEVEL_SIZE(level) (1ULL << CACHE_TREE_LEVEL_SHIFT(level))
+#define CACHE_TREE_LEVEL_MASK(level) (CACHE_TREE_LEVEL_SIZE(level) - 1)
+#define CACHE_TREE_INDEX(level, offset)	(((offset) >> CACHE_TREE_LEVEL_SHIFT(level)) & (CACHE_TREE_WIDTH - 1))
+
+struct cache_tree {
+ uint8_t level;
+ uint64_t present_mask;
+ union {
+ struct cache_buffer *buffer[CACHE_TREE_WIDTH];
+ struct cache_tree *tree[CACHE_TREE_WIDTH];
+ } u;
+};
+
+void cache_buffer_free(struct cache_buffer *cache_buffer);
+
+struct cache_tree *tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer);
+void tree_free_buffers(struct cache_tree *tree);
+struct cache_buffer *tree_find_buffer(struct cache_tree *tree, uint64_t offset);
+struct cache_buffer *tree_find_filled_buffer(struct cache_tree *tree, uint64_t offset);
+void tree_remove_buffer(struct cache_tree *tree, struct cache_buffer *buffer);
+
+#endif /* SPDK_TREE_H_ */
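For orientation, the cache tree declared above is a 64-way radix tree keyed by file offset: with the default 256 KiB buffers (CACHE_BUFFER_SHIFT_DEFAULT = 18) and CACHE_TREE_SHIFT = 6, a level-0 node spans 16 MiB and a level-1 node spans 1 GiB. The standalone sketch below copies the shift constants (rather than including tree.h) purely to illustrate how CACHE_TREE_INDEX() decomposes an offset into per-level child indices; it is not part of SPDK.

#include <inttypes.h>
#include <stdio.h>

/* Illustrative copies of the tree.h constants (default buffer shift 18, 64-way fan-out). */
#define BUF_SHIFT          18
#define TREE_SHIFT         6
#define TREE_WIDTH         (1U << TREE_SHIFT)
#define LEVEL_SHIFT(level) (BUF_SHIFT + (level) * TREE_SHIFT)

int main(void)
{
	/* An offset that lands in child 3 of a level-1 node and child 5 of that level-0 node. */
	uint64_t offset = (3ULL << LEVEL_SHIFT(1)) + (5ULL << LEVEL_SHIFT(0)) + 1234;
	int level;

	for (level = 1; level >= 0; level--) {
		uint64_t index = (offset >> LEVEL_SHIFT(level)) & (TREE_WIDTH - 1);
		printf("level %d -> child index %" PRIu64 "\n", level, index); /* prints 3, then 5 */
	}
	return 0;
}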
diff --git a/src/spdk/lib/conf/Makefile b/src/spdk/lib/conf/Makefile
new file mode 100644
index 000000000..09966ea12
--- /dev/null
+++ b/src/spdk/lib/conf/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 1
+
+C_SRCS = conf.c
+LIBNAME = conf
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_conf.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/conf/conf.c b/src/spdk/lib/conf/conf.c
new file mode 100644
index 000000000..287e157a5
--- /dev/null
+++ b/src/spdk/lib/conf/conf.c
@@ -0,0 +1,704 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/conf.h"
+#include "spdk/string.h"
+#include "spdk/log.h"
+
+struct spdk_conf_value {
+ struct spdk_conf_value *next;
+ char *value;
+};
+
+struct spdk_conf_item {
+ struct spdk_conf_item *next;
+ char *key;
+ struct spdk_conf_value *val;
+};
+
+struct spdk_conf_section {
+ struct spdk_conf_section *next;
+ char *name;
+ int num;
+ struct spdk_conf_item *item;
+};
+
+struct spdk_conf {
+ char *file;
+ struct spdk_conf_section *current_section;
+ struct spdk_conf_section *section;
+ bool merge_sections;
+};
+
+#define CF_DELIM " \t"
+#define CF_DELIM_KEY " \t="
+
+#define LIB_MAX_TMPBUF 1024
+
+static struct spdk_conf *default_config = NULL;
+
+struct spdk_conf *
+spdk_conf_allocate(void)
+{
+ struct spdk_conf *ret = calloc(1, sizeof(struct spdk_conf));
+
+ if (ret) {
+ ret->merge_sections = true;
+ }
+
+ return ret;
+}
+
+static void
+free_conf_value(struct spdk_conf_value *vp)
+{
+ if (vp == NULL) {
+ return;
+ }
+
+ if (vp->value) {
+ free(vp->value);
+ }
+
+ free(vp);
+}
+
+static void
+free_all_conf_value(struct spdk_conf_value *vp)
+{
+ struct spdk_conf_value *next;
+
+ if (vp == NULL) {
+ return;
+ }
+
+ while (vp != NULL) {
+ next = vp->next;
+ free_conf_value(vp);
+ vp = next;
+ }
+}
+
+static void
+free_conf_item(struct spdk_conf_item *ip)
+{
+ if (ip == NULL) {
+ return;
+ }
+
+ if (ip->val != NULL) {
+ free_all_conf_value(ip->val);
+ }
+
+ if (ip->key != NULL) {
+ free(ip->key);
+ }
+
+ free(ip);
+}
+
+static void
+free_all_conf_item(struct spdk_conf_item *ip)
+{
+ struct spdk_conf_item *next;
+
+ if (ip == NULL) {
+ return;
+ }
+
+ while (ip != NULL) {
+ next = ip->next;
+ free_conf_item(ip);
+ ip = next;
+ }
+}
+
+static void
+free_conf_section(struct spdk_conf_section *sp)
+{
+ if (sp == NULL) {
+ return;
+ }
+
+ if (sp->item) {
+ free_all_conf_item(sp->item);
+ }
+
+ if (sp->name) {
+ free(sp->name);
+ }
+
+ free(sp);
+}
+
+static void
+free_all_conf_section(struct spdk_conf_section *sp)
+{
+ struct spdk_conf_section *next;
+
+ if (sp == NULL) {
+ return;
+ }
+
+ while (sp != NULL) {
+ next = sp->next;
+ free_conf_section(sp);
+ sp = next;
+ }
+}
+
+void
+spdk_conf_free(struct spdk_conf *cp)
+{
+ if (cp == NULL) {
+ return;
+ }
+
+ if (cp->section != NULL) {
+ free_all_conf_section(cp->section);
+ }
+
+ if (cp->file != NULL) {
+ free(cp->file);
+ }
+
+ free(cp);
+}
+
+static struct spdk_conf_section *
+allocate_cf_section(void)
+{
+ return calloc(1, sizeof(struct spdk_conf_section));
+}
+
+static struct spdk_conf_item *
+allocate_cf_item(void)
+{
+ return calloc(1, sizeof(struct spdk_conf_item));
+}
+
+static struct spdk_conf_value *
+allocate_cf_value(void)
+{
+ return calloc(1, sizeof(struct spdk_conf_value));
+}
+
+#define CHECK_CP_OR_USE_DEFAULT(cp) (((cp) == NULL) && (default_config != NULL)) ? default_config : (cp)
+
+struct spdk_conf_section *
+spdk_conf_find_section(struct spdk_conf *cp, const char *name)
+{
+ struct spdk_conf_section *sp;
+
+ if (name == NULL || name[0] == '\0') {
+ return NULL;
+ }
+
+ cp = CHECK_CP_OR_USE_DEFAULT(cp);
+ if (cp == NULL) {
+ return NULL;
+ }
+
+ for (sp = cp->section; sp != NULL; sp = sp->next) {
+ if (sp->name != NULL && sp->name[0] == name[0]
+ && strcasecmp(sp->name, name) == 0) {
+ return sp;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_conf_section *
+spdk_conf_first_section(struct spdk_conf *cp)
+{
+ cp = CHECK_CP_OR_USE_DEFAULT(cp);
+ if (cp == NULL) {
+ return NULL;
+ }
+
+ return cp->section;
+}
+
+struct spdk_conf_section *
+spdk_conf_next_section(struct spdk_conf_section *sp)
+{
+ if (sp == NULL) {
+ return NULL;
+ }
+
+ return sp->next;
+}
+
+static void
+append_cf_section(struct spdk_conf *cp, struct spdk_conf_section *sp)
+{
+ struct spdk_conf_section *last;
+
+ cp = CHECK_CP_OR_USE_DEFAULT(cp);
+ if (cp == NULL) {
+ SPDK_ERRLOG("cp == NULL\n");
+ return;
+ }
+
+ if (cp->section == NULL) {
+ cp->section = sp;
+ return;
+ }
+
+ for (last = cp->section; last->next != NULL; last = last->next)
+ ;
+ last->next = sp;
+}
+
+static struct spdk_conf_item *
+find_cf_nitem(struct spdk_conf_section *sp, const char *key, int idx)
+{
+ struct spdk_conf_item *ip;
+ int i;
+
+ if (key == NULL || key[0] == '\0') {
+ return NULL;
+ }
+
+ i = 0;
+ for (ip = sp->item; ip != NULL; ip = ip->next) {
+ if (ip->key != NULL && ip->key[0] == key[0]
+ && strcasecmp(ip->key, key) == 0) {
+ if (i == idx) {
+ return ip;
+ }
+ i++;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+append_cf_item(struct spdk_conf_section *sp, struct spdk_conf_item *ip)
+{
+ struct spdk_conf_item *last;
+
+ if (sp == NULL) {
+ return;
+ }
+
+ if (sp->item == NULL) {
+ sp->item = ip;
+ return;
+ }
+
+ for (last = sp->item; last->next != NULL; last = last->next)
+ ;
+ last->next = ip;
+}
+
+static void
+append_cf_value(struct spdk_conf_item *ip, struct spdk_conf_value *vp)
+{
+ struct spdk_conf_value *last;
+
+ if (ip == NULL) {
+ return;
+ }
+
+ if (ip->val == NULL) {
+ ip->val = vp;
+ return;
+ }
+
+ for (last = ip->val; last->next != NULL; last = last->next)
+ ;
+ last->next = vp;
+}
+
+bool
+spdk_conf_section_match_prefix(const struct spdk_conf_section *sp, const char *name_prefix)
+{
+ return strncasecmp(sp->name, name_prefix, strlen(name_prefix)) == 0;
+}
+
+const char *
+spdk_conf_section_get_name(const struct spdk_conf_section *sp)
+{
+ return sp->name;
+}
+
+int
+spdk_conf_section_get_num(const struct spdk_conf_section *sp)
+{
+ return sp->num;
+}
+
+char *
+spdk_conf_section_get_nmval(struct spdk_conf_section *sp, const char *key, int idx1, int idx2)
+{
+ struct spdk_conf_item *ip;
+ struct spdk_conf_value *vp;
+ int i;
+
+ ip = find_cf_nitem(sp, key, idx1);
+ if (ip == NULL) {
+ return NULL;
+ }
+
+ vp = ip->val;
+ if (vp == NULL) {
+ return NULL;
+ }
+
+ for (i = 0; vp != NULL; vp = vp->next, i++) {
+ if (i == idx2) {
+ return vp->value;
+ }
+ }
+
+ return NULL;
+}
+
+char *
+spdk_conf_section_get_nval(struct spdk_conf_section *sp, const char *key, int idx)
+{
+ struct spdk_conf_item *ip;
+ struct spdk_conf_value *vp;
+
+ ip = find_cf_nitem(sp, key, idx);
+ if (ip == NULL) {
+ return NULL;
+ }
+
+ vp = ip->val;
+ if (vp == NULL) {
+ return NULL;
+ }
+
+ return vp->value;
+}
+
+char *
+spdk_conf_section_get_val(struct spdk_conf_section *sp, const char *key)
+{
+ return spdk_conf_section_get_nval(sp, key, 0);
+}
+
+int
+spdk_conf_section_get_intval(struct spdk_conf_section *sp, const char *key)
+{
+ const char *v;
+ int value;
+
+ v = spdk_conf_section_get_nval(sp, key, 0);
+ if (v == NULL) {
+ return -1;
+ }
+
+ value = (int)spdk_strtol(v, 10);
+ return value;
+}
+
+bool
+spdk_conf_section_get_boolval(struct spdk_conf_section *sp, const char *key, bool default_val)
+{
+ const char *v;
+
+ v = spdk_conf_section_get_nval(sp, key, 0);
+ if (v == NULL) {
+ return default_val;
+ }
+
+ if (!strcasecmp(v, "Yes") || !strcasecmp(v, "Y") || !strcasecmp(v, "True")) {
+ return true;
+ }
+
+ if (!strcasecmp(v, "No") || !strcasecmp(v, "N") || !strcasecmp(v, "False")) {
+ return false;
+ }
+
+ return default_val;
+}
+
+static int
+parse_line(struct spdk_conf *cp, char *lp)
+{
+ struct spdk_conf_section *sp;
+ struct spdk_conf_item *ip;
+ struct spdk_conf_value *vp;
+ char *arg;
+ char *key;
+ char *val;
+ char *p;
+ int num;
+
+ arg = spdk_str_trim(lp);
+ if (arg == NULL) {
+ SPDK_ERRLOG("no section\n");
+ return -1;
+ }
+
+ if (arg[0] == '[') {
+ /* section */
+ arg++;
+ key = spdk_strsepq(&arg, "]");
+ if (key == NULL || arg != NULL) {
+ SPDK_ERRLOG("broken section\n");
+ return -1;
+ }
+ /* determine section number */
+ for (p = key; *p != '\0' && !isdigit((int) *p); p++)
+ ;
+ if (*p != '\0') {
+ num = (int)spdk_strtol(p, 10);
+ } else {
+ num = 0;
+ }
+
+ if (cp->merge_sections) {
+ sp = spdk_conf_find_section(cp, key);
+ } else {
+ sp = NULL;
+ }
+
+ if (sp == NULL) {
+ sp = allocate_cf_section();
+ append_cf_section(cp, sp);
+
+ sp->name = strdup(key);
+ if (sp->name == NULL) {
+ SPDK_ERRLOG("cannot duplicate %s to sp->name\n", key);
+ return -1;
+ }
+ }
+ cp->current_section = sp;
+
+ sp->num = num;
+ } else {
+ /* parameters */
+ sp = cp->current_section;
+ if (sp == NULL) {
+ SPDK_ERRLOG("unknown section\n");
+ return -1;
+ }
+ key = spdk_strsepq(&arg, CF_DELIM_KEY);
+ if (key == NULL) {
+ SPDK_ERRLOG("broken key\n");
+ return -1;
+ }
+
+ ip = allocate_cf_item();
+ if (ip == NULL) {
+ SPDK_ERRLOG("cannot allocate cf item\n");
+ return -1;
+ }
+ append_cf_item(sp, ip);
+ ip->key = strdup(key);
+ if (ip->key == NULL) {
+ SPDK_ERRLOG("cannot make duplicate of %s\n", key);
+ return -1;
+ }
+ ip->val = NULL;
+ if (arg != NULL) {
+ /* key has value(s) */
+ while (arg != NULL) {
+ val = spdk_strsepq(&arg, CF_DELIM);
+ vp = allocate_cf_value();
+ if (vp == NULL) {
+ SPDK_ERRLOG("cannot allocate cf value\n");
+ return -1;
+ }
+ append_cf_value(ip, vp);
+ vp->value = strdup(val);
+ if (vp->value == NULL) {
+ SPDK_ERRLOG("cannot duplicate %s to vp->value\n", val);
+ return -1;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static char *
+fgets_line(FILE *fp)
+{
+ char *dst, *dst2, *p;
+ size_t total, len;
+
+ dst = p = malloc(LIB_MAX_TMPBUF);
+ if (!dst) {
+ return NULL;
+ }
+
+ dst[0] = '\0';
+ total = 0;
+
+ while (fgets(p, LIB_MAX_TMPBUF, fp) != NULL) {
+ len = strlen(p);
+ total += len;
+ if (len + 1 < LIB_MAX_TMPBUF || dst[total - 1] == '\n') {
+ dst2 = realloc(dst, total + 1);
+ if (!dst2) {
+ free(dst);
+ return NULL;
+ } else {
+ return dst2;
+ }
+ }
+
+ dst2 = realloc(dst, total + LIB_MAX_TMPBUF);
+ if (!dst2) {
+ free(dst);
+ return NULL;
+ } else {
+ dst = dst2;
+ }
+
+ p = dst + total;
+ }
+
+ if (feof(fp) && total != 0) {
+ dst2 = realloc(dst, total + 2);
+ if (!dst2) {
+ free(dst);
+ return NULL;
+ } else {
+ dst = dst2;
+ }
+
+ dst[total] = '\n';
+ dst[total + 1] = '\0';
+ return dst;
+ }
+
+ free(dst);
+
+ return NULL;
+}
+
+int
+spdk_conf_read(struct spdk_conf *cp, const char *file)
+{
+ FILE *fp;
+ char *lp, *p;
+ char *lp2, *q;
+ int line;
+ int n, n2;
+
+ if (file == NULL || file[0] == '\0') {
+ return -1;
+ }
+ SPDK_ERRLOG("INI configuration has been deprecated and will be removed in a future release. Please switch to JSON-RPC.\n");
+
+ fp = fopen(file, "r");
+ if (fp == NULL) {
+ SPDK_ERRLOG("open error: %s\n", file);
+ return -1;
+ }
+
+ cp->file = strdup(file);
+ if (cp->file == NULL) {
+ SPDK_ERRLOG("cannot duplicate %s to cp->file\n", file);
+ fclose(fp);
+ return -1;
+ }
+
+ line = 1;
+ while ((lp = fgets_line(fp)) != NULL) {
+ /* skip spaces */
+ for (p = lp; *p != '\0' && isspace((int) *p); p++)
+ ;
+ /* skip comment, empty line */
+ if (p[0] == '#' || p[0] == '\0') {
+ goto next_line;
+ }
+
+ /* concatenate line end with '\' */
+ n = strlen(p);
+ while (n > 2 && p[n - 1] == '\n' && p[n - 2] == '\\') {
+ n -= 2;
+ lp2 = fgets_line(fp);
+ if (lp2 == NULL) {
+ break;
+ }
+
+ line++;
+ n2 = strlen(lp2);
+
+ q = malloc(n + n2 + 1);
+ if (!q) {
+ free(lp2);
+ free(lp);
+ SPDK_ERRLOG("malloc failed at line %d of %s\n", line, cp->file);
+ fclose(fp);
+ return -1;
+ }
+
+ memcpy(q, p, n);
+ memcpy(q + n, lp2, n2);
+ q[n + n2] = '\0';
+ free(lp2);
+ free(lp);
+ p = lp = q;
+ n += n2;
+ }
+
+ /* parse one line */
+ if (parse_line(cp, p) < 0) {
+ SPDK_ERRLOG("parse error at line %d of %s\n", line, cp->file);
+ }
+next_line:
+ line++;
+ free(lp);
+ }
+
+ fclose(fp);
+ return 0;
+}
+
+void
+spdk_conf_set_as_default(struct spdk_conf *cp)
+{
+ default_config = cp;
+}
+
+void
+spdk_conf_disable_sections_merge(struct spdk_conf *cp)
+{
+ cp->merge_sections = false;
+}
diff --git a/src/spdk/lib/conf/spdk_conf.map b/src/spdk/lib/conf/spdk_conf.map
new file mode 100644
index 000000000..0fc01c8aa
--- /dev/null
+++ b/src/spdk/lib/conf/spdk_conf.map
@@ -0,0 +1,23 @@
+{
+ global:
+
+ # Public functions
+ spdk_conf_allocate;
+ spdk_conf_free;
+ spdk_conf_read;
+ spdk_conf_find_section;
+ spdk_conf_first_section;
+ spdk_conf_next_section;
+ spdk_conf_section_match_prefix;
+ spdk_conf_section_get_name;
+ spdk_conf_section_get_num;
+ spdk_conf_section_get_nmval;
+ spdk_conf_section_get_nval;
+ spdk_conf_section_get_val;
+ spdk_conf_section_get_intval;
+ spdk_conf_section_get_boolval;
+ spdk_conf_set_as_default;
+ spdk_conf_disable_sections_merge;
+
+ local: *;
+};
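The map file above lists the whole public surface of this (deprecated) INI-style parser. Below is a rough usage sketch, assuming an application linked against the conf library and a hypothetical file my.conf with a [Global] section containing a LogLevel key; the function name load_config is made up for illustration.

#include "spdk/conf.h"
#include <stdio.h>

/* Hypothetical example: parse "my.conf" and read Global/LogLevel. */
int load_config(void)
{
	struct spdk_conf *cfg;
	struct spdk_conf_section *sp;
	int level;

	cfg = spdk_conf_allocate();
	if (cfg == NULL) {
		return -1;
	}

	if (spdk_conf_read(cfg, "my.conf") != 0) {
		spdk_conf_free(cfg);
		return -1;
	}

	sp = spdk_conf_find_section(cfg, "Global");
	if (sp != NULL) {
		/* Returns -1 if the key is missing (see spdk_conf_section_get_intval above). */
		level = spdk_conf_section_get_intval(sp, "LogLevel");
		printf("LogLevel = %d\n", level);
	}

	spdk_conf_free(cfg);
	return 0;
}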
diff --git a/src/spdk/lib/env_dpdk/Makefile b/src/spdk/lib/env_dpdk/Makefile
new file mode 100644
index 000000000..11433fe86
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/Makefile
@@ -0,0 +1,47 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 5
+SO_MINOR := 0
+
+CFLAGS += $(ENV_CFLAGS)
+C_SRCS = env.c memory.c pci.c init.c threads.c
+C_SRCS += pci_ioat.c pci_virtio.c pci_vmd.c pci_idxd.c
+LIBNAME = env_dpdk
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_env_dpdk.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/env_dpdk/env.c b/src/spdk/lib/env_dpdk/env.c
new file mode 100644
index 000000000..94b709de9
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env.c
@@ -0,0 +1,451 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/util.h"
+#include "spdk/env_dpdk.h"
+
+#include "env_internal.h"
+
+#include <rte_config.h>
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_mempool.h>
+#include <rte_memzone.h>
+#include <rte_version.h>
+
+static uint64_t
+virt_to_phys(void *vaddr)
+{
+ uint64_t ret;
+
+ ret = rte_malloc_virt2iova(vaddr);
+ if (ret != RTE_BAD_IOVA) {
+ return ret;
+ }
+
+ return spdk_vtophys(vaddr, NULL);
+}
+
+void *
+spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags)
+{
+ void *buf;
+
+ if (flags == 0) {
+ return NULL;
+ }
+
+ align = spdk_max(align, RTE_CACHE_LINE_SIZE);
+ buf = rte_malloc_socket(NULL, size, align, socket_id);
+ if (buf && phys_addr) {
+#ifdef DEBUG
+ fprintf(stderr, "phys_addr param in spdk_*malloc() is deprecated\n");
+#endif
+ *phys_addr = virt_to_phys(buf);
+ }
+ return buf;
+}
+
+void *
+spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags)
+{
+ void *buf = spdk_malloc(size, align, phys_addr, socket_id, flags);
+ if (buf) {
+ memset(buf, 0, size);
+ }
+ return buf;
+}
+
+void *
+spdk_realloc(void *buf, size_t size, size_t align)
+{
+ align = spdk_max(align, RTE_CACHE_LINE_SIZE);
+ return rte_realloc(buf, size, align);
+}
+
+void
+spdk_free(void *buf)
+{
+ rte_free(buf);
+}
+
+void *
+spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id)
+{
+ return spdk_malloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE));
+}
+
+void *
+spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id)
+{
+ return spdk_zmalloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE));
+}
+
+void *
+spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr)
+{
+ return spdk_dma_malloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY);
+}
+
+void *
+spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr)
+{
+ return spdk_dma_zmalloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY);
+}
+
+void *
+spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr)
+{
+ void *new_buf;
+
+ align = spdk_max(align, RTE_CACHE_LINE_SIZE);
+ new_buf = rte_realloc(buf, size, align);
+ if (new_buf && phys_addr) {
+ *phys_addr = virt_to_phys(new_buf);
+ }
+ return new_buf;
+}
+
+void
+spdk_dma_free(void *buf)
+{
+ spdk_free(buf);
+}
+
+void *
+spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id,
+ unsigned flags, unsigned align)
+{
+ const struct rte_memzone *mz;
+ unsigned dpdk_flags = 0;
+
+ if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) {
+ dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG;
+ }
+
+ if (socket_id == SPDK_ENV_SOCKET_ID_ANY) {
+ socket_id = SOCKET_ID_ANY;
+ }
+
+ mz = rte_memzone_reserve_aligned(name, len, socket_id, dpdk_flags, align);
+
+ if (mz != NULL) {
+ memset(mz->addr, 0, len);
+ return mz->addr;
+ } else {
+ return NULL;
+ }
+}
+
+void *
+spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags)
+{
+ return spdk_memzone_reserve_aligned(name, len, socket_id, flags,
+ RTE_CACHE_LINE_SIZE);
+}
+
+void *
+spdk_memzone_lookup(const char *name)
+{
+ const struct rte_memzone *mz = rte_memzone_lookup(name);
+
+ if (mz != NULL) {
+ return mz->addr;
+ } else {
+ return NULL;
+ }
+}
+
+int
+spdk_memzone_free(const char *name)
+{
+ const struct rte_memzone *mz = rte_memzone_lookup(name);
+
+ if (mz != NULL) {
+ return rte_memzone_free(mz);
+ }
+
+ return -1;
+}
+
+void
+spdk_memzone_dump(FILE *f)
+{
+ rte_memzone_dump(f);
+}
+
+struct spdk_mempool *
+spdk_mempool_create_ctor(const char *name, size_t count,
+ size_t ele_size, size_t cache_size, int socket_id,
+ spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg)
+{
+ struct rte_mempool *mp;
+ size_t tmp;
+
+ if (socket_id == SPDK_ENV_SOCKET_ID_ANY) {
+ socket_id = SOCKET_ID_ANY;
+ }
+
+ /* No more than half of all elements can be in cache */
+ tmp = (count / 2) / rte_lcore_count();
+ if (cache_size > tmp) {
+ cache_size = tmp;
+ }
+
+ if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) {
+ cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE;
+ }
+
+ mp = rte_mempool_create(name, count, ele_size, cache_size,
+ 0, NULL, NULL, (rte_mempool_obj_cb_t *)obj_init, obj_init_arg,
+ socket_id, MEMPOOL_F_NO_PHYS_CONTIG);
+
+ return (struct spdk_mempool *)mp;
+}
+
+struct spdk_mempool *
+spdk_mempool_create(const char *name, size_t count,
+ size_t ele_size, size_t cache_size, int socket_id)
+{
+ return spdk_mempool_create_ctor(name, count, ele_size, cache_size, socket_id,
+ NULL, NULL);
+}
+
+char *
+spdk_mempool_get_name(struct spdk_mempool *mp)
+{
+ return ((struct rte_mempool *)mp)->name;
+}
+
+void
+spdk_mempool_free(struct spdk_mempool *mp)
+{
+ rte_mempool_free((struct rte_mempool *)mp);
+}
+
+void *
+spdk_mempool_get(struct spdk_mempool *mp)
+{
+ void *ele = NULL;
+ int rc;
+
+ rc = rte_mempool_get((struct rte_mempool *)mp, &ele);
+ if (rc != 0) {
+ return NULL;
+ }
+ return ele;
+}
+
+int
+spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count)
+{
+ return rte_mempool_get_bulk((struct rte_mempool *)mp, ele_arr, count);
+}
+
+void
+spdk_mempool_put(struct spdk_mempool *mp, void *ele)
+{
+ rte_mempool_put((struct rte_mempool *)mp, ele);
+}
+
+void
+spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count)
+{
+ rte_mempool_put_bulk((struct rte_mempool *)mp, ele_arr, count);
+}
+
+size_t
+spdk_mempool_count(const struct spdk_mempool *pool)
+{
+ return rte_mempool_avail_count((struct rte_mempool *)pool);
+}
+
+uint32_t
+spdk_mempool_obj_iter(struct spdk_mempool *mp, spdk_mempool_obj_cb_t obj_cb,
+ void *obj_cb_arg)
+{
+ return rte_mempool_obj_iter((struct rte_mempool *)mp, (rte_mempool_obj_cb_t *)obj_cb,
+ obj_cb_arg);
+}
+
+struct spdk_mempool *
+spdk_mempool_lookup(const char *name)
+{
+ return (struct spdk_mempool *)rte_mempool_lookup(name);
+}
+
+bool
+spdk_process_is_primary(void)
+{
+ return (rte_eal_process_type() == RTE_PROC_PRIMARY);
+}
+
+uint64_t
+spdk_get_ticks(void)
+{
+	return rte_get_timer_cycles();
+}
+
+uint64_t
+spdk_get_ticks_hz(void)
+{
+	return rte_get_timer_hz();
+}
+
+void
+spdk_delay_us(unsigned int us)
+{
+	rte_delay_us(us);
+}
+
+void
+spdk_pause(void)
+{
+	rte_pause();
+}
+
+void
+spdk_unaffinitize_thread(void)
+{
+ rte_cpuset_t new_cpuset, orig_cpuset;
+ long num_cores, i, orig_num_cores;
+
+ CPU_ZERO(&new_cpuset);
+
+ num_cores = sysconf(_SC_NPROCESSORS_CONF);
+
+ /* Create a mask containing all CPUs */
+ for (i = 0; i < num_cores; i++) {
+ CPU_SET(i, &new_cpuset);
+ }
+
+ rte_thread_get_affinity(&orig_cpuset);
+ orig_num_cores = CPU_COUNT(&orig_cpuset);
+ if (orig_num_cores < num_cores) {
+ for (i = 0; i < orig_num_cores; i++) {
+ if (CPU_ISSET(i, &orig_cpuset)) {
+ CPU_CLR(i, &new_cpuset);
+ }
+ }
+ }
+
+ rte_thread_set_affinity(&new_cpuset);
+}
+
+void *
+spdk_call_unaffinitized(void *cb(void *arg), void *arg)
+{
+ rte_cpuset_t orig_cpuset;
+ void *ret;
+
+ if (cb == NULL) {
+ return NULL;
+ }
+
+ rte_thread_get_affinity(&orig_cpuset);
+
+ spdk_unaffinitize_thread();
+
+ ret = cb(arg);
+
+ rte_thread_set_affinity(&orig_cpuset);
+
+ return ret;
+}
+
+struct spdk_ring *
+spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id)
+{
+ char ring_name[64];
+ static uint32_t ring_num = 0;
+ unsigned flags = RING_F_EXACT_SZ;
+
+ switch (type) {
+ case SPDK_RING_TYPE_SP_SC:
+ flags |= RING_F_SP_ENQ | RING_F_SC_DEQ;
+ break;
+ case SPDK_RING_TYPE_MP_SC:
+ flags |= RING_F_SC_DEQ;
+ break;
+ case SPDK_RING_TYPE_MP_MC:
+ flags |= 0;
+ break;
+ default:
+ return NULL;
+ }
+
+ snprintf(ring_name, sizeof(ring_name), "ring_%u_%d",
+ __atomic_fetch_add(&ring_num, 1, __ATOMIC_RELAXED), getpid());
+
+ return (struct spdk_ring *)rte_ring_create(ring_name, count, socket_id, flags);
+}
+
+void
+spdk_ring_free(struct spdk_ring *ring)
+{
+ rte_ring_free((struct rte_ring *)ring);
+}
+
+size_t
+spdk_ring_count(struct spdk_ring *ring)
+{
+ return rte_ring_count((struct rte_ring *)ring);
+}
+
+size_t
+spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count,
+ size_t *free_space)
+{
+ return rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count,
+ (unsigned int *)free_space);
+}
+
+size_t
+spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count)
+{
+ return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL);
+}
+
+void
+spdk_env_dpdk_dump_mem_stats(FILE *file)
+{
+ fprintf(file, "DPDK memory size %lu\n", rte_eal_get_physmem_size());
+ fprintf(file, "DPDK memory layout\n");
+ rte_dump_physmem_layout(file);
+ fprintf(file, "DPDK memzones.\n");
+ rte_memzone_dump(file);
+ fprintf(file, "DPDK mempools.\n");
+ rte_mempool_list_dump(file);
+ fprintf(file, "DPDK malloc stats.\n");
+ rte_malloc_dump_stats(file, NULL);
+ fprintf(file, "DPDK malloc heaps.\n");
+ rte_malloc_dump_heaps(file);
+}
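env.c is mostly a thin shim: spdk_mempool_* wraps rte_mempool, spdk_ring_* wraps rte_ring, and spdk_malloc()/spdk_free() wrap rte_malloc. As a hedged sketch of the ring wrapper in use, assuming the environment has already been initialized via spdk_env_init() and using placeholder objects:

#include "spdk/env.h"

/* Illustrative only: push two placeholder objects through an SPDK ring. */
static int ring_roundtrip(void)
{
	struct spdk_ring *ring;
	int a = 1, b = 2;
	void *in[2] = { &a, &b };
	void *out[2] = { NULL, NULL };
	size_t n;

	/* Single-producer/single-consumer ring with 4096 slots on any NUMA socket. */
	ring = spdk_ring_create(SPDK_RING_TYPE_SP_SC, 4096, SPDK_ENV_SOCKET_ID_ANY);
	if (ring == NULL) {
		return -1;
	}

	n = spdk_ring_enqueue(ring, in, 2, NULL);	/* returns the number of objects enqueued */
	n = spdk_ring_dequeue(ring, out, 2);		/* returns the number of objects dequeued */
	(void)n;

	spdk_ring_free(ring);
	return 0;
}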
diff --git a/src/spdk/lib/env_dpdk/env.mk b/src/spdk/lib/env_dpdk/env.mk
new file mode 100644
index 000000000..c2bfb0d19
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env.mk
@@ -0,0 +1,176 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# This makefile snippet must define the following flags:
+# ENV_CFLAGS
+# ENV_CXXFLAGS
+# ENV_LIBS
+# ENV_LINKER_ARGS
+
+DPDK_DIR = $(CONFIG_DPDK_DIR)
+
+export DPDK_ABS_DIR = $(abspath $(DPDK_DIR))
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/include/rte_config.h))
+DPDK_INC_DIR := $(DPDK_ABS_DIR)/include
+else
+DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk
+endif
+DPDK_INC := -I$(DPDK_INC_DIR)
+
+ifeq ($(CONFIG_SHARED),y)
+DPDK_LIB_EXT = .so
+else
+DPDK_LIB_EXT = .a
+endif
+
+DPDK_LIB_LIST = rte_eal rte_mempool rte_ring rte_mbuf
+
+# librte_mempool_ring was newly added in DPDK 17.05. Link this library for the
+# ring-based mempool management API.
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_mempool_ring.*))
+DPDK_LIB_LIST += rte_mempool_ring
+endif
+
+# librte_malloc was removed after DPDK 2.1. Link this library conditionally based on its
+# existence to maintain backward compatibility.
+ifneq ($(wildcard $(DPDK_ABS_DIR)/lib/librte_malloc.*),)
+DPDK_LIB_LIST += rte_malloc
+endif
+
+# librte_pci and librte_bus_pci were added in DPDK 17.11. Link these libraries conditionally
+# based on their existence to maintain backward compatibility.
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_pci.*))
+DPDK_LIB_LIST += rte_pci
+endif
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*))
+DPDK_LIB_LIST += rte_bus_pci
+endif
+
+# rte_telemetry is an EAL dependency as of DPDK 20.05, so link it if present.
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_telemetry.*))
+DPDK_LIB_LIST += rte_telemetry
+endif
+
+# There are some complex dependencies when using crypto, reduce, or both, so
+# here we add the feature-specific libraries and set a flag to add the common
+# ones afterwards.
+DPDK_FRAMEWORK=n
+ifeq ($(CONFIG_CRYPTO),y)
+DPDK_FRAMEWORK=y
+DPDK_LIB_LIST += rte_pmd_aesni_mb rte_reorder
+endif
+
+ifeq ($(CONFIG_REDUCE),y)
+DPDK_FRAMEWORK=y
+DPDK_LIB_LIST += rte_pmd_isal
+endif
+
+ifeq ($(DPDK_FRAMEWORK),y)
+DPDK_LIB_LIST += rte_cryptodev rte_compressdev rte_bus_vdev rte_pmd_qat
+endif
+
+ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_kvargs.*))
+DPDK_LIB_LIST += rte_kvargs
+endif
+
+LINK_HASH=n
+
+ifeq ($(CONFIG_VHOST),y)
+ifneq ($(CONFIG_VHOST_INTERNAL_LIB),y)
+DPDK_LIB_LIST += rte_vhost rte_net
+LINK_HASH=y
+ifneq ($(DPDK_FRAMEWORK),y)
+DPDK_LIB_LIST += rte_cryptodev
+endif
+endif
+endif
+
+ifeq ($(CONFIG_RAID5),y)
+LINK_HASH=y
+endif
+
+ifeq ($(LINK_HASH),y)
+DPDK_LIB_LIST += rte_hash
+endif
+
+define dpdk_lib_list_to_libs
+$(1:%=$(DPDK_ABS_DIR)/lib/lib%$(DPDK_LIB_EXT))
+endef
+
+define dpdk_env_linker_args
+$(ENV_DPDK_FILE) -Wl,--whole-archive,--no-as-needed $(call dpdk_lib_list_to_libs,$1) -Wl,--no-whole-archive
+endef
+
+DPDK_LIB = $(call dpdk_lib_list_to_libs,$(DPDK_LIB_LIST))
+
+# SPDK memory registration requires experimental (deprecated) rte_memory API for DPDK 18.05
+ENV_CFLAGS = $(DPDK_INC) -Wno-deprecated-declarations
+ENV_CXXFLAGS = $(ENV_CFLAGS)
+ifeq ($(CONFIG_SHARED),y)
+ENV_DPDK_FILE = $(call spdk_lib_list_to_shared_libs,env_dpdk)
+else
+ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk)
+endif
+ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB)
+ENV_LINKER_ARGS = -Wl,-rpath-link $(DPDK_ABS_DIR)/lib
+ENV_LINKER_ARGS += $(call dpdk_env_linker_args,$(DPDK_LIB_LIST))
+
+ifeq ($(CONFIG_IPSEC_MB),y)
+ENV_LINKER_ARGS += -lIPSec_MB -L$(IPSEC_MB_DIR)
+endif
+
+ifeq ($(CONFIG_REDUCE),y)
+ENV_LINKER_ARGS += -lisal -L$(ISAL_DIR)/.libs
+endif
+
+ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_config.h))
+ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_config.h))
+ENV_LINKER_ARGS += -lnuma
+endif
+endif
+
+# DPDK built with meson puts those defines elsewhere
+ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_build_config.h))
+ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_build_config.h))
+ENV_LINKER_ARGS += -lnuma
+endif
+endif
+
+ifeq ($(OS),Linux)
+ENV_LINKER_ARGS += -ldl
+endif
+ifeq ($(OS),FreeBSD)
+ENV_LINKER_ARGS += -lexecinfo
+endif
diff --git a/src/spdk/lib/env_dpdk/env_internal.h b/src/spdk/lib/env_dpdk/env_internal.h
new file mode 100644
index 000000000..c7900d9d3
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/env_internal.h
@@ -0,0 +1,98 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ENV_INTERNAL_H
+#define SPDK_ENV_INTERNAL_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+
+#include <rte_config.h>
+#include <rte_version.h>
+#include <rte_eal.h>
+#include <rte_bus.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_dev.h>
+
+#if RTE_VERSION < RTE_VERSION_NUM(18, 11, 0, 0)
+#error RTE_VERSION is too old! Minimum 18.11 is required.
+#endif
+
+/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47],
+ * which is enough to cover 256 TB.
+ */
+#define SHIFT_256TB 48 /* (1 << 48) == 256 TB */
+#define MASK_256TB ((1ULL << SHIFT_256TB) - 1)
+
+#define SHIFT_1GB 30 /* (1 << 30) == 1 GB */
+#define MASK_1GB ((1ULL << SHIFT_1GB) - 1)
+
+#define SPDK_PCI_DRIVER_MAX_NAME_LEN 32
+struct spdk_pci_driver {
+ struct rte_pci_driver driver;
+
+ const char *name;
+ const struct spdk_pci_id *id_table;
+ uint32_t drv_flags;
+
+ spdk_pci_enum_cb cb_fn;
+ void *cb_arg;
+ TAILQ_ENTRY(spdk_pci_driver) tailq;
+};
+
+int pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device);
+int pci_device_fini(struct rte_pci_device *device);
+
+void pci_env_init(void);
+void pci_env_reinit(void);
+void pci_env_fini(void);
+int mem_map_init(bool legacy_mem);
+int vtophys_init(void);
+
+/**
+ * Report a DMA-capable PCI device to the vtophys translation code.
+ * Increases the refcount of active DMA-capable devices managed by SPDK.
+ * This must be called after a `rte_pci_device` is created.
+ */
+void vtophys_pci_device_added(struct rte_pci_device *pci_device);
+
+/**
+ * Report the removal of a DMA-capable PCI device to the vtophys translation code.
+ * Decreases the refcount of active DMA-capable devices managed by SPDK.
+ * This must be called before a `rte_pci_device` is destroyed.
+ */
+void vtophys_pci_device_removed(struct rte_pci_device *pci_device);
+
+#endif
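The SHIFT_256TB and SHIFT_1GB constants above bound the address range handled by the translation code declared here (mem_map_init(), vtophys_init()). The standalone arithmetic sketch below copies the constants (it does not include this header) to show how a sample 48-bit address splits into a 1 GB region index plus an in-region offset; the exact map layout in memory.c may differ.

#include <inttypes.h>
#include <stdio.h>

/* Illustrative copies of the env_internal.h constants. */
#define SHIFT_256TB 48
#define MASK_256TB  ((1ULL << SHIFT_256TB) - 1)
#define SHIFT_1GB   30
#define MASK_1GB    ((1ULL << SHIFT_1GB) - 1)

int main(void)
{
	uint64_t vaddr = 0x200000123456ULL;                     /* some 48-bit userspace address */
	uint64_t idx_1gb = (vaddr & MASK_256TB) >> SHIFT_1GB;   /* which 1 GB region */
	uint64_t off_1gb = vaddr & MASK_1GB;                    /* offset inside that region */

	printf("1GB index %" PRIu64 ", offset 0x%" PRIx64 "\n", idx_1gb, off_1gb);
	return 0;
}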
diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c
new file mode 100644
index 000000000..0376dbe7b
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/init.c
@@ -0,0 +1,604 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "env_internal.h"
+
+#include "spdk/version.h"
+#include "spdk/env_dpdk.h"
+
+#include <rte_config.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_vfio.h>
+
+#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk"
+#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1
+#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1
+#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1
+#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1
+#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1"
+#define SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000
+
+static char **g_eal_cmdline;
+static int g_eal_cmdline_argcount;
+static bool g_external_init = true;
+
+static char *
+_sprintf_alloc(const char *format, ...)
+{
+ va_list args;
+ va_list args_copy;
+ char *buf;
+ size_t bufsize;
+ int rc;
+
+ va_start(args, format);
+
+ /* Try with a small buffer first. */
+ bufsize = 32;
+
+ /* Limit maximum buffer size to something reasonable so we don't loop forever. */
+ while (bufsize <= 1024 * 1024) {
+ buf = malloc(bufsize);
+ if (buf == NULL) {
+ va_end(args);
+ return NULL;
+ }
+
+ va_copy(args_copy, args);
+ rc = vsnprintf(buf, bufsize, format, args_copy);
+ va_end(args_copy);
+
+ /*
+ * If vsnprintf() returned a count within our current buffer size, we are done.
+ * The count does not include the \0 terminator, so rc == bufsize is not OK.
+ */
+ if (rc >= 0 && (size_t)rc < bufsize) {
+ va_end(args);
+ return buf;
+ }
+
+ /*
+ * vsnprintf() should return the required space, but some libc versions do not
+ * implement this correctly, so just double the buffer size and try again.
+ *
+ * We don't need the data in buf, so rather than realloc(), use free() and malloc()
+ * again to avoid a copy.
+ */
+ free(buf);
+ bufsize *= 2;
+ }
+
+ va_end(args);
+ return NULL;
+}
+
+void
+spdk_env_opts_init(struct spdk_env_opts *opts)
+{
+ if (!opts) {
+ return;
+ }
+
+ memset(opts, 0, sizeof(*opts));
+
+ opts->name = SPDK_ENV_DPDK_DEFAULT_NAME;
+ opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK;
+ opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID;
+ opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE;
+ opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE;
+ opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL;
+ opts->base_virtaddr = SPDK_ENV_DPDK_DEFAULT_BASE_VIRTADDR;
+}
+
+static void
+free_args(char **args, int argcount)
+{
+ int i;
+
+ if (args == NULL) {
+ return;
+ }
+
+ for (i = 0; i < argcount; i++) {
+ free(args[i]);
+ }
+
+ if (argcount) {
+ free(args);
+ }
+}
+
+static char **
+push_arg(char *args[], int *argcount, char *arg)
+{
+ char **tmp;
+
+ if (arg == NULL) {
+ fprintf(stderr, "%s: NULL arg supplied\n", __func__);
+ free_args(args, *argcount);
+ return NULL;
+ }
+
+ tmp = realloc(args, sizeof(char *) * (*argcount + 1));
+ if (tmp == NULL) {
+ free(arg);
+ free_args(args, *argcount);
+ return NULL;
+ }
+
+ tmp[*argcount] = arg;
+ (*argcount)++;
+
+ return tmp;
+}
+
+#if defined(__linux__) && defined(__x86_64__)
+
+/* TODO: Can likely get this value from rlimits in the future */
+#define SPDK_IOMMU_VA_REQUIRED_WIDTH 48
+#define VTD_CAP_MGAW_SHIFT 16
+#define VTD_CAP_MGAW_MASK (0x3F << VTD_CAP_MGAW_SHIFT)
+
+static int
+get_iommu_width(void)
+{
+ DIR *dir;
+ FILE *file;
+ struct dirent *entry;
+ char mgaw_path[64];
+ char buf[64];
+ char *end;
+ long long int val;
+ int width, tmp;
+
+ dir = opendir("/sys/devices/virtual/iommu/");
+ if (dir == NULL) {
+ return -EINVAL;
+ }
+
+ width = 0;
+
+ while ((entry = readdir(dir)) != NULL) {
+ /* Find directories named "dmar0", "dmar1", etc */
+ if (strncmp(entry->d_name, "dmar", sizeof("dmar") - 1) != 0) {
+ continue;
+ }
+
+ tmp = snprintf(mgaw_path, sizeof(mgaw_path), "/sys/devices/virtual/iommu/%s/intel-iommu/cap",
+ entry->d_name);
+ if ((unsigned)tmp >= sizeof(mgaw_path)) {
+ continue;
+ }
+
+ file = fopen(mgaw_path, "r");
+ if (file == NULL) {
+ continue;
+ }
+
+ if (fgets(buf, sizeof(buf), file) == NULL) {
+ fclose(file);
+ continue;
+ }
+
+ val = strtoll(buf, &end, 16);
+ if (val == LLONG_MIN || val == LLONG_MAX) {
+ fclose(file);
+ continue;
+ }
+
+ tmp = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;
+ if (width == 0 || tmp < width) {
+ width = tmp;
+ }
+
+ fclose(file);
+ }
+
+ closedir(dir);
+
+ return width;
+}
+
+#endif
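get_iommu_width() above reads each DMAR unit's capability register from sysfs and derives the address width from the MGAW field (bits 21:16 of the cap value, plus one), keeping the smallest width across units. A minimal standalone check of that bit arithmetic, using a made-up cap value rather than a real register dump:

#include <inttypes.h>
#include <stdio.h>

#define VTD_CAP_MGAW_SHIFT 16
#define VTD_CAP_MGAW_MASK  (0x3F << VTD_CAP_MGAW_SHIFT)

int main(void)
{
	/* Hypothetical cap register value with MGAW field = 0x2e (46), i.e. a 47-bit IOMMU. */
	long long int val = 0x2eLL << VTD_CAP_MGAW_SHIFT;
	int width = ((val & VTD_CAP_MGAW_MASK) >> VTD_CAP_MGAW_SHIFT) + 1;

	printf("IOMMU address width: %d bits\n", width);	/* prints 47 */
	return 0;
}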
+
+static int
+build_eal_cmdline(const struct spdk_env_opts *opts)
+{
+ int argcount = 0;
+ char **args;
+
+ args = NULL;
+
+ /* set the program name */
+ args = push_arg(args, &argcount, _sprintf_alloc("%s", opts->name));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* disable shared configuration files when in single process mode. This allows for cleaner shutdown */
+ if (opts->shm_id < 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set the coremask */
+ /* NOTE: If coremask starts with '[' and ends with ']' it is a core list
+ */
+ if (opts->core_mask[0] == '[') {
+ char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1);
+
+ if (l_arg != NULL) {
+ int len = strlen(l_arg);
+
+ if (l_arg[len - 1] == ']') {
+ l_arg[len - 1] = '\0';
+ }
+ }
+ args = push_arg(args, &argcount, l_arg);
+ } else {
+ args = push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask));
+ }
+
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* set the memory channel number */
+ if (opts->mem_channel > 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set the memory size */
+ if (opts->mem_size >= 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set the master core */
+ if (opts->master_core > 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d",
+ opts->master_core));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* set no pci if enabled */
+ if (opts->no_pci) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--no-pci"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* create just one hugetlbfs file */
+ if (opts->hugepage_single_segments) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--single-file-segments"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* unlink hugepages after initialization */
+ if (opts->unlink_hugepage) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--huge-unlink"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ /* use a specific hugetlbfs mount */
+ if (opts->hugedir) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--huge-dir=%s", opts->hugedir));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+ if (opts->num_pci_addr) {
+ size_t i;
+ char bdf[32];
+ struct spdk_pci_addr *pci_addr =
+ opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist;
+
+ for (i = 0; i < opts->num_pci_addr; i++) {
+ spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]);
+ args = push_arg(args, &argcount, _sprintf_alloc("%s=%s",
+ (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"),
+ bdf));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+ }
+
+ /* Lower default EAL loglevel to RTE_LOG_NOTICE - normal, but significant messages.
+ * This can be overridden by specifying the same option in opts->env_context
+ */
+ args = push_arg(args, &argcount, strdup("--log-level=lib.eal:6"));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* Lower default CRYPTO loglevel to RTE_LOG_ERR to avoid a ton of init msgs.
+ * This can be overridden by specifying the same option in opts->env_context
+ */
+ args = push_arg(args, &argcount, strdup("--log-level=lib.cryptodev:5"));
+ if (args == NULL) {
+ return -1;
+ }
+
+	/* The `user1` log type is used by rte_vhost, which prints an INFO log for each
+	 * received vhost-user message. We don't want that. The same log type is also used
+	 * by a couple of other DPDK libs, none of which we currently use. If necessary,
+	 * this can be overridden via opts->env_context.
+	 */
+ args = push_arg(args, &argcount, strdup("--log-level=user1:6"));
+ if (args == NULL) {
+ return -1;
+ }
+
+ if (opts->env_context) {
+ args = push_arg(args, &argcount, strdup(opts->env_context));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+#ifdef __linux__
+
+ if (opts->iova_mode) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=%s", opts->iova_mode));
+ if (args == NULL) {
+ return -1;
+ }
+ } else {
+ /* When using vfio with enable_unsafe_noiommu_mode=Y, we need iova-mode=pa,
+ * but DPDK guesses it should be iova-mode=va. Add a check and force
+ * iova-mode=pa here. */
+ if (rte_vfio_noiommu_is_enabled()) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+
+#if defined(__x86_64__)
+ /* DPDK by default guesses that it should be using iova-mode=va so that it can
+ * support running as an unprivileged user. However, some systems (especially
+ * virtual machines) don't have an IOMMU capable of handling the full virtual
+ * address space and DPDK doesn't currently catch that. Add a check in SPDK
+ * and force iova-mode=pa here. */
+ if (get_iommu_width() < SPDK_IOMMU_VA_REQUIRED_WIDTH) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+#elif defined(__PPC64__)
+ /* On Linux + PowerPC, DPDK doesn't support VA mode at all. Unfortunately, it doesn't correctly
+ * auto-detect at the moment, so we'll just force it here. */
+ args = push_arg(args, &argcount, _sprintf_alloc("--iova-mode=pa"));
+ if (args == NULL) {
+ return -1;
+ }
+#endif
+ }
+
+
+ * ASAN shadow region, otherwise ASAN-enabled builds will ignore the
+ * mmap hint.
+ *
+ * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
+ */
+ args = push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x%" PRIx64, opts->base_virtaddr));
+ if (args == NULL) {
+ return -1;
+ }
+
+	/* --match-allocations prevents DPDK from merging or splitting system memory allocations under the hood.
+	 * This is critical for RDMA when attempting to use an rte_mempool based buffer pool. If DPDK merges two
+	 * physically or IOVA contiguous memory regions, then when we go to allocate a buffer pool, it can split
+	 * the memory for a single buffer across two allocations, meaning that buffer would straddle two memory regions.
+	 */
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ if (!opts->env_context || strstr(opts->env_context, "--legacy-mem") == NULL) {
+ args = push_arg(args, &argcount, _sprintf_alloc("%s", "--match-allocations"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+#endif
+
+ if (opts->shm_id < 0) {
+ args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d",
+ getpid()));
+ if (args == NULL) {
+ return -1;
+ }
+ } else {
+ args = push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d",
+ opts->shm_id));
+ if (args == NULL) {
+ return -1;
+ }
+
+ /* set the process type */
+ args = push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto"));
+ if (args == NULL) {
+ return -1;
+ }
+ }
+#endif
+
+ g_eal_cmdline = args;
+ g_eal_cmdline_argcount = argcount;
+ return argcount;
+}
+
+int
+spdk_env_dpdk_post_init(bool legacy_mem)
+{
+ int rc;
+
+ pci_env_init();
+
+ rc = mem_map_init(legacy_mem);
+ if (rc < 0) {
+ fprintf(stderr, "Failed to allocate mem_map\n");
+ return rc;
+ }
+
+ rc = vtophys_init();
+ if (rc < 0) {
+ fprintf(stderr, "Failed to initialize vtophys\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+void
+spdk_env_dpdk_post_fini(void)
+{
+ pci_env_fini();
+
+ free_args(g_eal_cmdline, g_eal_cmdline_argcount);
+ g_eal_cmdline = NULL;
+ g_eal_cmdline_argcount = 0;
+}
+
+int
+spdk_env_init(const struct spdk_env_opts *opts)
+{
+ char **dpdk_args = NULL;
+ int i, rc;
+ int orig_optind;
+ bool legacy_mem;
+
+ /* If SPDK env has been initialized before, then only pci env requires
+ * reinitialization.
+ */
+ if (g_external_init == false) {
+ if (opts != NULL) {
+ fprintf(stderr, "Invalid arguments to reinitialize SPDK env\n");
+ return -EINVAL;
+ }
+
+ printf("Starting %s / %s reinitialization...\n", SPDK_VERSION_STRING, rte_version());
+ pci_env_reinit();
+
+ return 0;
+ }
+
+ if (opts == NULL) {
+ fprintf(stderr, "NULL arguments to initialize DPDK\n");
+ return -EINVAL;
+ }
+
+ rc = build_eal_cmdline(opts);
+ if (rc < 0) {
+ fprintf(stderr, "Invalid arguments to initialize DPDK\n");
+ return -EINVAL;
+ }
+
+ printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version());
+ printf("[ DPDK EAL parameters: ");
+ for (i = 0; i < g_eal_cmdline_argcount; i++) {
+ printf("%s ", g_eal_cmdline[i]);
+ }
+ printf("]\n");
+
+ /* DPDK rearranges the array we pass to it, so make a copy
+ * before passing so we can still free the individual strings
+ * correctly.
+ */
+ dpdk_args = calloc(g_eal_cmdline_argcount, sizeof(char *));
+ if (dpdk_args == NULL) {
+ fprintf(stderr, "Failed to allocate dpdk_args\n");
+ return -ENOMEM;
+ }
+ memcpy(dpdk_args, g_eal_cmdline, sizeof(char *) * g_eal_cmdline_argcount);
+
+ fflush(stdout);
+ orig_optind = optind;
+ optind = 1;
+ rc = rte_eal_init(g_eal_cmdline_argcount, dpdk_args);
+ optind = orig_optind;
+
+ free(dpdk_args);
+
+ if (rc < 0) {
+ if (rte_errno == EALREADY) {
+ fprintf(stderr, "DPDK already initialized\n");
+ } else {
+ fprintf(stderr, "Failed to initialize DPDK\n");
+ }
+ return -rte_errno;
+ }
+
+ legacy_mem = false;
+ if (opts->env_context && strstr(opts->env_context, "--legacy-mem") != NULL) {
+ legacy_mem = true;
+ }
+
+ rc = spdk_env_dpdk_post_init(legacy_mem);
+ if (rc == 0) {
+ g_external_init = false;
+ }
+
+ return rc;
+}
+
+void
+spdk_env_fini(void)
+{
+ spdk_env_dpdk_post_fini();
+}
+
+bool
+spdk_env_dpdk_external_init(void)
+{
+ return g_external_init;
+}
diff --git a/src/spdk/lib/env_dpdk/memory.c b/src/spdk/lib/env_dpdk/memory.c
new file mode 100644
index 000000000..4c2205a46
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/memory.c
@@ -0,0 +1,1442 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "env_internal.h"
+
+#include <rte_config.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+
+#include "spdk_internal/assert.h"
+
+#include "spdk/assert.h"
+#include "spdk/likely.h"
+#include "spdk/queue.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+#include "spdk/env_dpdk.h"
+
+#ifdef __FreeBSD__
+#define VFIO_ENABLED 0
+#else
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
+#define VFIO_ENABLED 1
+#include <linux/vfio.h>
+#include <rte_vfio.h>
+
+struct spdk_vfio_dma_map {
+ struct vfio_iommu_type1_dma_map map;
+ struct vfio_iommu_type1_dma_unmap unmap;
+ TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
+};
+
+struct vfio_cfg {
+ int fd;
+ bool enabled;
+ bool noiommu_enabled;
+ unsigned device_ref;
+ TAILQ_HEAD(, spdk_vfio_dma_map) maps;
+ pthread_mutex_t mutex;
+};
+
+static struct vfio_cfg g_vfio = {
+ .fd = -1,
+ .enabled = false,
+ .noiommu_enabled = false,
+ .device_ref = 0,
+ .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
+ .mutex = PTHREAD_MUTEX_INITIALIZER
+};
+
+#else
+#define VFIO_ENABLED 0
+#endif
+#endif
+
+#if DEBUG
+#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define DEBUG_PRINT(...)
+#endif
+
+#define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB))
+#define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB))
+
+#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
+#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
+
+/* Page is registered */
+#define REG_MAP_REGISTERED (1ULL << 62)
+
+/* A notification region barrier. The 2MB translation entry that's marked
+ * with this flag must be unregistered separately. This allows contiguous
+ * regions to be unregistered in the same chunks they were registered.
+ */
+#define REG_MAP_NOTIFY_START (1ULL << 63)
+
+/* Translation of a single 2MB page. */
+struct map_2mb {
+ uint64_t translation_2mb;
+};
+
+/* Second-level map table indexed by bits [21..29] of the virtual address.
+ * Each entry contains the address translation, or the map's default translation
+ * for entries that haven't been set yet.
+ */
+struct map_1gb {
+ struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
+};
+
+/* Top-level map table indexed by bits [30..47] of the virtual address.
+ * Each entry points to a second-level map table or NULL.
+ */
+struct map_256tb {
+ struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
+};
+
+/* Page-granularity memory address translation */
+struct spdk_mem_map {
+ struct map_256tb map_256tb;
+ pthread_mutex_t mutex;
+ uint64_t default_translation;
+ struct spdk_mem_map_ops ops;
+ void *cb_ctx;
+ TAILQ_ENTRY(spdk_mem_map) tailq;
+};
+
+/* Registrations map. The 64 bit translations are bit fields with the
+ * following layout (starting with the low bits):
+ * 0 - 61 : reserved
+ * 62 - 63 : flags
+ */
+static struct spdk_mem_map *g_mem_reg_map;
+static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
+ TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
+static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static bool g_legacy_mem;
+
+/*
+ * Walk the currently registered memory via the main memory registration map
+ * and call the new map's notify callback for each virtually contiguous region.
+ */
+static int
+mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
+{
+ size_t idx_256tb;
+ uint64_t idx_1gb;
+ uint64_t contig_start = UINT64_MAX;
+ uint64_t contig_end = UINT64_MAX;
+ struct map_1gb *map_1gb;
+ int rc;
+
+ if (!g_mem_reg_map) {
+ return -EINVAL;
+ }
+
+ /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
+ pthread_mutex_lock(&g_mem_reg_map->mutex);
+
+ for (idx_256tb = 0;
+ idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
+ idx_256tb++) {
+ map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ if (contig_start != UINT64_MAX) {
+				/* End of a virtually contiguous range */
+ rc = map->ops.notify_cb(map->cb_ctx, map, action,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ /* Don't bother handling unregister failures. It can't be any worse */
+ if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
+ goto err_unregister;
+ }
+ }
+ contig_start = UINT64_MAX;
+ continue;
+ }
+
+ for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
+ if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
+ (contig_start == UINT64_MAX ||
+ (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
+ /* Rebuild the virtual address from the indexes */
+ uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
+
+ if (contig_start == UINT64_MAX) {
+ contig_start = vaddr;
+ }
+
+ contig_end = vaddr;
+ } else {
+ if (contig_start != UINT64_MAX) {
+					/* End of a virtually contiguous range */
+ rc = map->ops.notify_cb(map->cb_ctx, map, action,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ /* Don't bother handling unregister failures. It can't be any worse */
+ if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
+ goto err_unregister;
+ }
+
+ /* This page might be a part of a neighbour region, so process
+ * it again. The idx_1gb will be incremented immediately.
+ */
+ idx_1gb--;
+ }
+ contig_start = UINT64_MAX;
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&g_mem_reg_map->mutex);
+ return 0;
+
+err_unregister:
+ /* Unwind to the first empty translation so we don't unregister
+ * a region that just failed to register.
+ */
+ idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
+ idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
+ contig_start = UINT64_MAX;
+ contig_end = UINT64_MAX;
+
+ /* Unregister any memory we managed to register before the failure */
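+	/* Note: both unwind loops below count downwards and rely on unsigned
+	 * wrap-around (idx_256tb past 0 to SIZE_MAX, idx_1gb to UINT64_MAX)
+	 * to terminate after index 0 has been processed.
+	 */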
+ for (; idx_256tb < SIZE_MAX; idx_256tb--) {
+ map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ if (contig_end != UINT64_MAX) {
+				/* End of a virtually contiguous range */
+ map->ops.notify_cb(map->cb_ctx, map,
+ SPDK_MEM_MAP_NOTIFY_UNREGISTER,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ }
+ contig_end = UINT64_MAX;
+ continue;
+ }
+
+ for (; idx_1gb < UINT64_MAX; idx_1gb--) {
+ if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
+ (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
+ /* Rebuild the virtual address from the indexes */
+ uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
+
+ if (contig_end == UINT64_MAX) {
+ contig_end = vaddr;
+ }
+ contig_start = vaddr;
+ } else {
+ if (contig_end != UINT64_MAX) {
+					/* End of a virtually contiguous range */
+ map->ops.notify_cb(map->cb_ctx, map,
+ SPDK_MEM_MAP_NOTIFY_UNREGISTER,
+ (void *)contig_start,
+ contig_end - contig_start + VALUE_2MB);
+ idx_1gb++;
+ }
+ contig_end = UINT64_MAX;
+ }
+ }
+ idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
+ }
+
+ pthread_mutex_unlock(&g_mem_reg_map->mutex);
+ return rc;
+}
+
+struct spdk_mem_map *
+spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
+{
+ struct spdk_mem_map *map;
+ int rc;
+
+ map = calloc(1, sizeof(*map));
+ if (map == NULL) {
+ return NULL;
+ }
+
+ if (pthread_mutex_init(&map->mutex, NULL)) {
+ free(map);
+ return NULL;
+ }
+
+ map->default_translation = default_translation;
+ map->cb_ctx = cb_ctx;
+ if (ops) {
+ map->ops = *ops;
+ }
+
+ if (ops && ops->notify_cb) {
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+ rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ DEBUG_PRINT("Initial mem_map notify failed\n");
+ pthread_mutex_destroy(&map->mutex);
+ free(map);
+ return NULL;
+ }
+ TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ }
+
+ return map;
+}
+
+void
+spdk_mem_map_free(struct spdk_mem_map **pmap)
+{
+ struct spdk_mem_map *map;
+ size_t i;
+
+ if (!pmap) {
+ return;
+ }
+
+ map = *pmap;
+
+ if (!map) {
+ return;
+ }
+
+ if (map->ops.notify_cb) {
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+ mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
+ TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ }
+
+ for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
+ free(map->map_256tb.map[i]);
+ }
+
+ pthread_mutex_destroy(&map->mutex);
+
+ free(map);
+ *pmap = NULL;
+}
+
+int
+spdk_mem_register(void *vaddr, size_t len)
+{
+ struct spdk_mem_map *map;
+ int rc;
+ void *seg_vaddr;
+ size_t seg_len;
+ uint64_t reg;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ if (len == 0) {
+ return 0;
+ }
+
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+
+ seg_vaddr = vaddr;
+ seg_len = len;
+ while (seg_len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ if (reg & REG_MAP_REGISTERED) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -EBUSY;
+ }
+ seg_vaddr += VALUE_2MB;
+ seg_len -= VALUE_2MB;
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = 0;
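+	/* Mark every 2MB page in the range as registered, tagging only the first
+	 * page with REG_MAP_NOTIFY_START so the whole range can later be
+	 * unregistered as a single region.
+	 */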
+ while (len > 0) {
+ spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
+ seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
+ seg_len += VALUE_2MB;
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return 0;
+}
+
+int
+spdk_mem_unregister(void *vaddr, size_t len)
+{
+ struct spdk_mem_map *map;
+ int rc;
+ void *seg_vaddr;
+ size_t seg_len;
+ uint64_t reg, newreg;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+
+ /* The first page must be a start of a region. Also check if it's
+ * registered to make sure we don't return -ERANGE for non-registered
+ * regions.
+ */
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
+ if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -ERANGE;
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = len;
+ while (seg_len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ if ((reg & REG_MAP_REGISTERED) == 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -EINVAL;
+ }
+ seg_vaddr += VALUE_2MB;
+ seg_len -= VALUE_2MB;
+ }
+
+ newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ /* If the next page is registered, it must be a start of a region as well,
+ * otherwise we'd be unregistering only a part of a region.
+ */
+ if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -ERANGE;
+ }
+ seg_vaddr = vaddr;
+ seg_len = 0;
+
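+	/* Clear the translation for each 2MB page and, each time a region boundary
+	 * (REG_MAP_NOTIFY_START) is crossed, notify the registered maps so that
+	 * regions are unregistered in the same chunks they were registered.
+	 */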
+ while (len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
+ spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
+
+ if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
+ TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+
+ seg_vaddr = vaddr;
+ seg_len = VALUE_2MB;
+ } else {
+ seg_len += VALUE_2MB;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ if (seg_len > 0) {
+ TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
+ rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return rc;
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return 0;
+}
+
+int
+spdk_mem_reserve(void *vaddr, size_t len)
+{
+ struct spdk_mem_map *map;
+ void *seg_vaddr;
+ size_t seg_len;
+ uint64_t reg;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
+ __func__, vaddr, len);
+ return -EINVAL;
+ }
+
+ if (len == 0) {
+ return 0;
+ }
+
+ pthread_mutex_lock(&g_spdk_mem_map_mutex);
+
+ /* Check if any part of this range is already registered */
+ seg_vaddr = vaddr;
+ seg_len = len;
+ while (seg_len > 0) {
+ reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
+ if (reg & REG_MAP_REGISTERED) {
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return -EBUSY;
+ }
+ seg_vaddr += VALUE_2MB;
+ seg_len -= VALUE_2MB;
+ }
+
+ /* Simply set the translation to the memory map's default. This allocates the space in the
+ * map but does not provide a valid translation. */
+ spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
+ g_mem_reg_map->default_translation);
+
+ TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
+ spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
+ }
+
+ pthread_mutex_unlock(&g_spdk_mem_map_mutex);
+ return 0;
+}
+
+static struct map_1gb *
+mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
+{
+ struct map_1gb *map_1gb;
+ uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ size_t i;
+
+ if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
+ return NULL;
+ }
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+
+ if (!map_1gb) {
+ pthread_mutex_lock(&map->mutex);
+
+ /* Recheck to make sure nobody else got the mutex first. */
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (!map_1gb) {
+ map_1gb = malloc(sizeof(struct map_1gb));
+ if (map_1gb) {
+ /* initialize all entries to default translation */
+ for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
+ map_1gb->map[i].translation_2mb = map->default_translation;
+ }
+ map->map_256tb.map[idx_256tb] = map_1gb;
+ }
+ }
+
+ pthread_mutex_unlock(&map->mutex);
+
+ if (!map_1gb) {
+ DEBUG_PRINT("allocation failed\n");
+ return NULL;
+ }
+ }
+
+ return map_1gb;
+}
+
+int
+spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
+ uint64_t translation)
+{
+ uint64_t vfn_2mb;
+ struct map_1gb *map_1gb;
+ uint64_t idx_1gb;
+ struct map_2mb *map_2mb;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr);
+ return -EINVAL;
+ }
+
+ /* For now, only 2 MB-aligned registrations are supported */
+ if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
+ DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n",
+ __func__, vaddr, size);
+ return -EINVAL;
+ }
+
+ vfn_2mb = vaddr >> SHIFT_2MB;
+
+ while (size) {
+ map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
+ if (!map_1gb) {
+ DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
+ return -ENOMEM;
+ }
+
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+ map_2mb = &map_1gb->map[idx_1gb];
+ map_2mb->translation_2mb = translation;
+
+ size -= VALUE_2MB;
+ vfn_2mb++;
+ }
+
+ return 0;
+}
+
+int
+spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
+{
+ return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
+}
+
+inline uint64_t
+spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
+{
+ const struct map_1gb *map_1gb;
+ const struct map_2mb *map_2mb;
+ uint64_t idx_256tb;
+ uint64_t idx_1gb;
+ uint64_t vfn_2mb;
+ uint64_t cur_size;
+ uint64_t prev_translation;
+ uint64_t orig_translation;
+
+ if (spdk_unlikely(vaddr & ~MASK_256TB)) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
+ return map->default_translation;
+ }
+
+ vfn_2mb = vaddr >> SHIFT_2MB;
+ idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (spdk_unlikely(!map_1gb)) {
+ return map->default_translation;
+ }
+
+ cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
+ map_2mb = &map_1gb->map[idx_1gb];
+ if (size == NULL || map->ops.are_contiguous == NULL ||
+ map_2mb->translation_2mb == map->default_translation) {
+ if (size != NULL) {
+ *size = spdk_min(*size, cur_size);
+ }
+ return map_2mb->translation_2mb;
+ }
+
+ orig_translation = map_2mb->translation_2mb;
+ prev_translation = orig_translation;
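+	/* Walk forward through adjacent 2MB entries for as long as the map's
+	 * are_contiguous callback reports their translations as mergeable, so the
+	 * size returned in *size covers the whole contiguous run (capped at the
+	 * caller's requested size).
+	 */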
+ while (cur_size < *size) {
+ vfn_2mb++;
+ idx_256tb = MAP_256TB_IDX(vfn_2mb);
+ idx_1gb = MAP_1GB_IDX(vfn_2mb);
+
+ map_1gb = map->map_256tb.map[idx_256tb];
+ if (spdk_unlikely(!map_1gb)) {
+ break;
+ }
+
+ map_2mb = &map_1gb->map[idx_1gb];
+ if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
+ break;
+ }
+
+ cur_size += VALUE_2MB;
+ prev_translation = map_2mb->translation_2mb;
+ }
+
+ *size = spdk_min(*size, cur_size);
+ return orig_translation;
+}
+
+static void
+memory_hotplug_cb(enum rte_mem_event event_type,
+ const void *addr, size_t len, void *arg)
+{
+ if (event_type == RTE_MEM_EVENT_ALLOC) {
+ spdk_mem_register((void *)addr, len);
+
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 02, 0, 0)
+ if (!spdk_env_dpdk_external_init()) {
+ return;
+ }
+#endif
+
+ /* Prior to DPDK 19.02, we have to worry about DPDK
+ * freeing memory in different units than it was allocated.
+ * That doesn't work with things like RDMA MRs. So for
+ * those versions of DPDK, mark each segment so that DPDK
+ * won't later free it. That ensures we don't have to deal
+ * with that scenario.
+ *
+ * DPDK 19.02 added the --match-allocations RTE flag to
+ * avoid this condition.
+ *
+ * Note: if the user initialized DPDK separately, we can't
+ * be sure that --match-allocations was specified, so need
+ * to still mark the segments so they aren't freed.
+ */
+ while (len > 0) {
+ struct rte_memseg *seg;
+
+ seg = rte_mem_virt2memseg(addr, NULL);
+ assert(seg != NULL);
+ seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
+ addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
+ len -= seg->hugepage_sz;
+ }
+ } else if (event_type == RTE_MEM_EVENT_FREE) {
+ spdk_mem_unregister((void *)addr, len);
+ }
+}
+
+static int
+memory_iter_cb(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, size_t len, void *arg)
+{
+ return spdk_mem_register(ms->addr, len);
+}
+
+int
+mem_map_init(bool legacy_mem)
+{
+ g_legacy_mem = legacy_mem;
+
+ g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
+ if (g_mem_reg_map == NULL) {
+ DEBUG_PRINT("memory registration map allocation failed\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Walk all DPDK memory segments and register them
+ * with the master memory map
+ */
+ rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
+ rte_memseg_contig_walk(memory_iter_cb, NULL);
+ return 0;
+}
+
+bool
+spdk_iommu_is_enabled(void)
+{
+#if VFIO_ENABLED
+ return g_vfio.enabled && !g_vfio.noiommu_enabled;
+#else
+ return false;
+#endif
+}
+
+struct spdk_vtophys_pci_device {
+ struct rte_pci_device *pci_device;
+ TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
+};
+
+static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
+static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
+ TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
+
+static struct spdk_mem_map *g_vtophys_map;
+static struct spdk_mem_map *g_phys_ref_map;
+
+#if VFIO_ENABLED
+static int
+vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
+{
+ struct spdk_vfio_dma_map *dma_map;
+ uint64_t refcount;
+ int ret;
+
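+	/* g_phys_ref_map reference-counts IOMMU mappings per IOVA, so a range that
+	 * is registered multiple times only triggers a single VFIO_IOMMU_MAP_DMA
+	 * ioctl.
+	 */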
+ refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
+ assert(refcount < UINT64_MAX);
+ if (refcount > 0) {
+ spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
+ return 0;
+ }
+
+ dma_map = calloc(1, sizeof(*dma_map));
+ if (dma_map == NULL) {
+ return -ENOMEM;
+ }
+
+ dma_map->map.argsz = sizeof(dma_map->map);
+ dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ dma_map->map.vaddr = vaddr;
+ dma_map->map.iova = iova;
+ dma_map->map.size = size;
+
+ dma_map->unmap.argsz = sizeof(dma_map->unmap);
+ dma_map->unmap.flags = 0;
+ dma_map->unmap.iova = iova;
+ dma_map->unmap.size = size;
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ if (g_vfio.device_ref == 0) {
+ /* VFIO requires at least one device (IOMMU group) to be added to
+ * a VFIO container before it is possible to perform any IOMMU
+ * operations on that container. This memory will be mapped once
+ * the first device (IOMMU group) is hotplugged.
+ *
+ * Since the vfio container is managed internally by DPDK, it is
+ * also possible that some device is already in that container, but
+	 * it's not managed by SPDK - e.g. a NIC attached internally
+ * inside DPDK. We could map the memory straight away in such
+ * scenario, but there's no need to do it. DPDK devices clearly
+ * don't need our mappings and hence we defer the mapping
+ * unconditionally until the first SPDK-managed device is
+ * hotplugged.
+ */
+ goto out_insert;
+ }
+
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+ if (ret) {
+ DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ free(dma_map);
+ return ret;
+ }
+
+out_insert:
+ TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
+ return 0;
+}
+
+static int
+vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
+{
+ struct spdk_vfio_dma_map *dma_map;
+ uint64_t refcount;
+ int ret;
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ if (dma_map->map.iova == iova) {
+ break;
+ }
+ }
+
+ if (dma_map == NULL) {
+ DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return -ENXIO;
+ }
+
+ refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
+ assert(refcount < UINT64_MAX);
+ if (refcount > 0) {
+ spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
+ }
+
+	/* If outstanding references remain, keep the IOMMU mapping in place. */
+ if (refcount > 1) {
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return 0;
+ }
+
+	/* We don't support partial or multiple-page unmaps for now. */
+ assert(dma_map->map.size == size);
+
+ if (g_vfio.device_ref == 0) {
+		/* Memory is not mapped anymore, just remove its references. */
+ goto out_remove;
+ }
+
+
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+ if (ret) {
+ DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return ret;
+ }
+
+out_remove:
+ TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
+ pthread_mutex_unlock(&g_vfio.mutex);
+ free(dma_map);
+ return 0;
+}
+#endif
+
+static uint64_t
+vtophys_get_paddr_memseg(uint64_t vaddr)
+{
+ uintptr_t paddr;
+ struct rte_memseg *seg;
+
+ seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
+ if (seg != NULL) {
+ paddr = seg->phys_addr;
+ if (paddr == RTE_BAD_IOVA) {
+ return SPDK_VTOPHYS_ERROR;
+ }
+ paddr += (vaddr - (uintptr_t)seg->addr);
+ return paddr;
+ }
+
+ return SPDK_VTOPHYS_ERROR;
+}
+
+/* Try to get the paddr from /proc/self/pagemap */
+static uint64_t
+vtophys_get_paddr_pagemap(uint64_t vaddr)
+{
+ uintptr_t paddr;
+
+ /* Silence static analyzers */
+ assert(vaddr != 0);
+ paddr = rte_mem_virt2iova((void *)vaddr);
+ if (paddr == RTE_BAD_IOVA) {
+ /*
+ * The vaddr may be valid but doesn't have a backing page
+ * assigned yet. Touch the page to ensure a backing page
+ * gets assigned, then try to translate again.
+ */
+ rte_atomic64_read((rte_atomic64_t *)vaddr);
+ paddr = rte_mem_virt2iova((void *)vaddr);
+ }
+ if (paddr == RTE_BAD_IOVA) {
+ /* Unable to get to the physical address. */
+ return SPDK_VTOPHYS_ERROR;
+ }
+
+ return paddr;
+}
+
+/* Try to get the paddr from pci devices */
+static uint64_t
+vtophys_get_paddr_pci(uint64_t vaddr)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+ uintptr_t paddr;
+ struct rte_pci_device *dev;
+ struct rte_mem_resource *res;
+ unsigned r;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+ TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+ dev = vtophys_dev->pci_device;
+
+ for (r = 0; r < PCI_MAX_RESOURCE; r++) {
+ res = &dev->mem_resource[r];
+ if (res->phys_addr && vaddr >= (uint64_t)res->addr &&
+ vaddr < (uint64_t)res->addr + res->len) {
+ paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
+ DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr,
+ (void *)paddr);
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+ return paddr;
+ }
+ }
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+ return SPDK_VTOPHYS_ERROR;
+}
+
+static int
+vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t len)
+{
+ int rc = 0, pci_phys = 0;
+ uint64_t paddr;
+
+ if ((uintptr_t)vaddr & ~MASK_256TB) {
+ DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
+ return -EINVAL;
+ }
+
+ if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
+ DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
+ vaddr, len);
+ return -EINVAL;
+ }
+
+ /* Get the physical address from the DPDK memsegs */
+ paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
+
+ switch (action) {
+ case SPDK_MEM_MAP_NOTIFY_REGISTER:
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /* This is not an address that DPDK is managing. */
+#if VFIO_ENABLED
+ enum rte_iova_mode iova_mode;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
+ iova_mode = rte_eal_iova_mode();
+#else
+ iova_mode = rte_eal_get_configuration()->iova_mode;
+#endif
+
+ if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
+ /* We'll use the virtual address as the iova to match DPDK. */
+ paddr = (uint64_t)vaddr;
+ rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
+ if (rc) {
+ return -EFAULT;
+ }
+ while (len > 0) {
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
+ if (rc != 0) {
+ return rc;
+ }
+ vaddr += VALUE_2MB;
+ paddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+ } else
+#endif
+ {
+ /* Get the physical address from /proc/self/pagemap. */
+ paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /* Get the physical address from PCI devices */
+ paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
+ return -EFAULT;
+ }
+ /* The beginning of this address range points to a PCI resource,
+ * so the rest must point to a PCI resource as well.
+ */
+ pci_phys = 1;
+ }
+
+ /* Get paddr for each 2MB chunk in this address range */
+ while (len > 0) {
+ /* Get the physical address from /proc/self/pagemap. */
+ if (pci_phys) {
+ paddr = vtophys_get_paddr_pci((uint64_t)vaddr);
+ } else {
+ paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
+ }
+
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
+ return -EFAULT;
+ }
+
+				/* PCI paddrs can break the 2MiB physical alignment, so skip this check for them. */
+ if (!pci_phys && (paddr & MASK_2MB)) {
+ DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
+ return -EINVAL;
+ }
+#if VFIO_ENABLED
+ /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
+ * with the IOMMU using the physical address to match. */
+ if (spdk_iommu_is_enabled()) {
+ rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
+ if (rc) {
+ DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
+ return -EFAULT;
+ }
+ }
+#endif
+
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+ }
+ } else {
+ /* This is an address managed by DPDK. Just setup the translations. */
+ while (len > 0) {
+ paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
+ return -EFAULT;
+ }
+
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+ }
+
+ break;
+ case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
+#if VFIO_ENABLED
+ if (paddr == SPDK_VTOPHYS_ERROR) {
+ /*
+ * This is not an address that DPDK is managing. If vfio is enabled,
+ * we need to unmap the range from the IOMMU
+ */
+ if (spdk_iommu_is_enabled()) {
+ uint64_t buffer_len = len;
+ uint8_t *va = vaddr;
+ enum rte_iova_mode iova_mode;
+
+#if RTE_VERSION >= RTE_VERSION_NUM(19, 11, 0, 0)
+ iova_mode = rte_eal_iova_mode();
+#else
+ iova_mode = rte_eal_get_configuration()->iova_mode;
+#endif
+ /*
+ * In virtual address mode, the region is contiguous and can be done in
+ * one unmap.
+ */
+ if (iova_mode == RTE_IOVA_VA) {
+ paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
+ if (buffer_len != len || paddr != (uintptr_t)va) {
+ DEBUG_PRINT("Unmapping %p with length %lu failed because "
+ "translation had address 0x%" PRIx64 " and length %lu\n",
+ va, len, paddr, buffer_len);
+ return -EINVAL;
+ }
+ rc = vtophys_iommu_unmap_dma(paddr, len);
+ if (rc) {
+ DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
+ return -EFAULT;
+ }
+ } else if (iova_mode == RTE_IOVA_PA) {
+ /* Get paddr for each 2MB chunk in this address range */
+ while (buffer_len > 0) {
+ paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
+
+ if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
+ DEBUG_PRINT("could not get phys addr for %p\n", va);
+ return -EFAULT;
+ }
+
+ rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
+ if (rc) {
+ DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
+ return -EFAULT;
+ }
+
+ va += VALUE_2MB;
+ buffer_len -= VALUE_2MB;
+ }
+ }
+ }
+ }
+#endif
+ while (len > 0) {
+ rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vaddr += VALUE_2MB;
+ len -= VALUE_2MB;
+ }
+
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ }
+
+ return rc;
+}
+
+static int
+vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
+{
+ /* This function is always called with paddrs for two subsequent
+ * 2MB chunks in virtual address space, so those chunks will be only
+ * physically contiguous if the physical addresses are 2MB apart
+ * from each other as well.
+ */
+ return (paddr2 - paddr1 == VALUE_2MB);
+}
+
+#if VFIO_ENABLED
+
+static bool
+vfio_enabled(void)
+{
+ return rte_vfio_is_enabled("vfio_pci");
+}
+
+/* Check if IOMMU is enabled on the system */
+static bool
+has_iommu_groups(void)
+{
+ struct dirent *d;
+ int count = 0;
+ DIR *dir = opendir("/sys/kernel/iommu_groups");
+
+ if (dir == NULL) {
+ return false;
+ }
+
+ while (count < 3 && (d = readdir(dir)) != NULL) {
+ count++;
+ }
+
+ closedir(dir);
+ /* there will always be ./ and ../ entries */
+ return count > 2;
+}
+
+static bool
+vfio_noiommu_enabled(void)
+{
+ return rte_vfio_noiommu_is_enabled();
+}
+
+static void
+vtophys_iommu_init(void)
+{
+ char proc_fd_path[PATH_MAX + 1];
+ char link_path[PATH_MAX + 1];
+ const char vfio_path[] = "/dev/vfio/vfio";
+ DIR *dir;
+ struct dirent *d;
+
+ if (!vfio_enabled()) {
+ return;
+ }
+
+ if (vfio_noiommu_enabled()) {
+ g_vfio.noiommu_enabled = true;
+ } else if (!has_iommu_groups()) {
+ return;
+ }
+
+ dir = opendir("/proc/self/fd");
+ if (!dir) {
+ DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
+ return;
+ }
+
+ while ((d = readdir(dir)) != NULL) {
+ if (d->d_type != DT_LNK) {
+ continue;
+ }
+
+ snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
+ if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
+ continue;
+ }
+
+ if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
+ sscanf(d->d_name, "%d", &g_vfio.fd);
+ break;
+ }
+ }
+
+ closedir(dir);
+
+ if (g_vfio.fd < 0) {
+ DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
+ return;
+ }
+
+ g_vfio.enabled = true;
+
+ return;
+}
+#endif
+
+void
+vtophys_pci_device_added(struct rte_pci_device *pci_device)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+
+ vtophys_dev = calloc(1, sizeof(*vtophys_dev));
+ if (vtophys_dev) {
+ vtophys_dev->pci_device = pci_device;
+ TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
+ } else {
+ DEBUG_PRINT("Memory allocation error\n");
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if VFIO_ENABLED
+ struct spdk_vfio_dma_map *dma_map;
+ int ret;
+
+ if (!g_vfio.enabled) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ g_vfio.device_ref++;
+ if (g_vfio.device_ref > 1) {
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return;
+ }
+
+ /* This is the first SPDK device using DPDK vfio. This means that the first
+	 * IOMMU group might have just been added to the DPDK vfio container.
+ * From this point it is certain that the memory can be mapped now.
+ */
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
+ if (ret) {
+ DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+void
+vtophys_pci_device_removed(struct rte_pci_device *pci_device)
+{
+ struct spdk_vtophys_pci_device *vtophys_dev;
+
+ pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
+ TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
+ if (vtophys_dev->pci_device == pci_device) {
+ TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
+ free(vtophys_dev);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
+
+#if VFIO_ENABLED
+ struct spdk_vfio_dma_map *dma_map;
+ int ret;
+
+ if (!g_vfio.enabled) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_vfio.mutex);
+ assert(g_vfio.device_ref > 0);
+ g_vfio.device_ref--;
+ if (g_vfio.device_ref > 0) {
+ pthread_mutex_unlock(&g_vfio.mutex);
+ return;
+ }
+
+ /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
+	 * any additional devices using its vfio container, all the mappings
+ * will be automatically removed by the Linux vfio driver. We unmap
+ * the memory manually to be able to easily re-map it later regardless
+ * of other, external factors.
+ */
+ TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
+ ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap);
+ if (ret) {
+ DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_vfio.mutex);
+#endif
+}
+
+int
+vtophys_init(void)
+{
+ const struct spdk_mem_map_ops vtophys_map_ops = {
+ .notify_cb = vtophys_notify,
+ .are_contiguous = vtophys_check_contiguous_entries,
+ };
+
+ const struct spdk_mem_map_ops phys_ref_map_ops = {
+ .notify_cb = NULL,
+ .are_contiguous = NULL,
+ };
+
+#if VFIO_ENABLED
+ vtophys_iommu_init();
+#endif
+
+ g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
+ if (g_phys_ref_map == NULL) {
+ DEBUG_PRINT("phys_ref map allocation failed.\n");
+ return -ENOMEM;
+ }
+
+ g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
+ if (g_vtophys_map == NULL) {
+ DEBUG_PRINT("vtophys map allocation failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+uint64_t
+spdk_vtophys(void *buf, uint64_t *size)
+{
+ uint64_t vaddr, paddr_2mb;
+
+ vaddr = (uint64_t)buf;
+ paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
+
+ /*
+ * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR,
+ * we will still bitwise-or it with the buf offset below, but the result will still be
+ * SPDK_VTOPHYS_ERROR. However now that we do + rather than | (due to PCI vtophys being
+ * unaligned) we must now check the return value before addition.
+ */
+ SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
+ if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
+ return SPDK_VTOPHYS_ERROR;
+ } else {
+ return paddr_2mb + (vaddr & MASK_2MB);
+ }
+}
diff --git a/src/spdk/lib/env_dpdk/pci.c b/src/spdk/lib/env_dpdk/pci.c
new file mode 100644
index 000000000..5fd1b4abd
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci.c
@@ -0,0 +1,1063 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include <rte_alarm.h>
+#include <rte_devargs.h>
+#include "spdk/env.h"
+
+#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers"
+
+#define PCI_CFG_SIZE 256
+#define PCI_EXT_CAP_ID_SN 0x03
+
+/* DPDK 18.11+ hotplug isn't robust. Multiple apps starting at the same time
+ * might cause the internal IPC to misbehave. Just retry in such case.
+ */
+#define DPDK_HOTPLUG_RETRY_COUNT 4
+
+/* DPDK alarm/interrupt thread */
+static pthread_mutex_t g_pci_mutex = PTHREAD_MUTEX_INITIALIZER;
+static TAILQ_HEAD(, spdk_pci_device) g_pci_devices = TAILQ_HEAD_INITIALIZER(g_pci_devices);
+/* devices hotplugged on a dpdk thread */
+static TAILQ_HEAD(, spdk_pci_device) g_pci_hotplugged_devices =
+ TAILQ_HEAD_INITIALIZER(g_pci_hotplugged_devices);
+static TAILQ_HEAD(, spdk_pci_driver) g_pci_drivers = TAILQ_HEAD_INITIALIZER(g_pci_drivers);
+
+static int
+map_bar_rte(struct spdk_pci_device *device, uint32_t bar,
+ void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
+{
+ struct rte_pci_device *dev = device->dev_handle;
+
+ *mapped_addr = dev->mem_resource[bar].addr;
+ *phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr;
+ *size = (uint64_t)dev->mem_resource[bar].len;
+
+ return 0;
+}
+
+static int
+unmap_bar_rte(struct spdk_pci_device *device, uint32_t bar, void *addr)
+{
+ return 0;
+}
+
+static int
+cfg_read_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ int rc;
+
+ rc = rte_pci_read_config(dev->dev_handle, value, len, offset);
+
+ return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
+}
+
+static int
+cfg_write_rte(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ int rc;
+
+ rc = rte_pci_write_config(dev->dev_handle, value, len, offset);
+
+#ifdef __FreeBSD__
+ /* DPDK returns 0 on success and -1 on failure */
+ return rc;
+#endif
+ return (rc > 0 && (uint32_t) rc == len) ? 0 : -1;
+}
+
+static void
+remove_rte_dev(struct rte_pci_device *rte_dev)
+{
+ char bdf[32];
+ int i = 0, rc;
+
+ snprintf(bdf, sizeof(bdf), "%s", rte_dev->device.name);
+ do {
+ rc = rte_eal_hotplug_remove("pci", bdf);
+ } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
+}
+
+static void
+detach_rte_cb(void *_dev)
+{
+ remove_rte_dev(_dev);
+}
+
+static void
+detach_rte(struct spdk_pci_device *dev)
+{
+ struct rte_pci_device *rte_dev = dev->dev_handle;
+ int i;
+ bool removed;
+
+ if (!spdk_process_is_primary()) {
+ remove_rte_dev(rte_dev);
+ return;
+ }
+
+ pthread_mutex_lock(&g_pci_mutex);
+ dev->internal.attached = false;
+ /* prevent the hotremove notification from removing this device */
+ dev->internal.pending_removal = true;
+ pthread_mutex_unlock(&g_pci_mutex);
+
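+	/* Schedule the actual hot-remove on DPDK's alarm (interrupt) thread and
+	 * poll for its completion below, rather than removing the device inline
+	 * from this thread.
+	 */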
+ rte_eal_alarm_set(1, detach_rte_cb, rte_dev);
+
+ /* wait up to 2s for the cb to execute */
+ for (i = 2000; i > 0; i--) {
+
+ spdk_delay_us(1000);
+ pthread_mutex_lock(&g_pci_mutex);
+ removed = dev->internal.removed;
+ pthread_mutex_unlock(&g_pci_mutex);
+
+ if (removed) {
+ break;
+ }
+ }
+
+ /* besides checking the removed flag, we also need to wait
+ * for the dpdk detach function to unwind, as it's doing some
+ * operations even after calling our detach callback. Simply
+ * cancel the alarm - if it started executing already, this
+ * call will block and wait for it to finish.
+ */
+ rte_eal_alarm_cancel(detach_rte_cb, rte_dev);
+
+ /* the device could have been finally removed, so just check
+ * it again.
+ */
+ pthread_mutex_lock(&g_pci_mutex);
+ removed = dev->internal.removed;
+ pthread_mutex_unlock(&g_pci_mutex);
+ if (!removed) {
+ fprintf(stderr, "Timeout waiting for DPDK to remove PCI device %s.\n",
+ rte_dev->name);
+		/* If we reach this state, then the device couldn't be removed and most likely
+		   a subsequent hot add of a device at the same BDF will fail */
+ }
+}
+
+void
+spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags)
+{
+ struct spdk_pci_driver *driver;
+
+ driver = calloc(1, sizeof(*driver));
+ if (!driver) {
+ /* we can't do any better than bailing atm */
+ return;
+ }
+
+ driver->name = name;
+ driver->id_table = id_table;
+ driver->drv_flags = flags;
+ TAILQ_INSERT_TAIL(&g_pci_drivers, driver, tailq);
+}
+
+struct spdk_pci_driver *
+spdk_pci_nvme_get_driver(void)
+{
+ return spdk_pci_get_driver("nvme");
+}
+
+struct spdk_pci_driver *
+spdk_pci_get_driver(const char *name)
+{
+ struct spdk_pci_driver *driver;
+
+ TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
+ if (strcmp(driver->name, name) == 0) {
+ return driver;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+pci_device_rte_hotremove(const char *device_name,
+ enum rte_dev_event_type event,
+ void *cb_arg)
+{
+ struct spdk_pci_device *dev;
+ bool can_detach = false;
+
+ if (event != RTE_DEV_EVENT_REMOVE) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_pci_mutex);
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ struct rte_pci_device *rte_dev = dev->dev_handle;
+
+ if (strcmp(rte_dev->name, device_name) == 0 &&
+ !dev->internal.pending_removal) {
+ can_detach = !dev->internal.attached;
+ /* prevent any further attaches */
+ dev->internal.pending_removal = true;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_pci_mutex);
+
+ if (dev != NULL && can_detach) {
+		/* if the device is not attached we can remove it right away.
+ * Otherwise it will be removed at detach.
+ */
+ remove_rte_dev(dev->dev_handle);
+ }
+}
+
+static void
+cleanup_pci_devices(void)
+{
+ struct spdk_pci_device *dev, *tmp;
+
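+	/* Devices probed on the DPDK hotplug/interrupt thread are staged on
+	 * g_pci_hotplugged_devices; fold them into the main list here and drop
+	 * any devices that were flagged as removed in the meantime.
+	 */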
+ pthread_mutex_lock(&g_pci_mutex);
+ /* cleanup removed devices */
+ TAILQ_FOREACH_SAFE(dev, &g_pci_devices, internal.tailq, tmp) {
+ if (!dev->internal.removed) {
+ continue;
+ }
+
+ vtophys_pci_device_removed(dev->dev_handle);
+ TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
+ free(dev);
+ }
+
+ /* add newly-attached devices */
+ TAILQ_FOREACH_SAFE(dev, &g_pci_hotplugged_devices, internal.tailq, tmp) {
+ TAILQ_REMOVE(&g_pci_hotplugged_devices, dev, internal.tailq);
+ TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
+ vtophys_pci_device_added(dev->dev_handle);
+ }
+ pthread_mutex_unlock(&g_pci_mutex);
+}
+
+static int scan_pci_bus(bool delay_init);
+
+/* translate spdk_pci_driver to an rte_pci_driver and register it to dpdk */
+static int
+register_rte_driver(struct spdk_pci_driver *driver)
+{
+ unsigned pci_id_count = 0;
+ struct rte_pci_id *rte_id_table;
+ char *rte_name;
+ size_t rte_name_len;
+ uint32_t rte_flags;
+
+ assert(driver->id_table);
+ while (driver->id_table[pci_id_count].vendor_id) {
+ pci_id_count++;
+ }
+ assert(pci_id_count > 0);
+
+ rte_id_table = calloc(pci_id_count + 1, sizeof(*rte_id_table));
+ if (!rte_id_table) {
+ return -ENOMEM;
+ }
+
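+	/* Copy the SPDK id table into the rte id table; the extra zeroed entry
+	 * left by calloc serves as the terminating sentinel that DPDK expects.
+	 */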
+ while (pci_id_count > 0) {
+ struct rte_pci_id *rte_id = &rte_id_table[pci_id_count - 1];
+ const struct spdk_pci_id *spdk_id = &driver->id_table[pci_id_count - 1];
+
+ rte_id->class_id = spdk_id->class_id;
+ rte_id->vendor_id = spdk_id->vendor_id;
+ rte_id->device_id = spdk_id->device_id;
+ rte_id->subsystem_vendor_id = spdk_id->subvendor_id;
+ rte_id->subsystem_device_id = spdk_id->subdevice_id;
+ pci_id_count--;
+ }
+
+ assert(driver->name);
+ rte_name_len = strlen(driver->name) + strlen("spdk_") + 1;
+ rte_name = calloc(rte_name_len, 1);
+ if (!rte_name) {
+ free(rte_id_table);
+ return -ENOMEM;
+ }
+
+ snprintf(rte_name, rte_name_len, "spdk_%s", driver->name);
+ driver->driver.driver.name = rte_name;
+ driver->driver.id_table = rte_id_table;
+
+ rte_flags = 0;
+ if (driver->drv_flags & SPDK_PCI_DRIVER_NEED_MAPPING) {
+ rte_flags |= RTE_PCI_DRV_NEED_MAPPING;
+ }
+ if (driver->drv_flags & SPDK_PCI_DRIVER_WC_ACTIVATE) {
+ rte_flags |= RTE_PCI_DRV_WC_ACTIVATE;
+ }
+ driver->driver.drv_flags = rte_flags;
+
+ driver->driver.probe = pci_device_init;
+ driver->driver.remove = pci_device_fini;
+
+ rte_pci_register(&driver->driver);
+ return 0;
+}
+
+static inline void
+_pci_env_init(void)
+{
+ /* We assume devices were present on the bus for more than 2 seconds
+	 * before initializing SPDK, so there's no need to wait any longer. We scan
+ * the bus, but we don't blacklist any devices.
+ */
+ scan_pci_bus(false);
+
+ /* Register a single hotremove callback for all devices. */
+ if (spdk_process_is_primary()) {
+ rte_dev_event_callback_register(NULL, pci_device_rte_hotremove, NULL);
+ }
+}
+
+void
+pci_env_init(void)
+{
+ struct spdk_pci_driver *driver;
+
+ TAILQ_FOREACH(driver, &g_pci_drivers, tailq) {
+ register_rte_driver(driver);
+ }
+
+ _pci_env_init();
+}
+
+void
+pci_env_reinit(void)
+{
+ /* There is no need to register pci drivers again, since they were
+ * already pre-registered in pci_env_init.
+ */
+
+ _pci_env_init();
+}
+
+void
+pci_env_fini(void)
+{
+ struct spdk_pci_device *dev;
+ char bdf[32];
+
+ cleanup_pci_devices();
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (dev->internal.attached) {
+ spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->addr);
+ fprintf(stderr, "Device %s is still attached at shutdown!\n", bdf);
+ }
+ }
+
+ if (spdk_process_is_primary()) {
+ rte_dev_event_callback_unregister(NULL, pci_device_rte_hotremove, NULL);
+ }
+}
+
+int
+pci_device_init(struct rte_pci_driver *_drv,
+ struct rte_pci_device *_dev)
+{
+ struct spdk_pci_driver *driver = (struct spdk_pci_driver *)_drv;
+ struct spdk_pci_device *dev;
+ int rc;
+
+ dev = calloc(1, sizeof(*dev));
+ if (dev == NULL) {
+ return -1;
+ }
+
+ dev->dev_handle = _dev;
+
+ dev->addr.domain = _dev->addr.domain;
+ dev->addr.bus = _dev->addr.bus;
+ dev->addr.dev = _dev->addr.devid;
+ dev->addr.func = _dev->addr.function;
+ dev->id.class_id = _dev->id.class_id;
+ dev->id.vendor_id = _dev->id.vendor_id;
+ dev->id.device_id = _dev->id.device_id;
+ dev->id.subvendor_id = _dev->id.subsystem_vendor_id;
+ dev->id.subdevice_id = _dev->id.subsystem_device_id;
+ dev->socket_id = _dev->device.numa_node;
+ dev->type = "pci";
+
+ dev->map_bar = map_bar_rte;
+ dev->unmap_bar = unmap_bar_rte;
+ dev->cfg_read = cfg_read_rte;
+ dev->cfg_write = cfg_write_rte;
+
+ dev->internal.driver = driver;
+ dev->internal.claim_fd = -1;
+
+ if (driver->cb_fn != NULL) {
+ rc = driver->cb_fn(driver->cb_arg, dev);
+ if (rc != 0) {
+ free(dev);
+ return rc;
+ }
+ dev->internal.attached = true;
+ }
+
+ pthread_mutex_lock(&g_pci_mutex);
+ TAILQ_INSERT_TAIL(&g_pci_hotplugged_devices, dev, internal.tailq);
+ pthread_mutex_unlock(&g_pci_mutex);
+ return 0;
+}
+
+int
+pci_device_fini(struct rte_pci_device *_dev)
+{
+ struct spdk_pci_device *dev;
+
+ pthread_mutex_lock(&g_pci_mutex);
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (dev->dev_handle == _dev) {
+ break;
+ }
+ }
+
+ if (dev == NULL || dev->internal.attached) {
+		/* The device might still be referenced somewhere in SPDK. */
+ pthread_mutex_unlock(&g_pci_mutex);
+ return -1;
+ }
+
+ /* remove our whitelist_at option */
+ if (_dev->device.devargs) {
+ _dev->device.devargs->data = NULL;
+ }
+
+ assert(!dev->internal.removed);
+ dev->internal.removed = true;
+ pthread_mutex_unlock(&g_pci_mutex);
+ return 0;
+
+}
+
+void
+spdk_pci_device_detach(struct spdk_pci_device *dev)
+{
+ assert(dev->internal.attached);
+
+ if (dev->internal.claim_fd >= 0) {
+ spdk_pci_device_unclaim(dev);
+ }
+
+ if (strcmp(dev->type, "pci") == 0) {
+ /* if it's a physical device we need to deal with DPDK on
+ * a different process and we can't just unset one flag
+ * here. We also want to stop using any device resources
+ * so that the device isn't "in use" by the userspace driver
+ * once we detach it. This would allow attaching the device
+ * to a different process, or to a kernel driver like nvme.
+ */
+ detach_rte(dev);
+ } else {
+ dev->internal.attached = false;
+ }
+
+ cleanup_pci_devices();
+}
+
+static int
+scan_pci_bus(bool delay_init)
+{
+ struct spdk_pci_driver *driver;
+ struct rte_pci_device *rte_dev;
+ uint64_t now;
+
+ rte_bus_scan();
+ now = spdk_get_ticks();
+
+ driver = TAILQ_FIRST(&g_pci_drivers);
+ if (!driver) {
+ return 0;
+ }
+
+ TAILQ_FOREACH(rte_dev, &driver->driver.bus->device_list, next) {
+ struct rte_devargs *da;
+
+ da = rte_dev->device.devargs;
+ if (!da) {
+ char devargs_str[128];
+
+ /* the device was never blacklisted or whitelisted */
+ da = calloc(1, sizeof(*da));
+ if (!da) {
+ return -1;
+ }
+
+ snprintf(devargs_str, sizeof(devargs_str), "pci:%s", rte_dev->device.name);
+ if (rte_devargs_parse(da, devargs_str) != 0) {
+ free(da);
+ return -1;
+ }
+
+ rte_devargs_insert(&da);
+ rte_dev->device.devargs = da;
+ }
+
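+		/* A non-NULL da->data means SPDK has seen this device before; it stores
+		 * the tick count at (or after) which a delay-blacklisted device becomes
+		 * eligible to be whitelisted again.
+		 */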
+ if (da->data) {
+ uint64_t whitelist_at = (uint64_t)(uintptr_t)da->data;
+
+ /* this device was seen by spdk before... */
+ if (da->policy == RTE_DEV_BLACKLISTED && whitelist_at <= now) {
+ da->policy = RTE_DEV_WHITELISTED;
+ }
+ } else if ((driver->driver.bus->bus.conf.scan_mode == RTE_BUS_SCAN_WHITELIST &&
+ da->policy == RTE_DEV_WHITELISTED) || da->policy != RTE_DEV_BLACKLISTED) {
+ /* override the policy only if not permanently blacklisted */
+
+ if (delay_init) {
+ da->policy = RTE_DEV_BLACKLISTED;
+ da->data = (void *)(now + 2 * spdk_get_ticks_hz());
+ } else {
+ da->policy = RTE_DEV_WHITELISTED;
+ da->data = (void *)(uintptr_t)now;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_pci_device_attach(struct spdk_pci_driver *driver,
+ spdk_pci_enum_cb enum_cb,
+ void *enum_ctx, struct spdk_pci_addr *pci_address)
+{
+ struct spdk_pci_device *dev;
+ struct rte_pci_device *rte_dev;
+ struct rte_devargs *da;
+ int rc;
+ char bdf[32];
+
+ spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address);
+
+ cleanup_pci_devices();
+
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
+ break;
+ }
+ }
+
+ if (dev != NULL && dev->internal.driver == driver) {
+ pthread_mutex_lock(&g_pci_mutex);
+ if (dev->internal.attached || dev->internal.pending_removal) {
+ pthread_mutex_unlock(&g_pci_mutex);
+ return -1;
+ }
+
+ rc = enum_cb(enum_ctx, dev);
+ if (rc == 0) {
+ dev->internal.attached = true;
+ }
+ pthread_mutex_unlock(&g_pci_mutex);
+ return rc;
+ }
+
+ driver->cb_fn = enum_cb;
+ driver->cb_arg = enum_ctx;
+
+ int i = 0;
+
+ do {
+ rc = rte_eal_hotplug_add("pci", bdf, "");
+ } while (rc == -ENOMSG && ++i <= DPDK_HOTPLUG_RETRY_COUNT);
+
+ if (i > 1 && rc == -EEXIST) {
+ /* Even though the previous request timed out, the device
+ * was attached successfully.
+ */
+ rc = 0;
+ }
+
+ driver->cb_arg = NULL;
+ driver->cb_fn = NULL;
+
+ cleanup_pci_devices();
+
+ if (rc != 0) {
+ return -1;
+ }
+
+ /* explicit attach ignores the whitelist, so if we blacklisted this
+ * device before let's enable it now - just for clarity.
+ */
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (spdk_pci_addr_compare(&dev->addr, pci_address) == 0) {
+ break;
+ }
+ }
+ assert(dev != NULL);
+
+ rte_dev = dev->dev_handle;
+ da = rte_dev->device.devargs;
+ if (da && da->data) {
+ da->data = (void *)(uintptr_t)spdk_get_ticks();
+ da->policy = RTE_DEV_WHITELISTED;
+ }
+
+ return 0;
+}
+
+/* Note: You can call spdk_pci_enumerate from more than one thread
+ * simultaneously safely, but you cannot call spdk_pci_enumerate
+ * and rte_eal_pci_probe simultaneously.
+ */
+int
+spdk_pci_enumerate(struct spdk_pci_driver *driver,
+ spdk_pci_enum_cb enum_cb,
+ void *enum_ctx)
+{
+ struct spdk_pci_device *dev;
+ int rc;
+
+ cleanup_pci_devices();
+
+ pthread_mutex_lock(&g_pci_mutex);
+ TAILQ_FOREACH(dev, &g_pci_devices, internal.tailq) {
+ if (dev->internal.attached ||
+ dev->internal.driver != driver ||
+ dev->internal.pending_removal) {
+ continue;
+ }
+
+ rc = enum_cb(enum_ctx, dev);
+ if (rc == 0) {
+ dev->internal.attached = true;
+ } else if (rc < 0) {
+ pthread_mutex_unlock(&g_pci_mutex);
+ return -1;
+ }
+ }
+ pthread_mutex_unlock(&g_pci_mutex);
+
+ if (scan_pci_bus(true) != 0) {
+ return -1;
+ }
+
+ driver->cb_fn = enum_cb;
+ driver->cb_arg = enum_ctx;
+
+ if (rte_bus_probe() != 0) {
+ driver->cb_arg = NULL;
+ driver->cb_fn = NULL;
+ return -1;
+ }
+
+ driver->cb_arg = NULL;
+ driver->cb_fn = NULL;
+
+ cleanup_pci_devices();
+ return 0;
+}
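A minimal usage sketch for the enumeration API above: the probe_ioat_cb callback and its attach policy are hypothetical, while the return-value convention (0 = attach, positive = skip, negative = abort enumeration) follows the callback loop in spdk_pci_enumerate itself.

#include "spdk/stdinc.h"
#include "spdk/env.h"

/* Hypothetical callback: decide per device whether to attach. */
static int
probe_ioat_cb(void *ctx, struct spdk_pci_device *dev)
{
	printf("ioat candidate %04x:%04x\n",
	       spdk_pci_device_get_vendor_id(dev),
	       spdk_pci_device_get_device_id(dev));
	return 0; /* 0 attaches the device; >0 skips it; <0 aborts enumeration */
}

static int
enumerate_ioat_devices(void)
{
	/* Safe from multiple threads, but not concurrently with rte_eal_pci_probe. */
	return spdk_pci_enumerate(spdk_pci_ioat_get_driver(), probe_ioat_cb, NULL);
}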
+
+struct spdk_pci_device *
+spdk_pci_get_first_device(void)
+{
+ return TAILQ_FIRST(&g_pci_devices);
+}
+
+struct spdk_pci_device *
+spdk_pci_get_next_device(struct spdk_pci_device *prev)
+{
+ return TAILQ_NEXT(prev, internal.tailq);
+}
+
+int
+spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar,
+ void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
+{
+ return dev->map_bar(dev, bar, mapped_addr, phys_addr, size);
+}
+
+int
+spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, void *addr)
+{
+ return dev->unmap_bar(dev, bar, addr);
+}
+
+uint32_t
+spdk_pci_device_get_domain(struct spdk_pci_device *dev)
+{
+ return dev->addr.domain;
+}
+
+uint8_t
+spdk_pci_device_get_bus(struct spdk_pci_device *dev)
+{
+ return dev->addr.bus;
+}
+
+uint8_t
+spdk_pci_device_get_dev(struct spdk_pci_device *dev)
+{
+ return dev->addr.dev;
+}
+
+uint8_t
+spdk_pci_device_get_func(struct spdk_pci_device *dev)
+{
+ return dev->addr.func;
+}
+
+uint16_t
+spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev)
+{
+ return dev->id.vendor_id;
+}
+
+uint16_t
+spdk_pci_device_get_device_id(struct spdk_pci_device *dev)
+{
+ return dev->id.device_id;
+}
+
+uint16_t
+spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev)
+{
+ return dev->id.subvendor_id;
+}
+
+uint16_t
+spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev)
+{
+ return dev->id.subdevice_id;
+}
+
+struct spdk_pci_id
+spdk_pci_device_get_id(struct spdk_pci_device *dev)
+{
+ return dev->id;
+}
+
+int
+spdk_pci_device_get_socket_id(struct spdk_pci_device *dev)
+{
+ return dev->socket_id;
+}
+
+int
+spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ return dev->cfg_read(dev, value, len, offset);
+}
+
+int
+spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset)
+{
+ return dev->cfg_write(dev, value, len, offset);
+}
+
+int
+spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 1, offset);
+}
+
+int
+spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 1, offset);
+}
+
+int
+spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 2, offset);
+}
+
+int
+spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 2, offset);
+}
+
+int
+spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_read(dev, value, 4, offset);
+}
+
+int
+spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset)
+{
+ return spdk_pci_device_cfg_write(dev, &value, 4, offset);
+}
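A small sketch of the config-space helpers above; the 0x0/0x2 offsets are the standard PCI vendor/device ID locations rather than anything defined in this file.

/* Read the vendor and device ID words directly from PCI config space. */
static int
read_pci_ids(struct spdk_pci_device *dev, uint16_t *vendor, uint16_t *device)
{
	int rc;

	rc = spdk_pci_device_cfg_read16(dev, vendor, 0x0);
	if (rc != 0) {
		return rc;
	}

	return spdk_pci_device_cfg_read16(dev, device, 0x2);
}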
+
+int
+spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len)
+{
+ int err;
+ uint32_t pos, header = 0;
+ uint32_t i, buf[2];
+
+ if (len < 17) {
+ return -1;
+ }
+
+ err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE);
+ if (err || !header) {
+ return -1;
+ }
+
+ pos = PCI_CFG_SIZE;
+ while (1) {
+ if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) {
+ if (pos) {
+ /* skip the header */
+ pos += 4;
+ for (i = 0; i < 2; i++) {
+ err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i);
+ if (err) {
+ return -1;
+ }
+ }
+ snprintf(sn, len, "%08x%08x", buf[1], buf[0]);
+ return 0;
+ }
+ }
+ pos = (header >> 20) & 0xffc;
+ /* 0 if no other items exist */
+ if (pos < PCI_CFG_SIZE) {
+ return -1;
+ }
+ err = spdk_pci_device_cfg_read32(dev, &header, pos);
+ if (err) {
+ return -1;
+ }
+ }
+ return -1;
+}
+
+struct spdk_pci_addr
+spdk_pci_device_get_addr(struct spdk_pci_device *dev)
+{
+ return dev->addr;
+}
+
+bool
+spdk_pci_device_is_removed(struct spdk_pci_device *dev)
+{
+ return dev->internal.pending_removal;
+}
+
+int
+spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2)
+{
+ if (a1->domain > a2->domain) {
+ return 1;
+ } else if (a1->domain < a2->domain) {
+ return -1;
+ } else if (a1->bus > a2->bus) {
+ return 1;
+ } else if (a1->bus < a2->bus) {
+ return -1;
+ } else if (a1->dev > a2->dev) {
+ return 1;
+ } else if (a1->dev < a2->dev) {
+ return -1;
+ } else if (a1->func > a2->func) {
+ return 1;
+ } else if (a1->func < a2->func) {
+ return -1;
+ }
+
+ return 0;
+}
+
+#ifdef __linux__
+int
+spdk_pci_device_claim(struct spdk_pci_device *dev)
+{
+ int dev_fd;
+ char dev_name[64];
+ int pid;
+ void *dev_map;
+ struct flock pcidev_lock = {
+ .l_type = F_WRLCK,
+ .l_whence = SEEK_SET,
+ .l_start = 0,
+ .l_len = 0,
+ };
+
+ snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
+ dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
+
+ dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+ if (dev_fd == -1) {
+ fprintf(stderr, "could not open %s\n", dev_name);
+ return -errno;
+ }
+
+ if (ftruncate(dev_fd, sizeof(int)) != 0) {
+ fprintf(stderr, "could not truncate %s\n", dev_name);
+ close(dev_fd);
+ return -errno;
+ }
+
+ dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
+ MAP_SHARED, dev_fd, 0);
+ if (dev_map == MAP_FAILED) {
+ fprintf(stderr, "could not mmap dev %s (%d)\n", dev_name, errno);
+ close(dev_fd);
+ return -errno;
+ }
+
+ if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) {
+ pid = *(int *)dev_map;
+ fprintf(stderr, "Cannot create lock on device %s, probably"
+ " process %d has claimed it\n", dev_name, pid);
+ munmap(dev_map, sizeof(int));
+ close(dev_fd);
+ /* F_SETLK returns unspecified errnos, normalize them */
+ return -EACCES;
+ }
+
+ *(int *)dev_map = (int)getpid();
+ munmap(dev_map, sizeof(int));
+ dev->internal.claim_fd = dev_fd;
+ /* Keep dev_fd open to maintain the lock. */
+ return 0;
+}
+
+void
+spdk_pci_device_unclaim(struct spdk_pci_device *dev)
+{
+ char dev_name[64];
+
+ snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x",
+ dev->addr.domain, dev->addr.bus, dev->addr.dev, dev->addr.func);
+
+ close(dev->internal.claim_fd);
+ dev->internal.claim_fd = -1;
+ unlink(dev_name);
+}
+#endif /* __linux__ */
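A hedged sketch of the claim/unclaim pair in use, to keep two processes off the same device; the surrounding error handling is illustrative only.

static int
use_device_exclusively(struct spdk_pci_device *dev)
{
	/* Takes an advisory lock file under /tmp; returns -EACCES if another
	 * process already holds the lock for this BDF. */
	if (spdk_pci_device_claim(dev) < 0) {
		return -1;
	}

	/* ... drive the device ... */

	spdk_pci_device_unclaim(dev);
	return 0;
}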
+
+#ifdef __FreeBSD__
+int
+spdk_pci_device_claim(struct spdk_pci_device *dev)
+{
+ /* TODO */
+ return 0;
+}
+
+void
+spdk_pci_device_unclaim(struct spdk_pci_device *dev)
+{
+ /* TODO */
+}
+#endif /* __FreeBSD__ */
+
+int
+spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf)
+{
+ unsigned domain, bus, dev, func;
+
+ if (addr == NULL || bdf == NULL) {
+ return -EINVAL;
+ }
+
+ if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) ||
+ (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) {
+ /* Matched a full address - all variables are initialized */
+ } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
+ func = 0;
+ } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) ||
+ (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) {
+ domain = 0;
+ } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) ||
+ (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) {
+ domain = 0;
+ func = 0;
+ } else {
+ return -EINVAL;
+ }
+
+ if (bus > 0xFF || dev > 0x1F || func > 7) {
+ return -EINVAL;
+ }
+
+ addr->domain = domain;
+ addr->bus = bus;
+ addr->dev = dev;
+ addr->func = func;
+
+ return 0;
+}
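A round-trip sketch for the two address helpers; the example BDF string is arbitrary.

static int
bdf_roundtrip(void)
{
	struct spdk_pci_addr addr;
	char bdf[32];

	/* Accepts "dddd:bb:dd.f", "bb:dd.f", "bb.dd.f", "bb:dd", and similar forms. */
	if (spdk_pci_addr_parse(&addr, "0000:01:00.0") != 0) {
		return -1;
	}

	/* Always formats back to the canonical "dddd:bb:dd.f" form. */
	return spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr);
}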
+
+int
+spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr)
+{
+ int rc;
+
+ rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x",
+ addr->domain, addr->bus,
+ addr->dev, addr->func);
+
+ if (rc > 0 && (size_t)rc < sz) {
+ return 0;
+ }
+
+ return -1;
+}
+
+void
+spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev)
+{
+ assert(dev->map_bar != NULL);
+ assert(dev->unmap_bar != NULL);
+ assert(dev->cfg_read != NULL);
+ assert(dev->cfg_write != NULL);
+ dev->internal.driver = drv;
+ TAILQ_INSERT_TAIL(&g_pci_devices, dev, internal.tailq);
+}
+
+void
+spdk_pci_unhook_device(struct spdk_pci_device *dev)
+{
+ assert(!dev->internal.attached);
+ TAILQ_REMOVE(&g_pci_devices, dev, internal.tailq);
+}
+
+const char *
+spdk_pci_device_get_type(const struct spdk_pci_device *dev)
+{
+ return dev->type;
+}
diff --git a/src/spdk/lib/env_dpdk/pci_idxd.c b/src/spdk/lib/env_dpdk/pci_idxd.c
new file mode 100644
index 000000000..eddbfa4af
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_idxd.c
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+#define SPDK_IDXD_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID)
+static struct spdk_pci_id idxd_driver_id[] = {
+ {SPDK_IDXD_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IDXD)},
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+struct spdk_pci_driver *
+spdk_pci_idxd_get_driver(void)
+{
+ return spdk_pci_get_driver("idxd");
+}
+
+SPDK_PCI_DRIVER_REGISTER("idxd", idxd_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING);
diff --git a/src/spdk/lib/env_dpdk/pci_ioat.c b/src/spdk/lib/env_dpdk/pci_ioat.c
new file mode 100644
index 000000000..28b7bdb44
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_ioat.c
@@ -0,0 +1,98 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID)
+static struct spdk_pci_id ioat_driver_id[] = {
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX0)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX1)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX2)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX3)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX4)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX5)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX6)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX7)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX8)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX9)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SKX)},
+ {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_ICX)},
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+struct spdk_pci_driver *
+spdk_pci_ioat_get_driver(void)
+{
+ return spdk_pci_get_driver("ioat");
+}
+
+SPDK_PCI_DRIVER_REGISTER("ioat", ioat_driver_id, SPDK_PCI_DRIVER_NEED_MAPPING);
diff --git a/src/spdk/lib/env_dpdk/pci_virtio.c b/src/spdk/lib/env_dpdk/pci_virtio.c
new file mode 100644
index 000000000..e525a4a8e
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_virtio.c
@@ -0,0 +1,53 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+static struct spdk_pci_id virtio_pci_driver_id[] = {
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) },
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) },
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_LEGACY) },
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_LEGACY) },
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+struct spdk_pci_driver *
+spdk_pci_virtio_get_driver(void)
+{
+ return spdk_pci_get_driver("virtio");
+}
+
+SPDK_PCI_DRIVER_REGISTER("virtio", virtio_pci_driver_id,
+ SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE);
diff --git a/src/spdk/lib/env_dpdk/pci_vmd.c b/src/spdk/lib/env_dpdk/pci_vmd.c
new file mode 100644
index 000000000..fb6860873
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/pci_vmd.c
@@ -0,0 +1,50 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include "spdk/pci_ids.h"
+
+static struct spdk_pci_id vmd_pci_driver_id[] = {
+ { SPDK_PCI_DEVICE(SPDK_PCI_VID_INTEL, PCI_DEVICE_ID_INTEL_VMD) },
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+struct spdk_pci_driver *
+spdk_pci_vmd_get_driver(void)
+{
+ return spdk_pci_get_driver("vmd");
+}
+
+SPDK_PCI_DRIVER_REGISTER("vmd", vmd_pci_driver_id,
+ SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE);
diff --git a/src/spdk/lib/env_dpdk/spdk_env_dpdk.map b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map
new file mode 100644
index 000000000..a465f0938
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/spdk_env_dpdk.map
@@ -0,0 +1,114 @@
+{
+ global:
+
+ # Public functions in env.h
+ spdk_malloc;
+ spdk_zmalloc;
+ spdk_realloc;
+ spdk_free;
+ spdk_env_opts_init;
+ spdk_env_init;
+ spdk_env_fini;
+ spdk_dma_malloc;
+ spdk_dma_malloc_socket;
+ spdk_dma_zmalloc;
+ spdk_dma_zmalloc_socket;
+ spdk_dma_realloc;
+ spdk_dma_free;
+ spdk_memzone_reserve;
+ spdk_memzone_reserve_aligned;
+ spdk_memzone_lookup;
+ spdk_memzone_free;
+ spdk_memzone_dump;
+ spdk_mempool_create;
+ spdk_mempool_create_ctor;
+ spdk_mempool_get_name;
+ spdk_mempool_free;
+ spdk_mempool_get;
+ spdk_mempool_get_bulk;
+ spdk_mempool_put;
+ spdk_mempool_put_bulk;
+ spdk_mempool_count;
+ spdk_mempool_obj_iter;
+ spdk_mempool_lookup;
+ spdk_env_get_core_count;
+ spdk_env_get_current_core;
+ spdk_env_get_first_core;
+ spdk_env_get_last_core;
+ spdk_env_get_next_core;
+ spdk_env_get_socket_id;
+ spdk_env_thread_launch_pinned;
+ spdk_env_thread_wait_all;
+ spdk_process_is_primary;
+ spdk_get_ticks;
+ spdk_get_ticks_hz;
+ spdk_delay_us;
+ spdk_pause;
+ spdk_ring_create;
+ spdk_ring_free;
+ spdk_ring_count;
+ spdk_ring_enqueue;
+ spdk_ring_dequeue;
+ spdk_iommu_is_enabled;
+ spdk_vtophys;
+ spdk_pci_get_driver;
+ spdk_pci_driver_register;
+ spdk_pci_nvme_get_driver;
+ spdk_pci_vmd_get_driver;
+ spdk_pci_idxd_get_driver;
+ spdk_pci_ioat_get_driver;
+ spdk_pci_virtio_get_driver;
+ spdk_pci_enumerate;
+ spdk_pci_get_first_device;
+ spdk_pci_get_next_device;
+ spdk_pci_device_map_bar;
+ spdk_pci_device_unmap_bar;
+ spdk_pci_device_get_domain;
+ spdk_pci_device_get_bus;
+ spdk_pci_device_get_dev;
+ spdk_pci_device_get_func;
+ spdk_pci_device_get_addr;
+ spdk_pci_device_get_vendor_id;
+ spdk_pci_device_get_device_id;
+ spdk_pci_device_get_subvendor_id;
+ spdk_pci_device_get_subdevice_id;
+ spdk_pci_device_get_id;
+ spdk_pci_device_get_socket_id;
+ spdk_pci_device_get_serial_number;
+ spdk_pci_device_claim;
+ spdk_pci_device_unclaim;
+ spdk_pci_device_detach;
+ spdk_pci_device_attach;
+ spdk_pci_device_cfg_read;
+ spdk_pci_device_cfg_write;
+ spdk_pci_device_cfg_read8;
+ spdk_pci_device_cfg_write8;
+ spdk_pci_device_cfg_read16;
+ spdk_pci_device_cfg_write16;
+ spdk_pci_device_cfg_read32;
+ spdk_pci_device_cfg_write32;
+ spdk_pci_device_is_removed;
+ spdk_pci_addr_compare;
+ spdk_pci_addr_parse;
+ spdk_pci_addr_fmt;
+ spdk_pci_hook_device;
+ spdk_pci_unhook_device;
+ spdk_pci_device_get_type;
+ spdk_unaffinitize_thread;
+ spdk_call_unaffinitized;
+ spdk_mem_map_alloc;
+ spdk_mem_map_free;
+ spdk_mem_map_set_translation;
+ spdk_mem_map_clear_translation;
+ spdk_mem_map_translate;
+ spdk_mem_register;
+ spdk_mem_unregister;
+
+ # Public functions in env_dpdk.h
+ spdk_env_dpdk_post_init;
+ spdk_env_dpdk_post_fini;
+ spdk_env_dpdk_external_init;
+ spdk_env_dpdk_dump_mem_stats;
+
+ local: *;
+};
diff --git a/src/spdk/lib/env_dpdk/threads.c b/src/spdk/lib/env_dpdk/threads.c
new file mode 100644
index 000000000..01c7b8d9f
--- /dev/null
+++ b/src/spdk/lib/env_dpdk/threads.c
@@ -0,0 +1,108 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "env_internal.h"
+
+#include <rte_config.h>
+#include <rte_lcore.h>
+
+uint32_t
+spdk_env_get_core_count(void)
+{
+ return rte_lcore_count();
+}
+
+uint32_t
+spdk_env_get_current_core(void)
+{
+ return rte_lcore_id();
+}
+
+uint32_t
+spdk_env_get_first_core(void)
+{
+ return rte_get_next_lcore(-1, 0, 0);
+}
+
+uint32_t
+spdk_env_get_last_core(void)
+{
+ uint32_t i;
+ uint32_t last_core = UINT32_MAX;
+
+ SPDK_ENV_FOREACH_CORE(i) {
+ last_core = i;
+ }
+
+ assert(last_core != UINT32_MAX);
+
+ return last_core;
+}
+
+uint32_t
+spdk_env_get_next_core(uint32_t prev_core)
+{
+ unsigned lcore;
+
+ lcore = rte_get_next_lcore(prev_core, 0, 0);
+ if (lcore == RTE_MAX_LCORE) {
+ return UINT32_MAX;
+ }
+ return lcore;
+}
+
+uint32_t
+spdk_env_get_socket_id(uint32_t core)
+{
+ if (core >= RTE_MAX_LCORE) {
+ return SPDK_ENV_SOCKET_ID_ANY;
+ }
+
+ return rte_lcore_to_socket_id(core);
+}
+
+int
+spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg)
+{
+ int rc;
+
+ rc = rte_eal_remote_launch(fn, arg, core);
+
+ return rc;
+}
+
+void
+spdk_env_thread_wait_all(void)
+{
+ rte_eal_mp_wait_lcore();
+}
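A sketch of how these wrappers are typically combined; the worker function and its output are hypothetical.

#include "spdk/stdinc.h"
#include "spdk/env.h"

static int
worker(void *arg)
{
	printf("hello from core %u\n", spdk_env_get_current_core());
	return 0;
}

static void
launch_workers(void)
{
	uint32_t core;

	SPDK_ENV_FOREACH_CORE(core) {
		if (core == spdk_env_get_current_core()) {
			continue; /* the calling (main) core keeps running this function */
		}
		spdk_env_thread_launch_pinned(core, worker, NULL);
	}

	spdk_env_thread_wait_all();
}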
diff --git a/src/spdk/lib/env_ocf/.gitignore b/src/spdk/lib/env_ocf/.gitignore
new file mode 100644
index 000000000..f5452c248
--- /dev/null
+++ b/src/spdk/lib/env_ocf/.gitignore
@@ -0,0 +1,2 @@
+src/
+include/
diff --git a/src/spdk/lib/env_ocf/Makefile b/src/spdk/lib/env_ocf/Makefile
new file mode 100644
index 000000000..0ac51eecd
--- /dev/null
+++ b/src/spdk/lib/env_ocf/Makefile
@@ -0,0 +1,108 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# OCF requires users to build against its sources.
+# If SPDK is configured with an OCF source directory,
+# we export its files and then compile the SPDK lib with them.
+# Otherwise, if SPDK is configured with a precompiled OCF library,
+# we just use it as the SPDK lib by copying it to /build/lib/.
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+OCFDIR=$(CONFIG_OCF_DIR)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+LIBNAME := ocfenv
+
+CFLAGS += $(ENV_CFLAGS) -I$(CURDIR) -I$(CURDIR)/include -w
+C_SRCS = $(shell find -name \*.c)
+
+LIB = $(call spdk_lib_list_to_static_libs,$(LIBNAME))
+
+
+ifeq ($(CONFIG_CUSTOMOCF),y)
+
+.PHONY: all clean install
+
+all:
+ $(Q)$(MAKE) $(LIB)
+
+clean:
+ $(Q)rm -f $(LIB)
+
+$(LIB):
+ cp $(CONFIG_OCF_PATH) $(LIB)
+
+install:
+
+uninstall:
+ $(UNINSTALL_LIB)
+
+else
+
+.PHONY: all clean install ocf_inc ocf_src ocf_distclean exportlib
+
+all: ocf_inc ocf_src
+ $(Q)$(MAKE) $(LIB)
+
+ocf_inc:
+ $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" inc O="$(SPDK_ROOT_DIR)/lib/env_ocf/" ENV= --quiet
+
+ocf_src: ocf_inc
+ $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" src O="$(SPDK_ROOT_DIR)/lib/env_ocf/" CMD=cp ENV= --quiet
+
+ocf_distclean:
+ $(Q)$(MAKE) -C "$(CONFIG_OCF_PATH)" distclean O="$(SPDK_ROOT_DIR)/lib/env_ocf/" ENV= --quiet
+
+clean: ocf_distclean
+ $(Q)rm -rf "$(SPDK_ROOT_DIR)/lib/env_ocf/include" \
+ "$(SPDK_ROOT_DIR)/lib/env_ocf/src" \
+ $(LIB) $(OBJS);
+
+$(LIB): $(OBJS)
+ $(LIB_C)
+
+install:
+
+uninstall:
+ $(UNINSTALL_LIB)
+
+endif
+
+exportlib: all
+ @ if [ -z $(O) ]; then echo "No output specified"; exit 1; fi
+ cp $(LIB) $(O)
+
+help:
+ @ echo "all Default"
+ @ echo "exportlib O=<outpath> Default build to specified outpath"
diff --git a/src/spdk/lib/env_ocf/ocf_env.c b/src/spdk/lib/env_ocf/ocf_env.c
new file mode 100644
index 000000000..ab5445203
--- /dev/null
+++ b/src/spdk/lib/env_ocf/ocf_env.c
@@ -0,0 +1,176 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "ocf/ocf_def.h"
+#include "ocf_env.h"
+
+#include "spdk/crc32.h"
+#include "spdk/env.h"
+#include "spdk_internal/log.h"
+
+/* Number of buffers in the mempool.
+ * It needs to be a power of two minus 1 for better memory utilization.
+ * The right value depends on the memory usage of OCF, which in turn
+ * depends on the workload. It is a large number because OCF uses these
+ * allocators for every request it sends and receives.
+ */
+#define ENV_ALLOCATOR_NBUFS 32767
+
+/* Use unique index for env allocators */
+static env_atomic g_env_allocator_index = 0;
+
+void *
+env_allocator_new(env_allocator *allocator)
+{
+ void *mem = spdk_mempool_get(allocator->mempool);
+
+ if (spdk_likely(mem)) {
+ memset(mem, 0, allocator->element_size);
+ }
+
+ return mem;
+}
+
+env_allocator *
+env_allocator_create(uint32_t size, const char *name)
+{
+ env_allocator *allocator;
+ char qualified_name[128] = {0};
+
+ snprintf(qualified_name, 128, "ocf_env_%d", env_atomic_inc_return(&g_env_allocator_index));
+
+ allocator = calloc(1, sizeof(*allocator));
+ if (!allocator) {
+ return NULL;
+ }
+
+ allocator->mempool = spdk_mempool_create(qualified_name,
+ ENV_ALLOCATOR_NBUFS, size,
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+
+ if (!allocator->mempool) {
+ free(allocator);
+ return NULL;
+ }
+
+ allocator->element_size = size;
+
+ return allocator;
+}
+
+void
+env_allocator_del(env_allocator *allocator, void *item)
+{
+ spdk_mempool_put(allocator->mempool, item);
+}
+
+void
+env_allocator_destroy(env_allocator *allocator)
+{
+ if (allocator) {
+ if (ENV_ALLOCATOR_NBUFS - spdk_mempool_count(allocator->mempool)) {
+ SPDK_ERRLOG("Not all objects deallocated\n");
+ assert(false);
+ }
+
+ spdk_mempool_free(allocator->mempool);
+ free(allocator);
+ }
+}
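A lifecycle sketch for the allocator wrappers above; struct foo and its element size are made up for illustration.

struct foo {
	uint64_t a;
	uint64_t b;
};

static int
allocator_roundtrip(void)
{
	env_allocator *alloc;
	struct foo *obj;

	alloc = env_allocator_create(sizeof(struct foo), "foo");
	if (!alloc) {
		return -1;
	}

	obj = env_allocator_new(alloc);        /* zeroed on success */
	if (obj) {
		env_allocator_del(alloc, obj); /* every object must come back... */
	}

	env_allocator_destroy(alloc);          /* ...before the pool is destroyed */
	return 0;
}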
+/* *** CRC *** */
+
+uint32_t
+env_crc32(uint32_t crc, uint8_t const *message, size_t len)
+{
+ return spdk_crc32_ieee_update(message, len, crc);
+}
+
+/* EXECUTION CONTEXTS */
+pthread_mutex_t *exec_context_mutex;
+
+static void __attribute__((constructor)) init_execution_context(void)
+{
+ unsigned count = env_get_execution_context_count();
+ unsigned i;
+
+ ENV_BUG_ON(count == 0);
+ exec_context_mutex = malloc(count * sizeof(exec_context_mutex[0]));
+ ENV_BUG_ON(exec_context_mutex == NULL);
+ for (i = 0; i < count; i++) {
+ ENV_BUG_ON(pthread_mutex_init(&exec_context_mutex[i], NULL));
+ }
+}
+
+static void __attribute__((destructor)) deinit_execution_context(void)
+{
+ unsigned count = env_get_execution_context_count();
+ unsigned i;
+
+ ENV_BUG_ON(count == 0);
+ ENV_BUG_ON(exec_context_mutex == NULL);
+
+ for (i = 0; i < count; i++) {
+ ENV_BUG_ON(pthread_mutex_destroy(&exec_context_mutex[i]));
+ }
+ free(exec_context_mutex);
+}
+
+/* env_get_execution_context must ensure that after the call finishes, the
+ * caller will not get preempted from the current execution context. For the
+ * userspace env we simulate this behavior by acquiring a per-execution-context
+ * mutex. As a result the caller might actually get preempted, but no other
+ * thread will execute in this context until the caller puts the current
+ * execution context. */
+unsigned env_get_execution_context(void)
+{
+ unsigned cpu;
+
+ cpu = sched_getcpu();
+ cpu = (cpu == -1) ? 0 : cpu;
+
+ ENV_BUG_ON(pthread_mutex_lock(&exec_context_mutex[cpu]));
+
+ return cpu;
+}
+
+void env_put_execution_context(unsigned ctx)
+{
+ pthread_mutex_unlock(&exec_context_mutex[ctx]);
+}
+
+unsigned env_get_execution_context_count(void)
+{
+ int num = sysconf(_SC_NPROCESSORS_ONLN);
+
+ return (num == -1) ? 0 : num;
+}
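A usage sketch of the execution-context contract described above; the per-context work is only indicated by a comment.

static void
touch_per_context_data(void)
{
	unsigned ctx;

	/* Enters one logical execution context by taking its mutex. */
	ctx = env_get_execution_context();

	/* ... access per-context (per-CPU) data safely here ... */

	/* Leaves the context; another thread may now enter it. */
	env_put_execution_context(ctx);
}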
diff --git a/src/spdk/lib/env_ocf/ocf_env.h b/src/spdk/lib/env_ocf/ocf_env.h
new file mode 100644
index 000000000..81d2e814b
--- /dev/null
+++ b/src/spdk/lib/env_ocf/ocf_env.h
@@ -0,0 +1,834 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef __LIBOCF_ENV_H__
+#define __LIBOCF_ENV_H__
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#ifndef __USE_GNU
+#define __USE_GNU
+#endif
+
+#include <linux/limits.h>
+#include <linux/stddef.h>
+
+#include "spdk/stdinc.h"
+#include "spdk/likely.h"
+#include "spdk/env.h"
+#include "spdk/util.h"
+#include "spdk_internal/log.h"
+
+#include "ocf_env_list.h"
+#include "ocf/ocf_err.h"
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef uint64_t sector_t;
+
+#define __packed __attribute__((packed))
+#define __aligned(x) __attribute__((aligned(x)))
+
+/* linux sector 512-bytes */
+#define ENV_SECTOR_SHIFT 9
+#define ENV_SECTOR_SIZE (1<<ENV_SECTOR_SHIFT)
+#define BYTES_TO_SECTOR(x) ((x) >> ENV_SECTOR_SHIFT)
+
+/* *** MEMORY MANAGEMENT *** */
+
+#define ENV_MEM_NORMAL 0
+#define ENV_MEM_NOIO 0
+#define ENV_MEM_ATOMIC 0
+
+#define likely spdk_likely
+#define unlikely spdk_unlikely
+
+#define min(x, y) MIN(x, y)
+#ifndef MIN
+#define MIN(x, y) spdk_min(x, y)
+#endif
+
+#define ARRAY_SIZE(x) SPDK_COUNTOF(x)
+
+/* LOGGING */
+#define ENV_PRIu64 PRIu64
+
+#define ENV_WARN(cond, fmt, args...) ({ \
+ if (spdk_unlikely((uintptr_t)(cond))) \
+ SPDK_NOTICELOG("WARNING" fmt, ##args); \
+ })
+
+#define ENV_WARN_ON(cond) ({ \
+ if (spdk_unlikely((uintptr_t)(cond))) \
+ SPDK_NOTICELOG("WARNING\n"); \
+ })
+
+#define ENV_BUG() ({ \
+ SPDK_ERRLOG("BUG\n"); \
+ assert(0); \
+ abort(); \
+ })
+
+#define ENV_BUG_ON(cond) ({ \
+ if (spdk_unlikely((uintptr_t)(cond))) { \
+ SPDK_ERRLOG("BUG\n"); \
+ assert(0); \
+ abort(); \
+ } \
+ })
+
+#define ENV_BUILD_BUG_ON(cond) _Static_assert(!(cond), "static "\
+ "assertion failure")
+
+#define container_of(ptr, type, member) SPDK_CONTAINEROF(ptr, type, member)
+
+static inline void *env_malloc(size_t size, int flags)
+{
+ return spdk_malloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+}
+
+static inline void *env_zalloc(size_t size, int flags)
+{
+ return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+}
+
+static inline void env_free(const void *ptr)
+{
+ return spdk_free((void *)ptr);
+}
+
+static inline void *env_vmalloc(size_t size)
+{
+ return spdk_malloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+}
+
+static inline void *env_vzalloc(size_t size)
+{
+	/* TODO: raw_ram init can request a huge amount of memory to store
+	 * the hashtable in it. We need to ensure that the allocation succeeds. */
+ return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+}
+
+static inline void *env_vzalloc_flags(size_t size, int flags)
+{
+ return env_vzalloc(size);
+}
+
+static inline void *env_secure_alloc(size_t size)
+{
+ return spdk_zmalloc(size, 0, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+}
+
+static inline void env_secure_free(const void *ptr, size_t size)
+{
+ return spdk_free((void *)ptr);
+}
+
+static inline void env_vfree(const void *ptr)
+{
+ return spdk_free((void *)ptr);
+}
+
+static inline uint64_t env_get_free_memory(void)
+{
+ return -1;
+}
+
+/* *** ALLOCATOR *** */
+
+#define OCF_ALLOCATOR_NAME_MAX 128
+
+typedef struct {
+ struct spdk_mempool *mempool;
+ size_t element_size;
+} env_allocator;
+
+env_allocator *env_allocator_create(uint32_t size, const char *name);
+
+void env_allocator_destroy(env_allocator *allocator);
+
+void *env_allocator_new(env_allocator *allocator);
+
+void env_allocator_del(env_allocator *allocator, void *item);
+
+uint32_t env_allocator_item_count(env_allocator *allocator);
+
+/* *** MUTEX *** */
+
+typedef struct {
+ pthread_mutex_t m;
+} env_mutex;
+
+static inline int env_mutex_init(env_mutex *mutex)
+{
+ return !!pthread_mutex_init(&mutex->m, NULL);
+}
+
+static inline void env_mutex_lock(env_mutex *mutex)
+{
+ ENV_BUG_ON(pthread_mutex_lock(&mutex->m));
+}
+
+static inline int env_mutex_lock_interruptible(env_mutex *mutex)
+{
+ env_mutex_lock(mutex);
+ return 0;
+}
+
+static inline int env_mutex_trylock(env_mutex *mutex)
+{
+ return pthread_mutex_trylock(&mutex->m) ? -OCF_ERR_NO_LOCK : 0;
+}
+
+static inline void env_mutex_unlock(env_mutex *mutex)
+{
+ ENV_BUG_ON(pthread_mutex_unlock(&mutex->m));
+}
+
+static inline int env_mutex_is_locked(env_mutex *mutex)
+{
+ if (env_mutex_trylock(mutex) == 0) {
+ env_mutex_unlock(mutex);
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int env_mutex_destroy(env_mutex *mutex)
+{
+ if (pthread_mutex_destroy(&mutex->m)) {
+ return 1;
+ }
+
+ return 0;
+}
+
+/* *** RECURSIVE MUTEX *** */
+
+typedef env_mutex env_rmutex;
+
+static inline int env_rmutex_init(env_rmutex *rmutex)
+{
+ pthread_mutexattr_t attr;
+
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+ pthread_mutex_init(&rmutex->m, &attr);
+
+ return 0;
+}
+
+static inline void env_rmutex_lock(env_rmutex *rmutex)
+{
+ env_mutex_lock(rmutex);
+}
+
+static inline int env_rmutex_lock_interruptible(env_rmutex *rmutex)
+{
+ return env_mutex_lock_interruptible(rmutex);
+}
+
+static inline int env_rmutex_trylock(env_rmutex *rmutex)
+{
+ return env_mutex_trylock(rmutex);
+}
+
+static inline void env_rmutex_unlock(env_rmutex *rmutex)
+{
+ env_mutex_unlock(rmutex);
+}
+
+static inline int env_rmutex_is_locked(env_rmutex *rmutex)
+{
+ return env_mutex_is_locked(rmutex);
+}
+
+static inline int env_rmutex_destroy(env_rmutex *rmutex)
+{
+ return env_mutex_destroy(rmutex);
+}
+
+/* *** RW SEMAPHORE *** */
+typedef struct {
+ pthread_rwlock_t lock;
+} env_rwsem;
+
+static inline int env_rwsem_init(env_rwsem *s)
+{
+ return !!pthread_rwlock_init(&s->lock, NULL);
+}
+
+static inline void env_rwsem_up_read(env_rwsem *s)
+{
+ ENV_BUG_ON(pthread_rwlock_unlock(&s->lock));
+}
+
+static inline void env_rwsem_down_read(env_rwsem *s)
+{
+ ENV_BUG_ON(pthread_rwlock_rdlock(&s->lock));
+}
+
+static inline int env_rwsem_down_read_trylock(env_rwsem *s)
+{
+ return pthread_rwlock_tryrdlock(&s->lock) ? -OCF_ERR_NO_LOCK : 0;
+}
+
+static inline void env_rwsem_up_write(env_rwsem *s)
+{
+ ENV_BUG_ON(pthread_rwlock_unlock(&s->lock));
+}
+
+static inline void env_rwsem_down_write(env_rwsem *s)
+{
+ ENV_BUG_ON(pthread_rwlock_wrlock(&s->lock));
+}
+
+static inline int env_rwsem_down_write_trylock(env_rwsem *s)
+{
+ return pthread_rwlock_trywrlock(&s->lock) ? -OCF_ERR_NO_LOCK : 0;
+}
+
+static inline int env_rwsem_is_locked(env_rwsem *s)
+{
+ if (env_rwsem_down_read_trylock(s) == 0) {
+ env_rwsem_up_read(s);
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int env_rwsem_down_read_interruptible(env_rwsem *s)
+{
+ return pthread_rwlock_rdlock(&s->lock);
+}
+static inline int env_rwsem_down_write_interruptible(env_rwsem *s)
+{
+ return pthread_rwlock_wrlock(&s->lock);
+}
+
+static inline int env_rwsem_destroy(env_rwsem *s)
+{
+ return pthread_rwlock_destroy(&s->lock);
+}
+
+/* *** ATOMIC VARIABLES *** */
+
+typedef int env_atomic;
+
+typedef long env_atomic64;
+
+#ifndef atomic_read
+#define atomic_read(ptr) (*(__typeof__(*ptr) *volatile) (ptr))
+#endif
+
+#ifndef atomic_set
+#define atomic_set(ptr, i) ((*(__typeof__(*ptr) *volatile) (ptr)) = (i))
+#endif
+
+#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1))
+#define atomic_dec(ptr) ((void) __sync_fetch_and_add(ptr, -1))
+#define atomic_add(ptr, n) ((void) __sync_fetch_and_add(ptr, n))
+#define atomic_sub(ptr, n) ((void) __sync_fetch_and_sub(ptr, n))
+
+#define atomic_cmpxchg __sync_val_compare_and_swap
+
+static inline int env_atomic_read(const env_atomic *a)
+{
+ return atomic_read(a);
+}
+
+static inline void env_atomic_set(env_atomic *a, int i)
+{
+ atomic_set(a, i);
+}
+
+static inline void env_atomic_add(int i, env_atomic *a)
+{
+ atomic_add(a, i);
+}
+
+static inline void env_atomic_sub(int i, env_atomic *a)
+{
+ atomic_sub(a, i);
+}
+
+static inline bool env_atomic_sub_and_test(int i, env_atomic *a)
+{
+ return __sync_sub_and_fetch(a, i) == 0;
+}
+
+static inline void env_atomic_inc(env_atomic *a)
+{
+ atomic_inc(a);
+}
+
+static inline void env_atomic_dec(env_atomic *a)
+{
+ atomic_dec(a);
+}
+
+static inline bool env_atomic_dec_and_test(env_atomic *a)
+{
+ return __sync_sub_and_fetch(a, 1) == 0;
+}
+
+static inline bool env_atomic_inc_and_test(env_atomic *a)
+{
+ return __sync_add_and_fetch(a, 1) == 0;
+}
+
+static inline int env_atomic_add_return(int i, env_atomic *a)
+{
+ return __sync_add_and_fetch(a, i);
+}
+
+static inline int env_atomic_sub_return(int i, env_atomic *a)
+{
+ return __sync_sub_and_fetch(a, i);
+}
+
+static inline int env_atomic_inc_return(env_atomic *a)
+{
+ return env_atomic_add_return(1, a);
+}
+
+static inline int env_atomic_dec_return(env_atomic *a)
+{
+ return env_atomic_sub_return(1, a);
+}
+
+static inline int env_atomic_cmpxchg(env_atomic *a, int old, int new_value)
+{
+ return atomic_cmpxchg(a, old, new_value);
+}
+
+static inline int env_atomic_add_unless(env_atomic *a, int i, int u)
+{
+ int c, old;
+ c = env_atomic_read(a);
+ for (;;) {
+ if (spdk_unlikely(c == (u))) {
+ break;
+ }
+ old = env_atomic_cmpxchg((a), c, c + (i));
+ if (spdk_likely(old == c)) {
+ break;
+ }
+ c = old;
+ }
+ return c != (u);
+}
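A small sketch of these wrappers in the shape of a reference counter that refuses to resurrect from zero; obj_t and free_fn are hypothetical, and env_atomic_add_unless returns false when the value already equals the "unless" argument.

typedef struct {
	env_atomic refcnt;
} obj_t;

static bool
obj_get(obj_t *o)
{
	/* Increment only if the count is not already 0 (object on its way out). */
	return env_atomic_add_unless(&o->refcnt, 1, 0) != 0;
}

static void
obj_put(obj_t *o, void (*free_fn)(obj_t *))
{
	if (env_atomic_dec_and_test(&o->refcnt)) {
		free_fn(o);
	}
}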
+
+static inline long env_atomic64_read(const env_atomic64 *a)
+{
+ return atomic_read(a);
+}
+
+static inline void env_atomic64_set(env_atomic64 *a, long i)
+{
+ atomic_set(a, i);
+}
+
+static inline void env_atomic64_add(long i, env_atomic64 *a)
+{
+ atomic_add(a, i);
+}
+
+static inline void env_atomic64_sub(long i, env_atomic64 *a)
+{
+ atomic_sub(a, i);
+}
+
+static inline void env_atomic64_inc(env_atomic64 *a)
+{
+ atomic_inc(a);
+}
+
+static inline void env_atomic64_dec(env_atomic64 *a)
+{
+ atomic_dec(a);
+}
+
+static inline int env_atomic64_add_return(int i, env_atomic *a)
+{
+ return __sync_add_and_fetch(a, i);
+}
+
+static inline int env_atomic64_sub_return(int i, env_atomic *a)
+{
+ return __sync_sub_and_fetch(a, i);
+}
+
+static inline int env_atomic64_inc_return(env_atomic *a)
+{
+ return env_atomic64_add_return(1, a);
+}
+
+static inline int env_atomic64_dec_return(env_atomic *a)
+{
+ return env_atomic_sub_return(1, a);
+}
+
+static inline long env_atomic64_cmpxchg(env_atomic64 *a, long old, long new)
+{
+ return atomic_cmpxchg(a, old, new);
+}
+
+/* *** COMPLETION *** */
+typedef struct completion {
+ sem_t sem;
+} env_completion;
+
+static inline void env_completion_init(env_completion *completion)
+{
+ sem_init(&completion->sem, 0, 0);
+}
+
+static inline void env_completion_wait(env_completion *completion)
+{
+ sem_wait(&completion->sem);
+}
+
+static inline void env_completion_complete(env_completion *completion)
+{
+ sem_post(&completion->sem);
+}
+
+static inline void env_completion_destroy(env_completion *completion)
+{
+ sem_destroy(&completion->sem);
+}
+
+/* *** SPIN LOCKS *** */
+
+typedef struct {
+ pthread_spinlock_t lock;
+} env_spinlock;
+
+static inline int env_spinlock_init(env_spinlock *l)
+{
+ return pthread_spin_init(&l->lock, 0);
+}
+
+static inline int env_spinlock_trylock(env_spinlock *l)
+{
+ return pthread_spin_trylock(&l->lock) ? -OCF_ERR_NO_LOCK : 0;
+}
+
+static inline void env_spinlock_lock(env_spinlock *l)
+{
+ ENV_BUG_ON(pthread_spin_lock(&l->lock));
+}
+
+static inline void env_spinlock_unlock(env_spinlock *l)
+{
+ ENV_BUG_ON(pthread_spin_unlock(&l->lock));
+}
+
+#define env_spinlock_lock_irqsave(l, flags) \
+ (void)flags; \
+ env_spinlock_lock(l)
+
+#define env_spinlock_unlock_irqrestore(l, flags) \
+ (void)flags; \
+ env_spinlock_unlock(l)
+
+static inline void env_spinlock_destroy(env_spinlock *l)
+{
+ ENV_BUG_ON(pthread_spin_destroy(&l->lock));
+}
+
+/* *** RW LOCKS *** */
+
+typedef struct {
+ pthread_rwlock_t lock;
+} env_rwlock;
+
+static inline void env_rwlock_init(env_rwlock *l)
+{
+ ENV_BUG_ON(pthread_rwlock_init(&l->lock, NULL));
+}
+
+static inline void env_rwlock_read_lock(env_rwlock *l)
+{
+ ENV_BUG_ON(pthread_rwlock_rdlock(&l->lock));
+}
+
+static inline void env_rwlock_read_unlock(env_rwlock *l)
+{
+ ENV_BUG_ON(pthread_rwlock_unlock(&l->lock));
+}
+
+static inline void env_rwlock_write_lock(env_rwlock *l)
+{
+ ENV_BUG_ON(pthread_rwlock_wrlock(&l->lock));
+}
+
+static inline void env_rwlock_write_unlock(env_rwlock *l)
+{
+ ENV_BUG_ON(pthread_rwlock_unlock(&l->lock));
+}
+
+static inline void env_rwlock_destroy(env_rwlock *l)
+{
+ ENV_BUG_ON(pthread_rwlock_destroy(&l->lock));
+}
+
+static inline void env_bit_set(int nr, volatile void *addr)
+{
+ char *byte = (char *)addr + (nr >> 3);
+ char mask = 1 << (nr & 7);
+
+ __sync_or_and_fetch(byte, mask);
+}
+
+static inline void env_bit_clear(int nr, volatile void *addr)
+{
+ char *byte = (char *)addr + (nr >> 3);
+ char mask = 1 << (nr & 7);
+
+ mask = ~mask;
+ __sync_and_and_fetch(byte, mask);
+}
+
+static inline bool env_bit_test(int nr, const volatile unsigned long *addr)
+{
+ const char *byte = (char *)addr + (nr >> 3);
+ char mask = 1 << (nr & 7);
+
+ return !!(*byte & mask);
+}
+
+/* *** WAITQUEUE *** */
+
+typedef struct {
+ sem_t sem;
+} env_waitqueue;
+
+static inline void env_waitqueue_init(env_waitqueue *w)
+{
+ sem_init(&w->sem, 0, 0);
+}
+
+static inline void env_waitqueue_wake_up(env_waitqueue *w)
+{
+ sem_post(&w->sem);
+}
+
+#define env_waitqueue_wait(w, condition) \
+({ \
+ int __ret = 0; \
+ if (!(condition)) \
+ sem_wait(&w.sem); \
+ __ret = __ret; \
+})
+
+/* *** SCHEDULING *** */
+
+/* CAS does not need this while in user-space */
+static inline void env_schedule(void)
+{
+}
+
+#define env_cond_resched env_schedule
+
+static inline int env_in_interrupt(void)
+{
+ return 0;
+}
+
+static inline uint64_t env_get_tick_count(void)
+{
+ return spdk_get_ticks();
+}
+
+static inline uint64_t env_ticks_to_secs(uint64_t j)
+{
+ return j / spdk_get_ticks_hz();
+}
+
+static inline uint64_t env_ticks_to_msecs(uint64_t j)
+{
+ return env_ticks_to_secs(j) * 1000;
+}
+
+static inline uint64_t env_ticks_to_nsecs(uint64_t j)
+{
+ return env_ticks_to_secs(j) * 1000 * 1000 * 1000;
+}
+
+static inline uint64_t env_ticks_to_usecs(uint64_t j)
+{
+ return env_ticks_to_secs(j) * 1000 * 1000;
+}
+
+static inline uint64_t env_secs_to_ticks(uint64_t j)
+{
+ return j * spdk_get_ticks_hz();
+}
+
+/* *** STRING OPERATIONS *** */
+
+/* 512 KB is a sufficient amount of memory for OCF operations */
+#define ENV_MAX_MEM (512 * 1024)
+
+static inline int env_memset(void *dest, size_t len, uint8_t value)
+{
+ if (dest == NULL || len == 0) {
+ return 1;
+ }
+
+ memset(dest, value, len);
+ return 0;
+}
+
+static inline int env_memcpy(void *dest, size_t dmax, const void *src, size_t len)
+{
+ if (dest == NULL || src == NULL) {
+ return 1;
+ }
+ if (dmax == 0 || dmax > ENV_MAX_MEM) {
+ return 1;
+ }
+ if (len == 0 || len > dmax) {
+ return 1;
+ }
+
+ memcpy(dest, src, len);
+ return 0;
+}
+
+static inline int env_memcmp(const void *aptr, size_t dmax, const void *bptr, size_t len,
+ int *diff)
+{
+ if (diff == NULL || aptr == NULL || bptr == NULL) {
+ return 1;
+ }
+ if (dmax == 0 || dmax > ENV_MAX_MEM) {
+ return 1;
+ }
+ if (len == 0 || len > dmax) {
+ return 1;
+ }
+
+ *diff = memcmp(aptr, bptr, len);
+ return 0;
+}
+
+/* 4096 is a sufficient maximum length for any OCF string operation */
+#define ENV_MAX_STR (4 * 1024)
+
+static inline size_t env_strnlen(const char *src, size_t dmax)
+{
+ return strnlen(src, dmax);
+}
+
+static inline int env_strncpy(char *dest, size_t dmax, const char *src, size_t len)
+{
+ if (dest == NULL || src == NULL) {
+ return 1;
+ }
+ if (dmax == 0 || dmax > ENV_MAX_STR) {
+ return 1;
+ }
+ if (len == 0) {
+ return 1;
+ }
+	/* Just copy as many characters as we can instead of returning failure */
+ len = min(len, dmax);
+
+ strncpy(dest, src, len);
+ return 0;
+}
+
+#define env_strncmp(s1, slen1, s2, slen2) strncmp(s1, s2, min(slen1, slen2))
+
+static inline char *env_strdup(const char *src, int flags)
+{
+ int len;
+ char *ret;
+
+ if (src == NULL) {
+ return NULL;
+ }
+
+ len = env_strnlen(src, ENV_MAX_STR) + 1;
+ ret = env_malloc(len, flags);
+
+ if (env_strncpy(ret, ENV_MAX_STR, src, len)) {
+ return NULL;
+ } else {
+ return ret;
+ }
+}
+
+/* *** SORTING *** */
+
+static inline void env_sort(void *base, size_t num, size_t size,
+ int (*cmp_fn)(const void *, const void *),
+ void (*swap_fn)(void *, void *, int size))
+{
+ qsort(base, num, size, cmp_fn);
+}
+
+static inline void env_msleep(uint64_t n)
+{
+ usleep(n * 1000);
+}
+
+static inline void env_touch_softlockup_wd(void)
+{
+}
+
+/* *** CRC *** */
+
+uint32_t env_crc32(uint32_t crc, uint8_t const *data, size_t len);
+
+/* EXECUTION CONTEXTS */
+unsigned env_get_execution_context(void);
+void env_put_execution_context(unsigned ctx);
+unsigned env_get_execution_context_count(void);
+
+#endif /* __OCF_ENV_H__ */
diff --git a/src/spdk/lib/env_ocf/ocf_env_headers.h b/src/spdk/lib/env_ocf/ocf_env_headers.h
new file mode 100644
index 000000000..742479374
--- /dev/null
+++ b/src/spdk/lib/env_ocf/ocf_env_headers.h
@@ -0,0 +1,43 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __OCF_ENV_HEADERS_H__
+#define __OCF_ENV_HEADERS_H__
+
+#include "spdk/stdinc.h"
+
+#define OCF_VERSION_MAIN 20
+#define OCF_VERSION_MAJOR 3
+#define OCF_VERSION_MINOR 0
+
+#endif /* __OCF_ENV_HEADERS_H__ */
diff --git a/src/spdk/lib/env_ocf/ocf_env_list.h b/src/spdk/lib/env_ocf/ocf_env_list.h
new file mode 100644
index 000000000..e5f60d6c3
--- /dev/null
+++ b/src/spdk/lib/env_ocf/ocf_env_list.h
@@ -0,0 +1,185 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __OCF_LIST_H__
+#define __OCF_LIST_H__
+
+#define LIST_POISON1 ((void *) 0x00100100)
+#define LIST_POISON2 ((void *) 0x00200200)
+
+/**
+ * List entry structure mimicking the Linux kernel one.
+ */
+struct list_head {
+ struct list_head *next;
+ struct list_head *prev;
+};
+
+/**
+ * Initialize an empty list (head points to itself).
+ */
+#define INIT_LIST_HEAD(l) { (l)->prev = l; (l)->next = l; }
+
+/**
+ * Add item to list head.
+ * @param it list entry to be added
+ * @param l1 list main node (head)
+ */
+static inline void list_add(struct list_head *it, struct list_head *l1)
+{
+ it->prev = l1;
+ it->next = l1->next;
+
+ l1->next->prev = it;
+ l1->next = it;
+}
+
+/**
+ * Add item to the list tail.
+ * @param it list entry to be added
+ * @param l1 list main node (head)
+ */
+static inline void list_add_tail(struct list_head *it, struct list_head *l1)
+{
+ it->prev = l1->prev;
+ it->next = l1;
+
+ l1->prev->next = it;
+ l1->prev = it;
+}
+
+/**
+ * Check whether a list is empty (returns true if it is).
+ */
+static inline int list_empty(struct list_head *it)
+{
+ return it->next == it;
+}
+
+/**
+ * delete an entry from a list
+ */
+static inline void list_del(struct list_head *it)
+{
+ it->next->prev = it->prev;
+ it->prev->next = it->next;
+}
+
+static inline void list_move_tail(struct list_head *list,
+ struct list_head *head)
+{
+ list_del(list);
+ list_add_tail(list, head);
+}
+
+static inline void list_move(struct list_head *list,
+ struct list_head *head)
+{
+ list_del(list);
+ list_add(list, head);
+}
+
+/**
+ * Extract an entry.
+ * @param list_head_i list head item, from which entry is extracted
+ * @param item_type type (struct) of list entry
+ * @param field_name name of list_head field within item_type
+ */
+#define list_entry(list_head_i, item_type, field_name) \
+ (item_type *)(((void*)(list_head_i)) - offsetof(item_type, field_name))
+
+#define list_first_entry(list_head_i, item_type, field_name) \
+ list_entry((list_head_i)->next, item_type, field_name)
+
+/**
+ * @param iterator uninitialized list_head pointer, to be used as iterator
+ * @param plist list head (main node)
+ */
+#define list_for_each(iterator, plist) \
+ for (iterator = (plist)->next; \
+ (iterator)->next != (plist)->next; \
+ iterator = (iterator)->next)
+
+/**
+ * Safe version of list_for_each which works even if entries are deleted during
+ * loop.
+ * @param iterator uninitialized list_head pointer, to be used as iterator
+ * @param q another uninitialized list_head, used as helper
+ * @param plist list head (main node)
+ */
+/*
+ * The algorithm handles the situation where q is deleted.
+ * Consider, for example, a 3-element list with head h:
+ *
+ * h -> 1 -> 2 -> 3 ->
+ *1. i q
+ *
+ *2. i q
+ *
+ *3. q i
+ */
+#define list_for_each_safe(iterator, q, plist) \
+ for (iterator = (q = (plist)->next->next)->prev; \
+ (q) != (plist)->next; \
+ iterator = (q = (q)->next)->prev)
+
+#define _list_entry_helper(item, head, field_name) list_entry(head, typeof(*item), field_name)
+
+/**
+ * Iterate over list entries.
+ * @param item pointer to list entry (iterator)
+ * @param plist pointer to list_head item
+ * @param field_name name of list_head field in list entry
+ */
+#define list_for_each_entry(item, plist, field_name) \
+ for (item = _list_entry_helper(item, (plist)->next, field_name); \
+ _list_entry_helper(item, (item)->field_name.next, field_name) !=\
+ _list_entry_helper(item, (plist)->next, field_name); \
+ item = _list_entry_helper(item, (item)->field_name.next, field_name))
+
+/**
+ * Safe version of list_for_each_entry which works even if entries are deleted
+ * during loop.
+ * @param item pointer to list entry (iterator)
+ * @param q another pointer to list item, used as helper
+ * @param plist pointer to list_head item
+ * @param field_name name of list_head field in list entry
+ */
+#define list_for_each_entry_safe(item, q, plist, field_name) \
+ for (item = _list_entry_helper(item, (plist)->next, field_name), \
+ q = _list_entry_helper(item, (item)->field_name.next, field_name); \
+ _list_entry_helper(item, (item)->field_name.next, field_name) != \
+ _list_entry_helper(item, (plist)->next, field_name); \
+ item = q, q = _list_entry_helper(q, (q)->field_name.next, field_name))
+
+#endif
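The header above provides Linux-style intrusive lists: a struct list_head is embedded in the caller's own structure, and list_entry()/list_for_each_entry_safe() recover the containing object via offsetof(). As a quick orientation only (not part of the patch), a minimal standalone program using these macros could look like the sketch below; it assumes a GNU C compiler, because _list_entry_helper() relies on the typeof extension, and that the header is reachable as "ocf_env_list.h".

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include "ocf_env_list.h"

struct item {
	int value;
	struct list_head node;	/* embedded linkage; no separate list allocation */
};

int main(void)
{
	struct list_head head;
	struct item *it, *next;
	int i;

	INIT_LIST_HEAD(&head);

	for (i = 0; i < 3; i++) {
		it = malloc(sizeof(*it));
		if (!it) {
			return 1;
		}
		it->value = i;
		list_add_tail(&it->node, &head);	/* preserves insertion order */
	}

	/* The _safe variant keeps the helper one element ahead, so the current
	 * entry may be unlinked and freed while iterating. */
	list_for_each_entry_safe(it, next, &head, node) {
		printf("%d\n", it->value);
		list_del(&it->node);
		free(it);
	}

	return 0;
}

Note that list_del() only unlinks the node; freeing the containing object remains the caller's responsibility.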
diff --git a/src/spdk/lib/event/Makefile b/src/spdk/lib/event/Makefile
new file mode 100644
index 000000000..87a6209c7
--- /dev/null
+++ b/src/spdk/lib/event/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 5
+SO_MINOR := 0
+
+LIBNAME = event
+C_SRCS = app.c reactor.c rpc.c subsystem.c json_config.c
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_event.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/event/app.c b/src/spdk/lib/event/app.c
new file mode 100644
index 000000000..b6cab05a3
--- /dev/null
+++ b/src/spdk/lib/event/app.c
@@ -0,0 +1,1177 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/version.h"
+
+#include "spdk_internal/event.h"
+
+#include "spdk/env.h"
+#include "spdk/log.h"
+#include "spdk/conf.h"
+#include "spdk/thread.h"
+#include "spdk/trace.h"
+#include "spdk/string.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
+#define SPDK_APP_DEFAULT_LOG_LEVEL SPDK_LOG_NOTICE
+#define SPDK_APP_DEFAULT_LOG_PRINT_LEVEL SPDK_LOG_INFO
+#define SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES SPDK_DEFAULT_NUM_TRACE_ENTRIES
+
+#define SPDK_APP_DPDK_DEFAULT_MEM_SIZE -1
+#define SPDK_APP_DPDK_DEFAULT_MASTER_CORE -1
+#define SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL -1
+#define SPDK_APP_DPDK_DEFAULT_CORE_MASK "0x1"
+#define SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000
+#define SPDK_APP_DEFAULT_CORE_LIMIT 0x140000000 /* 5 GiB */
+
+struct spdk_app {
+ struct spdk_conf *config;
+ const char *json_config_file;
+ bool json_config_ignore_errors;
+ const char *rpc_addr;
+ int shm_id;
+ spdk_app_shutdown_cb shutdown_cb;
+ int rc;
+};
+
+static struct spdk_app g_spdk_app;
+static spdk_msg_fn g_start_fn = NULL;
+static void *g_start_arg = NULL;
+static struct spdk_thread *g_app_thread = NULL;
+static bool g_delay_subsystem_init = false;
+static bool g_shutdown_sig_received = false;
+static char *g_executable_name;
+static struct spdk_app_opts g_default_opts;
+
+int
+spdk_app_get_shm_id(void)
+{
+ return g_spdk_app.shm_id;
+}
+
+/* The terminating empty option is appended when this array is copied into the
+ * combined option list in spdk_app_parse_args(). */
+static const struct option g_cmdline_options[] = {
+#define CONFIG_FILE_OPT_IDX 'c'
+ {"config", required_argument, NULL, CONFIG_FILE_OPT_IDX},
+#define LIMIT_COREDUMP_OPT_IDX 'd'
+ {"limit-coredump", no_argument, NULL, LIMIT_COREDUMP_OPT_IDX},
+#define TPOINT_GROUP_MASK_OPT_IDX 'e'
+ {"tpoint-group-mask", required_argument, NULL, TPOINT_GROUP_MASK_OPT_IDX},
+#define SINGLE_FILE_SEGMENTS_OPT_IDX 'g'
+ {"single-file-segments", no_argument, NULL, SINGLE_FILE_SEGMENTS_OPT_IDX},
+#define HELP_OPT_IDX 'h'
+ {"help", no_argument, NULL, HELP_OPT_IDX},
+#define SHM_ID_OPT_IDX 'i'
+ {"shm-id", required_argument, NULL, SHM_ID_OPT_IDX},
+#define CPUMASK_OPT_IDX 'm'
+ {"cpumask", required_argument, NULL, CPUMASK_OPT_IDX},
+#define MEM_CHANNELS_OPT_IDX 'n'
+ {"mem-channels", required_argument, NULL, MEM_CHANNELS_OPT_IDX},
+#define MASTER_CORE_OPT_IDX 'p'
+ {"master-core", required_argument, NULL, MASTER_CORE_OPT_IDX},
+#define RPC_SOCKET_OPT_IDX 'r'
+ {"rpc-socket", required_argument, NULL, RPC_SOCKET_OPT_IDX},
+#define MEM_SIZE_OPT_IDX 's'
+ {"mem-size", required_argument, NULL, MEM_SIZE_OPT_IDX},
+#define NO_PCI_OPT_IDX 'u'
+ {"no-pci", no_argument, NULL, NO_PCI_OPT_IDX},
+#define VERSION_OPT_IDX 'v'
+ {"version", no_argument, NULL, VERSION_OPT_IDX},
+#define PCI_BLACKLIST_OPT_IDX 'B'
+ {"pci-blacklist", required_argument, NULL, PCI_BLACKLIST_OPT_IDX},
+#define LOGFLAG_OPT_IDX 'L'
+ {"logflag", required_argument, NULL, LOGFLAG_OPT_IDX},
+#define HUGE_UNLINK_OPT_IDX 'R'
+ {"huge-unlink", no_argument, NULL, HUGE_UNLINK_OPT_IDX},
+#define PCI_WHITELIST_OPT_IDX 'W'
+ {"pci-whitelist", required_argument, NULL, PCI_WHITELIST_OPT_IDX},
+#define SILENCE_NOTICELOG_OPT_IDX 257
+ {"silence-noticelog", no_argument, NULL, SILENCE_NOTICELOG_OPT_IDX},
+#define WAIT_FOR_RPC_OPT_IDX 258
+ {"wait-for-rpc", no_argument, NULL, WAIT_FOR_RPC_OPT_IDX},
+#define HUGE_DIR_OPT_IDX 259
+ {"huge-dir", required_argument, NULL, HUGE_DIR_OPT_IDX},
+#define NUM_TRACE_ENTRIES_OPT_IDX 260
+ {"num-trace-entries", required_argument, NULL, NUM_TRACE_ENTRIES_OPT_IDX},
+#define MAX_REACTOR_DELAY_OPT_IDX 261
+ {"max-delay", required_argument, NULL, MAX_REACTOR_DELAY_OPT_IDX},
+#define JSON_CONFIG_OPT_IDX 262
+ {"json", required_argument, NULL, JSON_CONFIG_OPT_IDX},
+#define JSON_CONFIG_IGNORE_INIT_ERRORS_IDX 263
+ {"json-ignore-init-errors", no_argument, NULL, JSON_CONFIG_IGNORE_INIT_ERRORS_IDX},
+#define IOVA_MODE_OPT_IDX 264
+ {"iova-mode", required_argument, NULL, IOVA_MODE_OPT_IDX},
+#define BASE_VIRTADDR_OPT_IDX 265
+ {"base-virtaddr", required_argument, NULL, BASE_VIRTADDR_OPT_IDX},
+};
+
+/* Global section */
+#define GLOBAL_CONFIG_TMPL \
+"# Configuration file\n" \
+"#\n" \
+"# Please write all parameters using ASCII.\n" \
+"# The parameter must be quoted if it includes whitespace.\n" \
+"#\n" \
+"# Configuration syntax:\n" \
+"# Spaces at head of line are deleted, other spaces are as separator\n" \
+"# Lines starting with '#' are comments and not evaluated.\n" \
+"# Lines ending with '\\' are concatenated with the next line.\n" \
+"# Bracketed keys are section keys grouping the following value keys.\n" \
+"# Number of section key is used as a tag number.\n" \
+"# Ex. [TargetNode1] = TargetNode section key with tag number 1\n" \
+"[Global]\n" \
+" Comment \"Global section\"\n" \
+"\n" \
+" # Users can restrict work items to only run on certain cores by\n" \
+" # specifying a ReactorMask. Default is to allow work items to run\n" \
+" # on all cores. Core 0 must be set in the mask if one is specified.\n" \
+" # Default: 0xFFFF (cores 0-15)\n" \
+" ReactorMask \"0x%s\"\n" \
+"\n" \
+" # Tracepoint group mask for spdk trace buffers\n" \
+" # Default: 0x0 (all tracepoint groups disabled)\n" \
+" # Set to 0xFFFF to enable all tracepoint groups.\n" \
+" TpointGroupMask \"0x%" PRIX64 "\"\n" \
+"\n" \
+
+static void
+app_config_dump_global_section(FILE *fp)
+{
+ struct spdk_cpuset *coremask;
+
+ if (NULL == fp) {
+ return;
+ }
+
+ coremask = spdk_app_get_core_mask();
+
+ fprintf(fp, GLOBAL_CONFIG_TMPL, spdk_cpuset_fmt(coremask),
+ spdk_trace_get_tpoint_group_mask());
+}
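For illustration only (not part of the patch), with a two-core reactor mask and no tracepoint groups enabled, the tail of the section emitted by the template above would look roughly like:

[Global]
  Comment "Global section"
  ReactorMask "0x3"
  TpointGroupMask "0x0"

The leading comment block of GLOBAL_CONFIG_TMPL is emitted verbatim ahead of it.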
+
+int
+spdk_app_get_running_config(char **config_str, char *name)
+{
+ FILE *fp = NULL;
+ int fd = -1;
+ long length = 0, ret = 0;
+ char vbuf[BUFSIZ];
+ char config_template[64];
+
+ snprintf(config_template, sizeof(config_template), "/tmp/%s.XXXXXX", name);
+ /* Create temporary file to hold config */
+ fd = mkstemp(config_template);
+ if (fd == -1) {
+ SPDK_ERRLOG("mkstemp failed\n");
+ return -1;
+ }
+ fp = fdopen(fd, "wb+");
+ if (NULL == fp) {
+ SPDK_ERRLOG("error opening tmpfile fd = %d\n", fd);
+ return -1;
+ }
+
+ /* Buffered IO */
+ setvbuf(fp, vbuf, _IOFBF, BUFSIZ);
+
+ app_config_dump_global_section(fp);
+ spdk_subsystem_config(fp);
+
+ length = ftell(fp);
+
+ *config_str = malloc(length + 1);
+ if (!*config_str) {
+ SPDK_ERRLOG("out-of-memory for config\n");
+ fclose(fp);
+ return -1;
+ }
+ fseek(fp, 0, SEEK_SET);
+ ret = fread(*config_str, sizeof(char), length, fp);
+ if (ret < length) {
+ SPDK_ERRLOG("short read\n");
+ }
+ fclose(fp);
+ (*config_str)[length] = '\0';
+
+ return 0;
+}
+
+static void
+app_start_shutdown(void *ctx)
+{
+ if (g_spdk_app.shutdown_cb) {
+ g_spdk_app.shutdown_cb();
+ g_spdk_app.shutdown_cb = NULL;
+ } else {
+ spdk_app_stop(0);
+ }
+}
+
+void
+spdk_app_start_shutdown(void)
+{
+ spdk_thread_send_critical_msg(g_app_thread, app_start_shutdown);
+}
+
+static void
+__shutdown_signal(int signo)
+{
+ if (!g_shutdown_sig_received) {
+ g_shutdown_sig_received = true;
+ spdk_app_start_shutdown();
+ }
+}
+
+static int
+app_opts_validate(const char *app_opts)
+{
+ int i = 0, j;
+
+ for (i = 0; app_opts[i] != '\0'; i++) {
+ /* ignore getopt control characters */
+ if (app_opts[i] == ':' || app_opts[i] == '+' || app_opts[i] == '-') {
+ continue;
+ }
+
+ for (j = 0; SPDK_APP_GETOPT_STRING[j] != '\0'; j++) {
+ if (app_opts[i] == SPDK_APP_GETOPT_STRING[j]) {
+ return app_opts[i];
+ }
+ }
+ }
+ return 0;
+}
+
+void
+spdk_app_opts_init(struct spdk_app_opts *opts)
+{
+ if (!opts) {
+ return;
+ }
+
+ memset(opts, 0, sizeof(*opts));
+
+ opts->enable_coredump = true;
+ opts->shm_id = -1;
+ opts->mem_size = SPDK_APP_DPDK_DEFAULT_MEM_SIZE;
+ opts->master_core = SPDK_APP_DPDK_DEFAULT_MASTER_CORE;
+ opts->mem_channel = SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL;
+ opts->reactor_mask = NULL;
+ opts->base_virtaddr = SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR;
+ opts->print_level = SPDK_APP_DEFAULT_LOG_PRINT_LEVEL;
+ opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR;
+ opts->num_entries = SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES;
+ opts->delay_subsystem_init = false;
+}
+
+static int
+app_setup_signal_handlers(struct spdk_app_opts *opts)
+{
+ struct sigaction sigact;
+ sigset_t sigmask;
+ int rc;
+
+ sigemptyset(&sigmask);
+ memset(&sigact, 0, sizeof(sigact));
+ sigemptyset(&sigact.sa_mask);
+
+ sigact.sa_handler = SIG_IGN;
+ rc = sigaction(SIGPIPE, &sigact, NULL);
+ if (rc < 0) {
+ SPDK_ERRLOG("sigaction(SIGPIPE) failed\n");
+ return rc;
+ }
+
+ /* Install the same handler for SIGINT and SIGTERM */
+ g_shutdown_sig_received = false;
+ sigact.sa_handler = __shutdown_signal;
+ rc = sigaction(SIGINT, &sigact, NULL);
+ if (rc < 0) {
+ SPDK_ERRLOG("sigaction(SIGINT) failed\n");
+ return rc;
+ }
+ sigaddset(&sigmask, SIGINT);
+
+ rc = sigaction(SIGTERM, &sigact, NULL);
+ if (rc < 0) {
+ SPDK_ERRLOG("sigaction(SIGTERM) failed\n");
+ return rc;
+ }
+ sigaddset(&sigmask, SIGTERM);
+
+ if (opts->usr1_handler != NULL) {
+ sigact.sa_handler = opts->usr1_handler;
+ rc = sigaction(SIGUSR1, &sigact, NULL);
+ if (rc < 0) {
+ SPDK_ERRLOG("sigaction(SIGUSR1) failed\n");
+ return rc;
+ }
+ sigaddset(&sigmask, SIGUSR1);
+ }
+
+ pthread_sigmask(SIG_UNBLOCK, &sigmask, NULL);
+
+ return 0;
+}
+
+static void
+app_start_application(void)
+{
+ assert(spdk_get_thread() == g_app_thread);
+
+ g_start_fn(g_start_arg);
+}
+
+static void
+app_start_rpc(int rc, void *arg1)
+{
+ if (rc) {
+ spdk_app_stop(rc);
+ return;
+ }
+
+ spdk_rpc_initialize(g_spdk_app.rpc_addr);
+ if (!g_delay_subsystem_init) {
+ spdk_rpc_set_state(SPDK_RPC_RUNTIME);
+ app_start_application();
+ }
+}
+
+static struct spdk_conf *
+app_setup_conf(const char *config_file)
+{
+ struct spdk_conf *config;
+ int rc;
+
+ config = spdk_conf_allocate();
+ assert(config != NULL);
+ if (config_file) {
+ rc = spdk_conf_read(config, config_file);
+ if (rc != 0) {
+ SPDK_ERRLOG("Could not read config file %s\n", config_file);
+ goto error;
+ }
+ if (spdk_conf_first_section(config) == NULL) {
+ SPDK_ERRLOG("Invalid config file %s\n", config_file);
+ goto error;
+ }
+ }
+ spdk_conf_set_as_default(config);
+ return config;
+
+error:
+ spdk_conf_free(config);
+ return NULL;
+}
+
+static int
+app_opts_add_pci_addr(struct spdk_app_opts *opts, struct spdk_pci_addr **list, char *bdf)
+{
+ struct spdk_pci_addr *tmp = *list;
+ size_t i = opts->num_pci_addr;
+
+ tmp = realloc(tmp, sizeof(*tmp) * (i + 1));
+ if (tmp == NULL) {
+ SPDK_ERRLOG("realloc error\n");
+ return -ENOMEM;
+ }
+
+ *list = tmp;
+ if (spdk_pci_addr_parse(*list + i, bdf) < 0) {
+ SPDK_ERRLOG("Invalid address %s\n", bdf);
+ return -EINVAL;
+ }
+
+ opts->num_pci_addr++;
+ return 0;
+}
+
+static int
+app_read_config_file_global_params(struct spdk_app_opts *opts)
+{
+ struct spdk_conf_section *sp;
+ char *bdf;
+ int i, rc = 0;
+
+ sp = spdk_conf_find_section(NULL, "Global");
+
+ if (opts->shm_id == -1) {
+ if (sp != NULL) {
+ opts->shm_id = spdk_conf_section_get_intval(sp, "SharedMemoryID");
+ }
+ }
+
+ if (opts->reactor_mask == NULL) {
+ if (sp && spdk_conf_section_get_val(sp, "ReactorMask")) {
+ SPDK_ERRLOG("ReactorMask config option is deprecated. Use -m/--cpumask\n"
+ "command line parameter instead.\n");
+ opts->reactor_mask = spdk_conf_section_get_val(sp, "ReactorMask");
+ } else {
+ opts->reactor_mask = SPDK_APP_DPDK_DEFAULT_CORE_MASK;
+ }
+ }
+
+ if (!opts->no_pci && sp) {
+ opts->no_pci = spdk_conf_section_get_boolval(sp, "NoPci", false);
+ }
+
+ if (opts->tpoint_group_mask == NULL) {
+ if (sp != NULL) {
+ opts->tpoint_group_mask = spdk_conf_section_get_val(sp, "TpointGroupMask");
+ }
+ }
+
+ if (sp == NULL) {
+ return 0;
+ }
+
+ for (i = 0; ; i++) {
+ bdf = spdk_conf_section_get_nmval(sp, "PciBlacklist", i, 0);
+ if (!bdf) {
+ break;
+ }
+
+ rc = app_opts_add_pci_addr(opts, &opts->pci_blacklist, bdf);
+ if (rc != 0) {
+ free(opts->pci_blacklist);
+ return rc;
+ }
+ }
+
+ for (i = 0; ; i++) {
+ bdf = spdk_conf_section_get_nmval(sp, "PciWhitelist", i, 0);
+ if (!bdf) {
+ break;
+ }
+
+ if (opts->pci_blacklist != NULL) {
+ SPDK_ERRLOG("PciBlacklist and PciWhitelist cannot be used at the same time\n");
+ free(opts->pci_blacklist);
+ return -EINVAL;
+ }
+
+ rc = app_opts_add_pci_addr(opts, &opts->pci_whitelist, bdf);
+ if (rc != 0) {
+ free(opts->pci_whitelist);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int
+app_setup_env(struct spdk_app_opts *opts)
+{
+ struct spdk_env_opts env_opts = {};
+ int rc;
+
+ if (opts == NULL) {
+ rc = spdk_env_init(NULL);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to reinitialize SPDK env\n");
+ }
+
+ return rc;
+ }
+
+
+ spdk_env_opts_init(&env_opts);
+
+ env_opts.name = opts->name;
+ env_opts.core_mask = opts->reactor_mask;
+ env_opts.shm_id = opts->shm_id;
+ env_opts.mem_channel = opts->mem_channel;
+ env_opts.master_core = opts->master_core;
+ env_opts.mem_size = opts->mem_size;
+ env_opts.hugepage_single_segments = opts->hugepage_single_segments;
+ env_opts.unlink_hugepage = opts->unlink_hugepage;
+ env_opts.hugedir = opts->hugedir;
+ env_opts.no_pci = opts->no_pci;
+ env_opts.num_pci_addr = opts->num_pci_addr;
+ env_opts.pci_blacklist = opts->pci_blacklist;
+ env_opts.pci_whitelist = opts->pci_whitelist;
+ env_opts.env_context = opts->env_context;
+ env_opts.iova_mode = opts->iova_mode;
+
+ rc = spdk_env_init(&env_opts);
+ free(env_opts.pci_blacklist);
+ free(env_opts.pci_whitelist);
+
+
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to initialize SPDK env\n");
+ }
+
+ return rc;
+}
+
+static int
+app_setup_trace(struct spdk_app_opts *opts)
+{
+ char shm_name[64];
+ uint64_t tpoint_group_mask;
+ char *end;
+
+ if (opts->shm_id >= 0) {
+ snprintf(shm_name, sizeof(shm_name), "/%s_trace.%d", opts->name, opts->shm_id);
+ } else {
+ snprintf(shm_name, sizeof(shm_name), "/%s_trace.pid%d", opts->name, (int)getpid());
+ }
+
+ if (spdk_trace_init(shm_name, opts->num_entries) != 0) {
+ return -1;
+ }
+
+ if (opts->tpoint_group_mask != NULL) {
+ errno = 0;
+ tpoint_group_mask = strtoull(opts->tpoint_group_mask, &end, 16);
+ if (*end != '\0' || errno) {
+ SPDK_ERRLOG("invalid tpoint mask %s\n", opts->tpoint_group_mask);
+ } else {
+ SPDK_NOTICELOG("Tracepoint Group Mask %s specified.\n", opts->tpoint_group_mask);
+ SPDK_NOTICELOG("Use 'spdk_trace -s %s %s %d' to capture a snapshot of events at runtime.\n",
+ opts->name,
+ opts->shm_id >= 0 ? "-i" : "-p",
+ opts->shm_id >= 0 ? opts->shm_id : getpid());
+#if defined(__linux__)
+ SPDK_NOTICELOG("Or copy /dev/shm%s for offline analysis/debug.\n", shm_name);
+#endif
+ spdk_trace_set_tpoint_group_mask(tpoint_group_mask);
+ }
+ }
+
+ return 0;
+}
+
+static void
+bootstrap_fn(void *arg1)
+{
+ if (g_spdk_app.json_config_file) {
+ g_delay_subsystem_init = false;
+ spdk_app_json_config_load(g_spdk_app.json_config_file, g_spdk_app.rpc_addr, app_start_rpc,
+ NULL, !g_spdk_app.json_config_ignore_errors);
+ } else {
+ if (!g_delay_subsystem_init) {
+ spdk_subsystem_init(app_start_rpc, NULL);
+ } else {
+ spdk_rpc_initialize(g_spdk_app.rpc_addr);
+ }
+ }
+}
+
+int
+spdk_app_start(struct spdk_app_opts *opts, spdk_msg_fn start_fn,
+ void *arg1)
+{
+ struct spdk_conf *config = NULL;
+ int rc;
+ char *tty;
+ struct spdk_cpuset tmp_cpumask = {};
+ static bool g_env_was_setup = false;
+
+ if (!opts) {
+ SPDK_ERRLOG("opts should not be NULL\n");
+ return 1;
+ }
+
+ if (!start_fn) {
+ SPDK_ERRLOG("start_fn should not be NULL\n");
+ return 1;
+ }
+
+ tty = ttyname(STDERR_FILENO);
+ if (opts->print_level > SPDK_LOG_WARN &&
+ isatty(STDERR_FILENO) &&
+ tty &&
+ !strncmp(tty, "/dev/tty", strlen("/dev/tty"))) {
+ printf("Warning: printing stderr to console terminal without -q option specified.\n");
+ printf("Suggest using --silence-noticelog to disable logging to stderr and\n");
+ printf("monitor syslog, or redirect stderr to a file.\n");
+ printf("(Delaying for 10 seconds...)\n");
+ sleep(10);
+ }
+
+ spdk_log_set_print_level(opts->print_level);
+
+#ifndef SPDK_NO_RLIMIT
+ if (opts->enable_coredump) {
+ struct rlimit core_limits;
+
+ core_limits.rlim_cur = core_limits.rlim_max = SPDK_APP_DEFAULT_CORE_LIMIT;
+ setrlimit(RLIMIT_CORE, &core_limits);
+ }
+#endif
+
+ config = app_setup_conf(opts->config_file);
+ if (config == NULL) {
+ return 1;
+ }
+
+ if (app_read_config_file_global_params(opts) < 0) {
+ spdk_conf_free(config);
+ return 1;
+ }
+
+ memset(&g_spdk_app, 0, sizeof(g_spdk_app));
+ g_spdk_app.config = config;
+ g_spdk_app.json_config_file = opts->json_config_file;
+ g_spdk_app.json_config_ignore_errors = opts->json_config_ignore_errors;
+ g_spdk_app.rpc_addr = opts->rpc_addr;
+ g_spdk_app.shm_id = opts->shm_id;
+ g_spdk_app.shutdown_cb = opts->shutdown_cb;
+ g_spdk_app.rc = 0;
+
+ spdk_log_set_level(SPDK_APP_DEFAULT_LOG_LEVEL);
+
+ /* Pass NULL to app_setup_env if SPDK app has been set up, in order to
+ * indicate that this is a reinitialization.
+ */
+ if (app_setup_env(g_env_was_setup ? NULL : opts) < 0) {
+ return 1;
+ }
+
+ spdk_log_open(opts->log);
+ SPDK_NOTICELOG("Total cores available: %d\n", spdk_env_get_core_count());
+
+ /*
+ * If mask not specified on command line or in configuration file,
+ * reactor_mask will be 0x1 which will enable core 0 to run one
+ * reactor.
+ */
+ if ((rc = spdk_reactors_init()) != 0) {
+		SPDK_ERRLOG("Reactor initialization failed: rc = %d\n", rc);
+ return 1;
+ }
+
+ spdk_cpuset_set_cpu(&tmp_cpumask, spdk_env_get_current_core(), true);
+
+ /* Now that the reactors have been initialized, we can create an
+ * initialization thread. */
+ g_app_thread = spdk_thread_create("app_thread", &tmp_cpumask);
+ if (!g_app_thread) {
+ SPDK_ERRLOG("Unable to create an spdk_thread for initialization\n");
+ return 1;
+ }
+
+	/*
+	 * Note that the call to app_setup_trace() is placed here, ahead of
+	 * app_setup_signal_handlers(), because there is no easy/direct way
+	 * of unwinding the resources that app_setup_signal_handlers() can
+	 * allocate.
+	 */
+ if (app_setup_trace(opts) != 0) {
+ return 1;
+ }
+
+ if ((rc = app_setup_signal_handlers(opts)) != 0) {
+ return 1;
+ }
+
+ g_delay_subsystem_init = opts->delay_subsystem_init;
+ g_start_fn = start_fn;
+ g_start_arg = arg1;
+
+ spdk_thread_send_msg(g_app_thread, bootstrap_fn, NULL);
+
+ /* This blocks until spdk_app_stop is called */
+ spdk_reactors_start();
+
+ g_env_was_setup = true;
+
+ return g_spdk_app.rc;
+}
+
+void
+spdk_app_fini(void)
+{
+ spdk_trace_cleanup();
+ spdk_reactors_fini();
+ spdk_env_fini();
+ spdk_conf_free(g_spdk_app.config);
+ spdk_log_close();
+}
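For orientation (not part of the patch), the public entry points implemented above are typically driven from an application's main() roughly as in the sketch below. The application name and the start callback are hypothetical, and error handling is abbreviated.

#include "spdk/stdinc.h"
#include "spdk/event.h"

/* Runs on the app thread once the framework has finished initializing. */
static void
app_start_cb(void *arg)
{
	printf("framework is up\n");
	spdk_app_stop(0);	/* nothing else to do in this sketch */
}

int
main(int argc, char **argv)
{
	struct spdk_app_opts opts = {};
	int rc;

	spdk_app_opts_init(&opts);
	opts.name = "my_app";	/* hypothetical application name */

	/* No app-specific options in this sketch, hence the empty getopt string. */
	if (spdk_app_parse_args(argc, argv, &opts, "", NULL, NULL, NULL) !=
	    SPDK_APP_PARSE_ARGS_SUCCESS) {
		return 1;
	}

	/* Blocks until spdk_app_stop() is called from app_start_cb(). */
	rc = spdk_app_start(&opts, app_start_cb, NULL);

	spdk_app_fini();
	return rc;
}

spdk_app_start() only returns after spdk_app_stop() has been called, so the final return value is whatever rc the application handed to spdk_app_stop().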
+
+static void
+app_stop(void *arg1)
+{
+ spdk_rpc_finish();
+ spdk_subsystem_fini(spdk_reactors_stop, NULL);
+}
+
+void
+spdk_app_stop(int rc)
+{
+ if (rc) {
+ SPDK_WARNLOG("spdk_app_stop'd on non-zero\n");
+ }
+ g_spdk_app.rc = rc;
+ /*
+ * We want to run spdk_subsystem_fini() from the same thread where spdk_subsystem_init()
+ * was called.
+ */
+ spdk_thread_send_msg(g_app_thread, app_stop, NULL);
+}
+
+static void
+usage(void (*app_usage)(void))
+{
+ printf("%s [options]\n", g_executable_name);
+ printf("options:\n");
+ printf(" -c, --config <config> config file (default %s)\n",
+ g_default_opts.config_file != NULL ? g_default_opts.config_file : "none");
+ printf(" --json <config> JSON config file (default %s)\n",
+ g_default_opts.json_config_file != NULL ? g_default_opts.json_config_file : "none");
+ printf(" --json-ignore-init-errors\n");
+ printf(" don't exit on invalid config entry\n");
+ printf(" -d, --limit-coredump do not set max coredump size to RLIM_INFINITY\n");
+ printf(" -g, --single-file-segments\n");
+ printf(" force creating just one hugetlbfs file\n");
+ printf(" -h, --help show this usage\n");
+ printf(" -i, --shm-id <id> shared memory ID (optional)\n");
+ printf(" -m, --cpumask <mask> core mask for DPDK\n");
+ printf(" -n, --mem-channels <num> channel number of memory channels used for DPDK\n");
+ printf(" -p, --master-core <id> master (primary) core for DPDK\n");
+ printf(" -r, --rpc-socket <path> RPC listen address (default %s)\n", SPDK_DEFAULT_RPC_ADDR);
+ printf(" -s, --mem-size <size> memory size in MB for DPDK (default: ");
+#ifndef __linux__
+ if (g_default_opts.mem_size <= 0) {
+ printf("all hugepage memory)\n");
+ } else
+#endif
+ {
+ printf("%dMB)\n", g_default_opts.mem_size >= 0 ? g_default_opts.mem_size : 0);
+ }
+ printf(" --silence-noticelog disable notice level logging to stderr\n");
+ printf(" -u, --no-pci disable PCI access\n");
+ printf(" --wait-for-rpc wait for RPCs to initialize subsystems\n");
+ printf(" --max-delay <num> maximum reactor delay (in microseconds)\n");
+ printf(" -B, --pci-blacklist <bdf>\n");
+ printf(" pci addr to blacklist (can be used more than once)\n");
+ printf(" -R, --huge-unlink unlink huge files after initialization\n");
+ printf(" -v, --version print SPDK version\n");
+ printf(" -W, --pci-whitelist <bdf>\n");
+ printf(" pci addr to whitelist (-B and -W cannot be used at the same time)\n");
+ printf(" --huge-dir <path> use a specific hugetlbfs mount to reserve memory from\n");
+ printf(" --iova-mode <pa/va> set IOVA mode ('pa' for IOVA_PA and 'va' for IOVA_VA)\n");
+ printf(" --base-virtaddr <addr> the base virtual address for DPDK (default: 0x200000000000)\n");
+ printf(" --num-trace-entries <num> number of trace entries for each core, must be power of 2. (default %d)\n",
+ SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES);
+ spdk_log_usage(stdout, "-L");
+ spdk_trace_mask_usage(stdout, "-e");
+ if (app_usage) {
+ app_usage();
+ }
+}
+
+spdk_app_parse_args_rvals_t
+spdk_app_parse_args(int argc, char **argv, struct spdk_app_opts *opts,
+ const char *app_getopt_str, struct option *app_long_opts,
+ int (*app_parse)(int ch, char *arg),
+ void (*app_usage)(void))
+{
+ int ch, rc, opt_idx, global_long_opts_len, app_long_opts_len;
+ struct option *cmdline_options;
+ char *cmdline_short_opts = NULL;
+ enum spdk_app_parse_args_rvals retval = SPDK_APP_PARSE_ARGS_FAIL;
+ long int tmp;
+
+ memcpy(&g_default_opts, opts, sizeof(g_default_opts));
+
+ if (opts->config_file && access(opts->config_file, R_OK) != 0) {
+ SPDK_WARNLOG("Can't read legacy configuration file '%s'\n", opts->config_file);
+ opts->config_file = NULL;
+ }
+
+ if (opts->json_config_file && access(opts->json_config_file, R_OK) != 0) {
+ SPDK_WARNLOG("Can't read JSON configuration file '%s'\n", opts->json_config_file);
+ opts->json_config_file = NULL;
+ }
+
+ if (app_long_opts == NULL) {
+ app_long_opts_len = 0;
+ } else {
+ for (app_long_opts_len = 0;
+ app_long_opts[app_long_opts_len].name != NULL;
+ app_long_opts_len++);
+ }
+
+ global_long_opts_len = SPDK_COUNTOF(g_cmdline_options);
+
+ cmdline_options = calloc(global_long_opts_len + app_long_opts_len + 1, sizeof(*cmdline_options));
+ if (!cmdline_options) {
+ SPDK_ERRLOG("Out of memory\n");
+ return SPDK_APP_PARSE_ARGS_FAIL;
+ }
+
+ memcpy(&cmdline_options[0], g_cmdline_options, sizeof(g_cmdline_options));
+ if (app_long_opts) {
+ memcpy(&cmdline_options[global_long_opts_len], app_long_opts,
+ app_long_opts_len * sizeof(*app_long_opts));
+ }
+
+ if (app_getopt_str != NULL) {
+ ch = app_opts_validate(app_getopt_str);
+ if (ch) {
+ SPDK_ERRLOG("Duplicated option '%c' between the generic and application specific spdk opts.\n",
+ ch);
+ goto out;
+ }
+ }
+
+ cmdline_short_opts = spdk_sprintf_alloc("%s%s", app_getopt_str, SPDK_APP_GETOPT_STRING);
+ if (!cmdline_short_opts) {
+ SPDK_ERRLOG("Out of memory\n");
+ goto out;
+ }
+
+ g_executable_name = argv[0];
+
+ while ((ch = getopt_long(argc, argv, cmdline_short_opts, cmdline_options, &opt_idx)) != -1) {
+ switch (ch) {
+ case CONFIG_FILE_OPT_IDX:
+ opts->config_file = optarg;
+ break;
+ case JSON_CONFIG_OPT_IDX:
+ opts->json_config_file = optarg;
+ break;
+ case JSON_CONFIG_IGNORE_INIT_ERRORS_IDX:
+ opts->json_config_ignore_errors = true;
+ break;
+ case LIMIT_COREDUMP_OPT_IDX:
+ opts->enable_coredump = false;
+ break;
+ case TPOINT_GROUP_MASK_OPT_IDX:
+ opts->tpoint_group_mask = optarg;
+ break;
+ case SINGLE_FILE_SEGMENTS_OPT_IDX:
+ opts->hugepage_single_segments = true;
+ break;
+ case HELP_OPT_IDX:
+ usage(app_usage);
+ retval = SPDK_APP_PARSE_ARGS_HELP;
+ goto out;
+ case SHM_ID_OPT_IDX:
+ opts->shm_id = spdk_strtol(optarg, 0);
+ if (opts->shm_id < 0) {
+ SPDK_ERRLOG("Invalid shared memory ID %s\n", optarg);
+ goto out;
+ }
+ break;
+ case CPUMASK_OPT_IDX:
+ opts->reactor_mask = optarg;
+ break;
+ case MEM_CHANNELS_OPT_IDX:
+ opts->mem_channel = spdk_strtol(optarg, 0);
+ if (opts->mem_channel < 0) {
+ SPDK_ERRLOG("Invalid memory channel %s\n", optarg);
+ goto out;
+ }
+ break;
+ case MASTER_CORE_OPT_IDX:
+ opts->master_core = spdk_strtol(optarg, 0);
+ if (opts->master_core < 0) {
+ SPDK_ERRLOG("Invalid master core %s\n", optarg);
+ goto out;
+ }
+ break;
+ case SILENCE_NOTICELOG_OPT_IDX:
+ opts->print_level = SPDK_LOG_WARN;
+ break;
+ case RPC_SOCKET_OPT_IDX:
+ opts->rpc_addr = optarg;
+ break;
+ case MEM_SIZE_OPT_IDX: {
+ uint64_t mem_size_mb;
+ bool mem_size_has_prefix;
+
+ rc = spdk_parse_capacity(optarg, &mem_size_mb, &mem_size_has_prefix);
+ if (rc != 0) {
+ SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg);
+ usage(app_usage);
+ goto out;
+ }
+
+ if (mem_size_has_prefix) {
+ /* the mem size is in MB by default, so if a prefix was
+ * specified, we need to manually convert to MB.
+ */
+ mem_size_mb /= 1024 * 1024;
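+			/* E.g. "-s 2G" arrives here as 2147483648 bytes from
+			 * spdk_parse_capacity() and becomes 2048 MB after the division above. */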
+ }
+
+ if (mem_size_mb > INT_MAX) {
+ SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg);
+ usage(app_usage);
+ goto out;
+ }
+
+ opts->mem_size = (int) mem_size_mb;
+ break;
+ }
+ case NO_PCI_OPT_IDX:
+ opts->no_pci = true;
+ break;
+ case WAIT_FOR_RPC_OPT_IDX:
+ opts->delay_subsystem_init = true;
+ break;
+ case PCI_BLACKLIST_OPT_IDX:
+ if (opts->pci_whitelist) {
+ free(opts->pci_whitelist);
+ opts->pci_whitelist = NULL;
+ SPDK_ERRLOG("-B and -W cannot be used at the same time\n");
+ usage(app_usage);
+ goto out;
+ }
+
+ rc = app_opts_add_pci_addr(opts, &opts->pci_blacklist, optarg);
+ if (rc != 0) {
+ free(opts->pci_blacklist);
+ opts->pci_blacklist = NULL;
+ goto out;
+ }
+ break;
+ case LOGFLAG_OPT_IDX:
+#ifndef DEBUG
+ SPDK_ERRLOG("%s must be configured with --enable-debug for -L flag\n",
+ argv[0]);
+ usage(app_usage);
+ goto out;
+#else
+ rc = spdk_log_set_flag(optarg);
+ if (rc < 0) {
+ SPDK_ERRLOG("unknown flag\n");
+ usage(app_usage);
+ goto out;
+ }
+ opts->print_level = SPDK_LOG_DEBUG;
+ break;
+#endif
+ case HUGE_UNLINK_OPT_IDX:
+ opts->unlink_hugepage = true;
+ break;
+ case PCI_WHITELIST_OPT_IDX:
+ if (opts->pci_blacklist) {
+ free(opts->pci_blacklist);
+ opts->pci_blacklist = NULL;
+ SPDK_ERRLOG("-B and -W cannot be used at the same time\n");
+ usage(app_usage);
+ goto out;
+ }
+
+ rc = app_opts_add_pci_addr(opts, &opts->pci_whitelist, optarg);
+ if (rc != 0) {
+ free(opts->pci_whitelist);
+ opts->pci_whitelist = NULL;
+ goto out;
+ }
+ break;
+ case BASE_VIRTADDR_OPT_IDX:
+ tmp = spdk_strtoll(optarg, 0);
+ if (tmp <= 0) {
+ SPDK_ERRLOG("Invalid base-virtaddr %s\n", optarg);
+ usage(app_usage);
+ goto out;
+ }
+ opts->base_virtaddr = (uint64_t)tmp;
+ break;
+ case HUGE_DIR_OPT_IDX:
+ opts->hugedir = optarg;
+ break;
+ case IOVA_MODE_OPT_IDX:
+ opts->iova_mode = optarg;
+ break;
+ case NUM_TRACE_ENTRIES_OPT_IDX:
+ tmp = spdk_strtoll(optarg, 0);
+ if (tmp <= 0) {
+ SPDK_ERRLOG("Invalid num-trace-entries %s\n", optarg);
+ usage(app_usage);
+ goto out;
+ }
+ opts->num_entries = (uint64_t)tmp;
+ if (opts->num_entries & (opts->num_entries - 1)) {
+ SPDK_ERRLOG("num-trace-entries must be power of 2\n");
+ usage(app_usage);
+ goto out;
+ }
+ break;
+ case MAX_REACTOR_DELAY_OPT_IDX:
+ SPDK_ERRLOG("Deprecation warning: The maximum allowed latency parameter is no longer supported.\n");
+ break;
+ case VERSION_OPT_IDX:
+ printf(SPDK_VERSION_STRING"\n");
+ retval = SPDK_APP_PARSE_ARGS_HELP;
+ goto out;
+ case '?':
+ /*
+ * In the event getopt() above detects an option
+ * in argv that is NOT in the getopt_str,
+ * getopt() will return a '?' indicating failure.
+ */
+ usage(app_usage);
+ goto out;
+ default:
+ rc = app_parse(ch, optarg);
+ if (rc) {
+ SPDK_ERRLOG("Parsing application specific arguments failed: %d\n", rc);
+ goto out;
+ }
+ }
+ }
+
+ if (opts->config_file && opts->json_config_file) {
+ SPDK_ERRLOG("ERROR: Legacy config and JSON config can't be used together.\n");
+ goto out;
+ }
+
+ if (opts->json_config_file && opts->delay_subsystem_init) {
+ SPDK_ERRLOG("ERROR: JSON configuration file can't be used together with --wait-for-rpc.\n");
+ goto out;
+ }
+
+ /* TBD: Replace warning by failure when RPCs for startup are prepared. */
+ if (opts->config_file && opts->delay_subsystem_init) {
+ fprintf(stderr,
+ "WARNING: --wait-for-rpc and config file are used at the same time. "
+			"Please be careful - one option might overwrite the other.\n");
+ }
+
+ retval = SPDK_APP_PARSE_ARGS_SUCCESS;
+out:
+ if (retval != SPDK_APP_PARSE_ARGS_SUCCESS) {
+ free(opts->pci_blacklist);
+ opts->pci_blacklist = NULL;
+ free(opts->pci_whitelist);
+ opts->pci_whitelist = NULL;
+ }
+ free(cmdline_short_opts);
+ free(cmdline_options);
+ return retval;
+}
+
+void
+spdk_app_usage(void)
+{
+ if (g_executable_name == NULL) {
+ SPDK_ERRLOG("%s not valid before calling spdk_app_parse_args()\n", __func__);
+ return;
+ }
+
+ usage(NULL);
+}
+
+static void
+rpc_framework_start_init_cpl(int rc, void *arg1)
+{
+ struct spdk_jsonrpc_request *request = arg1;
+ struct spdk_json_write_ctx *w;
+
+ assert(spdk_get_thread() == g_app_thread);
+
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "framework_initialization failed");
+ return;
+ }
+
+ spdk_rpc_set_state(SPDK_RPC_RUNTIME);
+ app_start_application();
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_framework_start_init(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "framework_start_init requires no parameters");
+ return;
+ }
+
+ spdk_subsystem_init(rpc_framework_start_init_cpl, request);
+}
+SPDK_RPC_REGISTER("framework_start_init", rpc_framework_start_init, SPDK_RPC_STARTUP)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_start_init, start_subsystem_init)
+
+struct subsystem_init_poller_ctx {
+ struct spdk_poller *init_poller;
+ struct spdk_jsonrpc_request *request;
+};
+
+static int
+rpc_subsystem_init_poller_ctx(void *ctx)
+{
+ struct spdk_json_write_ctx *w;
+ struct subsystem_init_poller_ctx *poller_ctx = ctx;
+
+ if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) {
+ w = spdk_jsonrpc_begin_result(poller_ctx->request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(poller_ctx->request, w);
+ spdk_poller_unregister(&poller_ctx->init_poller);
+ free(poller_ctx);
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+rpc_framework_wait_init(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ struct subsystem_init_poller_ctx *ctx;
+
+ if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) {
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ } else {
+ ctx = malloc(sizeof(struct subsystem_init_poller_ctx));
+ if (ctx == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to allocate memory for the request context\n");
+ return;
+ }
+ ctx->request = request;
+ ctx->init_poller = SPDK_POLLER_REGISTER(rpc_subsystem_init_poller_ctx, ctx, 0);
+ }
+}
+SPDK_RPC_REGISTER("framework_wait_init", rpc_framework_wait_init,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(framework_wait_init, wait_subsystem_init)
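In practice, when the target is started with --wait-for-rpc, an external client (for example SPDK's scripts/rpc.py) typically calls framework_start_init to kick off subsystem initialization and then framework_wait_init to block until the RPC state reaches RUNTIME; the deprecated aliases registered above keep the older start_subsystem_init/wait_subsystem_init names working.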
diff --git a/src/spdk/lib/event/json_config.c b/src/spdk/lib/event/json_config.c
new file mode 100644
index 000000000..69a95097a
--- /dev/null
+++ b/src/spdk/lib/event/json_config.c
@@ -0,0 +1,630 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/util.h"
+#include "spdk/file.h"
+#include "spdk/log.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/jsonrpc.h"
+#include "spdk/rpc.h"
+
+#include "spdk_internal/event.h"
+#include "spdk_internal/log.h"
+
+#define SPDK_DEBUG_APP_CFG(...) SPDK_DEBUGLOG(SPDK_LOG_APP_CONFIG, __VA_ARGS__)
+
+/* JSON configuration format is as follows
+ *
+ * {
+ * "subsystems" : [ <<== *subsystems JSON array
+ * { <<== *subsystems_it array entry pointer (iterator)
+ * "subsystem": "<< SUBSYSTEM NAME >>",
+ * "config": [ <<== *config JSON array
+ * { <<== *config_it array entry pointer (iterator)
+ * "method": "<< METHOD NAME >>", <<== *method
+ * "params": { << PARAMS >> } <<== *params
+ * },
+ *        << MORE "config" ARRAY ENTRIES >>
+ * ]
+ * },
+ * << MORE "subsystems" ARRAY ENTRIES >>
+ * ]
+ *
+ * << ANYTHING ELSE IS IGNORED IN ROOT OBJECT >>
+ * }
+ *
+ */
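A concrete file matching this layout could look like the sketch below (illustrative only; the bdev_malloc_create method and its parameters are just an example of an RPC such a file may carry, not something the loader requires):

{
  "subsystems": [
    {
      "subsystem": "bdev",
      "config": [
        {
          "method": "bdev_malloc_create",
          "params": { "name": "Malloc0", "num_blocks": 8192, "block_size": 512 }
        }
      ]
    }
  ]
}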
+
+struct load_json_config_ctx;
+typedef void (*client_resp_handler)(struct load_json_config_ctx *,
+ struct spdk_jsonrpc_client_response *);
+
+#define RPC_SOCKET_PATH_MAX sizeof(((struct sockaddr_un *)0)->sun_path)
+
+/* 1 s connection timeout */
+#define RPC_CLIENT_CONNECT_TIMEOUT_US (1U * 1000U * 1000U)
+
+/*
+ * Currently there is no timeout in SPDK for any RPC command, so we cannot put
+ * a hard limit on configuration load time - it would most likely fail at random.
+ * Instead, just print a WARNLOG every 10 s. */
+#define RPC_CLIENT_REQUEST_TIMEOUT_US (10U * 1000 * 1000)
+
+struct load_json_config_ctx {
+ /* Thread used during configuration. */
+ struct spdk_thread *thread;
+ spdk_subsystem_init_fn cb_fn;
+ void *cb_arg;
+ bool stop_on_error;
+
+ /* Current subsystem */
+ struct spdk_json_val *subsystems; /* "subsystems" array */
+ struct spdk_json_val *subsystems_it; /* current subsystem array position in "subsystems" array */
+
+ struct spdk_json_val *subsystem_name; /* current subsystem name */
+
+ /* Current "config" entry we are processing */
+ struct spdk_json_val *config; /* "config" array */
+ struct spdk_json_val *config_it; /* current config position in "config" array */
+
+ /* Current request id we are sending. */
+ uint32_t rpc_request_id;
+
+ /* Whole configuration file read and parsed. */
+ size_t json_data_size;
+ char *json_data;
+
+ size_t values_cnt;
+ struct spdk_json_val *values;
+
+ char rpc_socket_path_temp[RPC_SOCKET_PATH_MAX + 1];
+
+ struct spdk_jsonrpc_client *client_conn;
+ struct spdk_poller *client_conn_poller;
+
+ client_resp_handler client_resp_cb;
+
+ /* Timeout for current RPC client action. */
+ uint64_t timeout;
+};
+
+static void app_json_config_load_subsystem(void *_ctx);
+
+static void
+app_json_config_load_done(struct load_json_config_ctx *ctx, int rc)
+{
+ spdk_poller_unregister(&ctx->client_conn_poller);
+ if (ctx->client_conn != NULL) {
+ spdk_jsonrpc_client_close(ctx->client_conn);
+ }
+
+ spdk_rpc_finish();
+
+ SPDK_DEBUG_APP_CFG("Config load finished with rc %d\n", rc);
+ ctx->cb_fn(rc, ctx->cb_arg);
+
+ free(ctx->json_data);
+ free(ctx->values);
+ free(ctx);
+}
+
+static void
+rpc_client_set_timeout(struct load_json_config_ctx *ctx, uint64_t timeout_us)
+{
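+	/* Deadline in ticks: e.g. a 10 s timeout with a 2 GHz tick source puts the
+	 * deadline 20,000,000,000 ticks past the current spdk_get_ticks() value. */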
+ ctx->timeout = spdk_get_ticks() + timeout_us * spdk_get_ticks_hz() / (1000 * 1000);
+}
+
+static int
+rpc_client_check_timeout(struct load_json_config_ctx *ctx)
+{
+ if (ctx->timeout < spdk_get_ticks()) {
+ SPDK_WARNLOG("RPC client command timeout.\n");
+ return -ETIMEDOUT;
+ }
+
+ return 0;
+}
+
+struct json_write_buf {
+ char data[1024];
+ unsigned cur_off;
+};
+
+static int
+json_write_stdout(void *cb_ctx, const void *data, size_t size)
+{
+ struct json_write_buf *buf = cb_ctx;
+ size_t rc;
+
+ rc = snprintf(buf->data + buf->cur_off, sizeof(buf->data) - buf->cur_off,
+ "%s", (const char *)data);
+ if (rc > 0) {
+ buf->cur_off += rc;
+ }
+ return rc == size ? 0 : -1;
+}
+
+static int
+rpc_client_poller(void *arg)
+{
+ struct load_json_config_ctx *ctx = arg;
+ struct spdk_jsonrpc_client_response *resp;
+ client_resp_handler cb;
+ int rc;
+
+ assert(spdk_get_thread() == ctx->thread);
+
+ rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0);
+ if (rc == 0) {
+ rc = rpc_client_check_timeout(ctx);
+ if (rc == -ETIMEDOUT) {
+ rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US);
+ rc = 0;
+ }
+ }
+
+ if (rc == 0) {
+ /* No response yet */
+ return SPDK_POLLER_BUSY;
+ } else if (rc < 0) {
+ app_json_config_load_done(ctx, rc);
+ return SPDK_POLLER_BUSY;
+ }
+
+ resp = spdk_jsonrpc_client_get_response(ctx->client_conn);
+ assert(resp);
+
+ if (resp->error) {
+ struct json_write_buf buf = {};
+ struct spdk_json_write_ctx *w = spdk_json_write_begin(json_write_stdout,
+ &buf, SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE);
+
+ if (w == NULL) {
+ SPDK_ERRLOG("error response: (?)\n");
+ } else {
+ spdk_json_write_val(w, resp->error);
+ spdk_json_write_end(w);
+ SPDK_ERRLOG("error response: \n%s\n", buf.data);
+ }
+ }
+
+ if (resp->error && ctx->stop_on_error) {
+ spdk_jsonrpc_client_free_response(resp);
+ app_json_config_load_done(ctx, -EINVAL);
+ } else {
+		/* We have a response, so we must have a callback for it. */
+ cb = ctx->client_resp_cb;
+ assert(cb != NULL);
+
+ /* Mark we are done with this handler. */
+ ctx->client_resp_cb = NULL;
+ cb(ctx, resp);
+ }
+
+
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+rpc_client_connect_poller(void *_ctx)
+{
+ struct load_json_config_ctx *ctx = _ctx;
+ int rc;
+
+ rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0);
+ if (rc != -ENOTCONN) {
+ /* We are connected. Start regular poller and issue first request */
+ spdk_poller_unregister(&ctx->client_conn_poller);
+ ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_poller, ctx, 100);
+ app_json_config_load_subsystem(ctx);
+ } else {
+ rc = rpc_client_check_timeout(ctx);
+ if (rc) {
+ app_json_config_load_done(ctx, rc);
+ }
+
+ return SPDK_POLLER_IDLE;
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+client_send_request(struct load_json_config_ctx *ctx, struct spdk_jsonrpc_client_request *request,
+ client_resp_handler client_resp_cb)
+{
+ int rc;
+
+ assert(spdk_get_thread() == ctx->thread);
+
+ ctx->client_resp_cb = client_resp_cb;
+ rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US);
+ rc = spdk_jsonrpc_client_send_request(ctx->client_conn, request);
+
+ if (rc) {
+ SPDK_DEBUG_APP_CFG("Sending request to client failed (%d)\n", rc);
+ }
+
+ return rc;
+}
+
+static int
+cap_string(const struct spdk_json_val *val, void *out)
+{
+ const struct spdk_json_val **vptr = out;
+
+ if (val->type != SPDK_JSON_VAL_STRING) {
+ return -EINVAL;
+ }
+
+ *vptr = val;
+ return 0;
+}
+
+static int
+cap_object(const struct spdk_json_val *val, void *out)
+{
+ const struct spdk_json_val **vptr = out;
+
+ if (val->type != SPDK_JSON_VAL_OBJECT_BEGIN) {
+ return -EINVAL;
+ }
+
+ *vptr = val;
+ return 0;
+}
+
+
+static int
+cap_array_or_null(const struct spdk_json_val *val, void *out)
+{
+ const struct spdk_json_val **vptr = out;
+
+ if (val->type != SPDK_JSON_VAL_ARRAY_BEGIN && val->type != SPDK_JSON_VAL_NULL) {
+ return -EINVAL;
+ }
+
+ *vptr = val;
+ return 0;
+}
+
+struct config_entry {
+ char *method;
+ struct spdk_json_val *params;
+};
+
+static struct spdk_json_object_decoder jsonrpc_cmd_decoders[] = {
+ {"method", offsetof(struct config_entry, method), spdk_json_decode_string},
+ {"params", offsetof(struct config_entry, params), cap_object, true}
+};
+
+static void app_json_config_load_subsystem_config_entry(void *_ctx);
+
+static void
+app_json_config_load_subsystem_config_entry_next(struct load_json_config_ctx *ctx,
+ struct spdk_jsonrpc_client_response *resp)
+{
+ /* Don't care about the response */
+ spdk_jsonrpc_client_free_response(resp);
+
+ ctx->config_it = spdk_json_next(ctx->config_it);
+ app_json_config_load_subsystem_config_entry(ctx);
+}
+
+/* Load "config" entry */
+static void
+app_json_config_load_subsystem_config_entry(void *_ctx)
+{
+ struct load_json_config_ctx *ctx = _ctx;
+ struct spdk_jsonrpc_client_request *rpc_request;
+ struct spdk_json_write_ctx *w;
+ struct config_entry cfg = {};
+ struct spdk_json_val *params_end;
+ size_t params_len;
+ int rc;
+
+ if (ctx->config_it == NULL) {
+ SPDK_DEBUG_APP_CFG("Subsystem '%.*s': configuration done.\n", ctx->subsystem_name->len,
+ (char *)ctx->subsystem_name->start);
+ ctx->subsystems_it = spdk_json_next(ctx->subsystems_it);
+		/* Invoke later to avoid recursion */
+ spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem, ctx);
+ return;
+ }
+
+ if (spdk_json_decode_object(ctx->config_it, jsonrpc_cmd_decoders,
+ SPDK_COUNTOF(jsonrpc_cmd_decoders), &cfg)) {
+ params_end = spdk_json_next(ctx->config_it);
+ assert(params_end != NULL);
+ params_len = params_end->start - ctx->config->start + 1;
+ SPDK_ERRLOG("Failed to decode config entry: %.*s!\n", (int)params_len, (char *)ctx->config_it);
+ app_json_config_load_done(ctx, -EINVAL);
+ goto out;
+ }
+
+ rc = spdk_rpc_is_method_allowed(cfg.method, spdk_rpc_get_state());
+ if (rc == -EPERM) {
+ SPDK_DEBUG_APP_CFG("Method '%s' not allowed -> skipping\n", cfg.method);
+		/* Invoke later to avoid recursion */
+ ctx->config_it = spdk_json_next(ctx->config_it);
+ spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem_config_entry, ctx);
+ goto out;
+ }
+
+ /* Get _END by skipping params and going back by one element. */
+ params_end = cfg.params + spdk_json_val_len(cfg.params) - 1;
+
+ /* Need to add one character to include '}' */
+ params_len = params_end->start - cfg.params->start + 1;
+
+ SPDK_DEBUG_APP_CFG("\tmethod: %s\n", cfg.method);
+ SPDK_DEBUG_APP_CFG("\tparams: %.*s\n", (int)params_len, (char *)cfg.params->start);
+
+ rpc_request = spdk_jsonrpc_client_create_request();
+ if (!rpc_request) {
+ app_json_config_load_done(ctx, -errno);
+ goto out;
+ }
+
+ w = spdk_jsonrpc_begin_request(rpc_request, ctx->rpc_request_id, NULL);
+ if (!w) {
+ spdk_jsonrpc_client_free_request(rpc_request);
+ app_json_config_load_done(ctx, -ENOMEM);
+ goto out;
+ }
+
+ spdk_json_write_named_string(w, "method", cfg.method);
+
+ /* No need to parse "params". Just dump the whole content of "params"
+ * directly into the request and let the remote side verify it. */
+ spdk_json_write_name(w, "params");
+ spdk_json_write_val_raw(w, cfg.params->start, params_len);
+ spdk_jsonrpc_end_request(rpc_request, w);
+
+ rc = client_send_request(ctx, rpc_request, app_json_config_load_subsystem_config_entry_next);
+ if (rc != 0) {
+ app_json_config_load_done(ctx, -rc);
+ goto out;
+ }
+out:
+ free(cfg.method);
+}
+
+static void
+subsystem_init_done(int rc, void *arg1)
+{
+ struct load_json_config_ctx *ctx = arg1;
+
+ if (rc) {
+ app_json_config_load_done(ctx, rc);
+ return;
+ }
+
+ spdk_rpc_set_state(SPDK_RPC_RUNTIME);
+ /* Another round. This time for RUNTIME methods */
+ SPDK_DEBUG_APP_CFG("'framework_start_init' done - continuing configuration\n");
+
+ assert(ctx != NULL);
+ if (ctx->subsystems) {
+ ctx->subsystems_it = spdk_json_array_first(ctx->subsystems);
+ }
+
+ app_json_config_load_subsystem(ctx);
+}
+
+static struct spdk_json_object_decoder subsystem_decoders[] = {
+ {"subsystem", offsetof(struct load_json_config_ctx, subsystem_name), cap_string},
+ {"config", offsetof(struct load_json_config_ctx, config), cap_array_or_null}
+};
+
+/*
+ * Start loading the subsystem pointed to by ctx->subsystems_it. It must point
+ * to the beginning of a "subsystem" object in the "subsystems" array, or be
+ * NULL. If it is NULL, there are no more subsystems to load.
+ *
+ * There are two iterations:
+ *
+ * In the first iteration only STARTUP RPC methods are used; other methods are
+ * ignored. When all subsystems have been walked, ctx->subsystems_it becomes
+ * NULL and "framework_start_init" is called to let SPDK move to the RUNTIME
+ * state (initialize all subsystems), and the second iteration begins.
+ *
+ * In the second iteration the "subsystems" array is walked through again, this
+ * time using only RUNTIME RPC methods. When ctx->subsystems_it becomes NULL a
+ * second time, it indicates that there are no more subsystems to load, and
+ * cb_fn is called to finish the configuration.
+ */
+static void
+app_json_config_load_subsystem(void *_ctx)
+{
+ struct load_json_config_ctx *ctx = _ctx;
+
+ if (ctx->subsystems_it == NULL) {
+ if (spdk_rpc_get_state() == SPDK_RPC_STARTUP) {
+ SPDK_DEBUG_APP_CFG("No more entries for current state, calling 'framework_start_init'\n");
+ spdk_subsystem_init(subsystem_init_done, ctx);
+ } else {
+ app_json_config_load_done(ctx, 0);
+ }
+
+ return;
+ }
+
+ /* Capture subsystem name and config array */
+ if (spdk_json_decode_object(ctx->subsystems_it, subsystem_decoders,
+ SPDK_COUNTOF(subsystem_decoders), ctx)) {
+ SPDK_ERRLOG("Failed to parse subsystem configuration\n");
+ app_json_config_load_done(ctx, -EINVAL);
+ return;
+ }
+
+ SPDK_DEBUG_APP_CFG("Loading subsystem '%.*s' configuration\n", ctx->subsystem_name->len,
+ (char *)ctx->subsystem_name->start);
+
+ /* Get 'config' array first configuration entry */
+ ctx->config_it = spdk_json_array_first(ctx->config);
+ app_json_config_load_subsystem_config_entry(ctx);
+}
+
+static void *
+read_file(const char *filename, size_t *size)
+{
+ FILE *file = fopen(filename, "r");
+ void *data;
+
+ if (file == NULL) {
+ /* errno is set by fopen */
+ return NULL;
+ }
+
+ data = spdk_posix_file_load(file, size);
+ fclose(file);
+ return data;
+}
+
+static int
+app_json_config_read(const char *config_file, struct load_json_config_ctx *ctx)
+{
+ struct spdk_json_val *values = NULL;
+ void *json = NULL, *end;
+ ssize_t values_cnt, rc;
+ size_t json_size;
+
+ json = read_file(config_file, &json_size);
+ if (!json) {
+ return -errno;
+ }
+
+ rc = spdk_json_parse(json, json_size, NULL, 0, &end,
+ SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS);
+ if (rc < 0) {
+ SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc);
+ goto err;
+ }
+
+ values_cnt = rc;
+ values = calloc(values_cnt, sizeof(struct spdk_json_val));
+ if (values == NULL) {
+ SPDK_ERRLOG("Out of memory\n");
+ goto err;
+ }
+
+ rc = spdk_json_parse(json, json_size, values, values_cnt, &end,
+ SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS);
+ if (rc != values_cnt) {
+ SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc);
+ goto err;
+ }
+
+ ctx->json_data = json;
+ ctx->json_data_size = json_size;
+
+ ctx->values = values;
+ ctx->values_cnt = values_cnt;
+
+ return 0;
+err:
+ free(json);
+ free(values);
+ return rc;
+}
+
+void
+spdk_app_json_config_load(const char *json_config_file, const char *rpc_addr,
+ spdk_subsystem_init_fn cb_fn, void *cb_arg,
+ bool stop_on_error)
+{
+ struct load_json_config_ctx *ctx = calloc(1, sizeof(*ctx));
+ int rc;
+
+ assert(cb_fn);
+ if (!ctx) {
+ cb_fn(-ENOMEM, cb_arg);
+ return;
+ }
+
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ ctx->stop_on_error = stop_on_error;
+ ctx->thread = spdk_get_thread();
+
+ rc = app_json_config_read(json_config_file, ctx);
+ if (rc) {
+ goto fail;
+ }
+
+ /* Capture subsystems array */
+ rc = spdk_json_find_array(ctx->values, "subsystems", NULL, &ctx->subsystems);
+ if (rc) {
+ SPDK_WARNLOG("No 'subsystems' key in JSON configuration file.\n");
+ } else {
+ /* Get first subsystem */
+ ctx->subsystems_it = spdk_json_array_first(ctx->subsystems);
+ if (ctx->subsystems_it == NULL) {
+ SPDK_NOTICELOG("'subsystems' configuration is empty\n");
+ }
+ }
+
+ /* If rpc_addr is not a Unix socket, use the default address as a prefix. */
+ if (rpc_addr == NULL || rpc_addr[0] != '/') {
+ rpc_addr = SPDK_DEFAULT_RPC_ADDR;
+ }
+
+ /* FIXME: rpc client should use socketpair() instead of this temporary socket nonsense */
+ rc = snprintf(ctx->rpc_socket_path_temp, sizeof(ctx->rpc_socket_path_temp), "%s.%d_config",
+ rpc_addr, getpid());
+ if (rc >= (int)sizeof(ctx->rpc_socket_path_temp)) {
+ SPDK_ERRLOG("Socket name create failed\n");
+ goto fail;
+ }
+
+ /* FIXME: spdk_rpc_initialize() function should return error code. */
+ spdk_rpc_initialize(ctx->rpc_socket_path_temp);
+ ctx->client_conn = spdk_jsonrpc_client_connect(ctx->rpc_socket_path_temp, AF_UNIX);
+ if (ctx->client_conn == NULL) {
+ SPDK_ERRLOG("Failed to connect to '%s'\n", ctx->rpc_socket_path_temp);
+ goto fail;
+ }
+
+ rpc_client_set_timeout(ctx, RPC_CLIENT_CONNECT_TIMEOUT_US);
+ ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_connect_poller, ctx, 100);
+ return;
+
+fail:
+ app_json_config_load_done(ctx, -EINVAL);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("app_config", SPDK_LOG_APP_CONFIG)
diff --git a/src/spdk/lib/event/reactor.c b/src/spdk/lib/event/reactor.c
new file mode 100644
index 000000000..cda4a32b1
--- /dev/null
+++ b/src/spdk/lib/event/reactor.c
@@ -0,0 +1,664 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/likely.h"
+
+#include "spdk_internal/event.h"
+#include "spdk_internal/log.h"
+#include "spdk_internal/thread.h"
+
+#include "spdk/log.h"
+#include "spdk/thread.h"
+#include "spdk/env.h"
+#include "spdk/util.h"
+
+#ifdef __linux__
+#include <sys/prctl.h>
+#endif
+
+#ifdef __FreeBSD__
+#include <pthread_np.h>
+#endif
+
+#define SPDK_EVENT_BATCH_SIZE 8
+
+static struct spdk_reactor *g_reactors;
+static struct spdk_cpuset g_reactor_core_mask;
+static enum spdk_reactor_state g_reactor_state = SPDK_REACTOR_STATE_UNINITIALIZED;
+
+static bool g_framework_context_switch_monitor_enabled = true;
+
+static struct spdk_mempool *g_spdk_event_mempool = NULL;
+
+static void
+reactor_construct(struct spdk_reactor *reactor, uint32_t lcore)
+{
+ reactor->lcore = lcore;
+ reactor->flags.is_valid = true;
+
+ TAILQ_INIT(&reactor->threads);
+ reactor->thread_count = 0;
+
+ reactor->events = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY);
+ assert(reactor->events != NULL);
+}
+
+struct spdk_reactor *
+spdk_reactor_get(uint32_t lcore)
+{
+ struct spdk_reactor *reactor;
+
+ if (g_reactors == NULL) {
+ SPDK_WARNLOG("Called spdk_reactor_get() while the g_reactors array was NULL!\n");
+ return NULL;
+ }
+
+ reactor = &g_reactors[lcore];
+
+ if (reactor->flags.is_valid == false) {
+ return NULL;
+ }
+
+ return reactor;
+}
+
+static int reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op);
+static bool reactor_thread_op_supported(enum spdk_thread_op op);
+
+int
+spdk_reactors_init(void)
+{
+ int rc;
+ uint32_t i, last_core;
+ char mempool_name[32];
+
+ snprintf(mempool_name, sizeof(mempool_name), "evtpool_%d", getpid());
+ g_spdk_event_mempool = spdk_mempool_create(mempool_name,
+ 262144 - 1, /* Power of 2 minus 1 is optimal for memory consumption */
+ sizeof(struct spdk_event),
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+
+ if (g_spdk_event_mempool == NULL) {
+ SPDK_ERRLOG("spdk_event_mempool creation failed\n");
+ return -1;
+ }
+
+ /* struct spdk_reactor must be aligned on a 64-byte boundary */
+ last_core = spdk_env_get_last_core();
+ rc = posix_memalign((void **)&g_reactors, 64,
+ (last_core + 1) * sizeof(struct spdk_reactor));
+ if (rc != 0) {
+ SPDK_ERRLOG("Could not allocate array size=%u for g_reactors\n",
+ last_core + 1);
+ spdk_mempool_free(g_spdk_event_mempool);
+ return -1;
+ }
+
+ memset(g_reactors, 0, (last_core + 1) * sizeof(struct spdk_reactor));
+
+ spdk_thread_lib_init_ext(reactor_thread_op, reactor_thread_op_supported,
+ sizeof(struct spdk_lw_thread));
+
+ SPDK_ENV_FOREACH_CORE(i) {
+ reactor_construct(&g_reactors[i], i);
+ }
+
+ g_reactor_state = SPDK_REACTOR_STATE_INITIALIZED;
+
+ return 0;
+}
+
+void
+spdk_reactors_fini(void)
+{
+ uint32_t i;
+ struct spdk_reactor *reactor;
+
+ if (g_reactor_state == SPDK_REACTOR_STATE_UNINITIALIZED) {
+ return;
+ }
+
+ spdk_thread_lib_fini();
+
+ SPDK_ENV_FOREACH_CORE(i) {
+ reactor = spdk_reactor_get(i);
+ assert(reactor != NULL);
+ assert(reactor->thread_count == 0);
+ if (reactor->events != NULL) {
+ spdk_ring_free(reactor->events);
+ }
+ }
+
+ spdk_mempool_free(g_spdk_event_mempool);
+
+ free(g_reactors);
+ g_reactors = NULL;
+}
+
+struct spdk_event *
+spdk_event_allocate(uint32_t lcore, spdk_event_fn fn, void *arg1, void *arg2)
+{
+ struct spdk_event *event = NULL;
+ struct spdk_reactor *reactor = spdk_reactor_get(lcore);
+
+ if (!reactor) {
+ assert(false);
+ return NULL;
+ }
+
+ event = spdk_mempool_get(g_spdk_event_mempool);
+ if (event == NULL) {
+ assert(false);
+ return NULL;
+ }
+
+ event->lcore = lcore;
+ event->fn = fn;
+ event->arg1 = arg1;
+ event->arg2 = arg2;
+
+ return event;
+}
+
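+/*
+ * Usage sketch (illustrative; 'lcore', 'my_fn' and 'my_arg' are placeholder names):
+ *
+ *   struct spdk_event *evt = spdk_event_allocate(lcore, my_fn, my_arg, NULL);
+ *   if (evt != NULL) {
+ *       spdk_event_call(evt);
+ *   }
+ *
+ * The event function is executed later on the reactor bound to the target lcore.
+ */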
+void
+spdk_event_call(struct spdk_event *event)
+{
+ int rc;
+ struct spdk_reactor *reactor;
+
+ reactor = spdk_reactor_get(event->lcore);
+
+ assert(reactor != NULL);
+ assert(reactor->events != NULL);
+
+ rc = spdk_ring_enqueue(reactor->events, (void **)&event, 1, NULL);
+ if (rc != 1) {
+ assert(false);
+ }
+}
+
+static inline uint32_t
+event_queue_run_batch(struct spdk_reactor *reactor)
+{
+ unsigned count, i;
+ void *events[SPDK_EVENT_BATCH_SIZE];
+ struct spdk_thread *thread;
+ struct spdk_lw_thread *lw_thread;
+
+#ifdef DEBUG
+ /*
+ * spdk_ring_dequeue() fills events and returns how many entries it wrote,
+ * so we will never actually read uninitialized data from events, but just to be sure
+ * (and to silence a static analyzer false positive), initialize the array to NULL pointers.
+ */
+ memset(events, 0, sizeof(events));
+#endif
+
+ count = spdk_ring_dequeue(reactor->events, events, SPDK_EVENT_BATCH_SIZE);
+ if (count == 0) {
+ return 0;
+ }
+
+ /* Execute the events. There are still some remaining events
+ * that must occur on an SPDK thread. To accommodate those, try to
+ * run them on the first thread in the list, if it exists. */
+ lw_thread = TAILQ_FIRST(&reactor->threads);
+ if (lw_thread) {
+ thread = spdk_thread_get_from_ctx(lw_thread);
+ } else {
+ thread = NULL;
+ }
+
+ spdk_set_thread(thread);
+
+ for (i = 0; i < count; i++) {
+ struct spdk_event *event = events[i];
+
+ assert(event != NULL);
+ event->fn(event->arg1, event->arg2);
+ }
+
+ spdk_set_thread(NULL);
+
+ spdk_mempool_put_bulk(g_spdk_event_mempool, events, count);
+
+ return count;
+}
+
+/* 1s */
+#define CONTEXT_SWITCH_MONITOR_PERIOD 1000000
+
+static int
+get_rusage(struct spdk_reactor *reactor)
+{
+ struct rusage rusage;
+
+ if (getrusage(RUSAGE_THREAD, &rusage) != 0) {
+ return -1;
+ }
+
+ if (rusage.ru_nvcsw != reactor->rusage.ru_nvcsw || rusage.ru_nivcsw != reactor->rusage.ru_nivcsw) {
+ SPDK_INFOLOG(SPDK_LOG_REACTOR,
+ "Reactor %d: %ld voluntary context switches and %ld involuntary context switches in the last second.\n",
+ reactor->lcore, rusage.ru_nvcsw - reactor->rusage.ru_nvcsw,
+ rusage.ru_nivcsw - reactor->rusage.ru_nivcsw);
+ }
+ reactor->rusage = rusage;
+
+ return -1;
+}
+
+void
+spdk_framework_enable_context_switch_monitor(bool enable)
+{
+ /* This global is being read by multiple threads, so this isn't
+ * strictly thread safe. However, we're toggling between true and
+ * false here, and if a thread sees the value update later than it
+ * should, it's no big deal. */
+ g_framework_context_switch_monitor_enabled = enable;
+}
+
+bool
+spdk_framework_context_switch_monitor_enabled(void)
+{
+ return g_framework_context_switch_monitor_enabled;
+}
+
+static void
+_set_thread_name(const char *thread_name)
+{
+#if defined(__linux__)
+ prctl(PR_SET_NAME, thread_name, 0, 0, 0);
+#elif defined(__FreeBSD__)
+ pthread_set_name_np(pthread_self(), thread_name);
+#else
+#error missing platform support for thread name
+#endif
+}
+
+static int _reactor_schedule_thread(struct spdk_thread *thread);
+static uint64_t g_rusage_period;
+
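+/*
+ * One pass of the reactor loop: drain a batch of events, poll each lightweight thread
+ * bound to this reactor while accumulating busy/idle TSC ticks, hand threads that
+ * requested a reschedule back to the scheduler, destroy exited threads, and refresh
+ * rusage statistics once per monitoring period.
+ */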
+static void
+_reactor_run(struct spdk_reactor *reactor)
+{
+ struct spdk_thread *thread;
+ struct spdk_lw_thread *lw_thread, *tmp;
+ uint64_t now;
+ int rc;
+
+ event_queue_run_batch(reactor);
+
+ TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) {
+ thread = spdk_thread_get_from_ctx(lw_thread);
+ rc = spdk_thread_poll(thread, 0, reactor->tsc_last);
+
+ now = spdk_thread_get_last_tsc(thread);
+ if (rc == 0) {
+ reactor->idle_tsc += now - reactor->tsc_last;
+ } else if (rc > 0) {
+ reactor->busy_tsc += now - reactor->tsc_last;
+ }
+ reactor->tsc_last = now;
+
+ if (spdk_unlikely(lw_thread->resched)) {
+ lw_thread->resched = false;
+ TAILQ_REMOVE(&reactor->threads, lw_thread, link);
+ assert(reactor->thread_count > 0);
+ reactor->thread_count--;
+ _reactor_schedule_thread(thread);
+ continue;
+ }
+
+ if (spdk_unlikely(spdk_thread_is_exited(thread) &&
+ spdk_thread_is_idle(thread))) {
+ TAILQ_REMOVE(&reactor->threads, lw_thread, link);
+ assert(reactor->thread_count > 0);
+ reactor->thread_count--;
+ spdk_thread_destroy(thread);
+ continue;
+ }
+ }
+
+ if (g_framework_context_switch_monitor_enabled) {
+ if ((reactor->last_rusage + g_rusage_period) < reactor->tsc_last) {
+ get_rusage(reactor);
+ reactor->last_rusage = reactor->tsc_last;
+ }
+ }
+}
+
+static int
+reactor_run(void *arg)
+{
+ struct spdk_reactor *reactor = arg;
+ struct spdk_thread *thread;
+ struct spdk_lw_thread *lw_thread, *tmp;
+ char thread_name[32];
+
+ SPDK_NOTICELOG("Reactor started on core %u\n", reactor->lcore);
+
+ /* Rename the POSIX thread because the reactor is tied to the POSIX
+ * thread in the SPDK event library.
+ */
+ snprintf(thread_name, sizeof(thread_name), "reactor_%u", reactor->lcore);
+ _set_thread_name(thread_name);
+
+ reactor->tsc_last = spdk_get_ticks();
+
+ while (1) {
+ _reactor_run(reactor);
+
+ if (g_reactor_state != SPDK_REACTOR_STATE_RUNNING) {
+ break;
+ }
+ }
+
+ TAILQ_FOREACH(lw_thread, &reactor->threads, link) {
+ thread = spdk_thread_get_from_ctx(lw_thread);
+ spdk_set_thread(thread);
+ spdk_thread_exit(thread);
+ }
+
+ while (!TAILQ_EMPTY(&reactor->threads)) {
+ TAILQ_FOREACH_SAFE(lw_thread, &reactor->threads, link, tmp) {
+ thread = spdk_thread_get_from_ctx(lw_thread);
+ spdk_set_thread(thread);
+ if (spdk_thread_is_exited(thread)) {
+ TAILQ_REMOVE(&reactor->threads, lw_thread, link);
+ assert(reactor->thread_count > 0);
+ reactor->thread_count--;
+ spdk_thread_destroy(thread);
+ } else {
+ spdk_thread_poll(thread, 0, 0);
+ }
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_app_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
+{
+ int ret;
+ struct spdk_cpuset *validmask;
+
+ ret = spdk_cpuset_parse(cpumask, mask);
+ if (ret < 0) {
+ return ret;
+ }
+
+ validmask = spdk_app_get_core_mask();
+ spdk_cpuset_and(cpumask, validmask);
+
+ return 0;
+}
+
+struct spdk_cpuset *
+spdk_app_get_core_mask(void)
+{
+ return &g_reactor_core_mask;
+}
+
+void
+spdk_reactors_start(void)
+{
+ struct spdk_reactor *reactor;
+ struct spdk_cpuset tmp_cpumask = {};
+ uint32_t i, current_core;
+ int rc;
+ char thread_name[32];
+
+ g_rusage_period = (CONTEXT_SWITCH_MONITOR_PERIOD * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC;
+ g_reactor_state = SPDK_REACTOR_STATE_RUNNING;
+
+ current_core = spdk_env_get_current_core();
+ SPDK_ENV_FOREACH_CORE(i) {
+ if (i != current_core) {
+ reactor = spdk_reactor_get(i);
+ if (reactor == NULL) {
+ continue;
+ }
+
+ rc = spdk_env_thread_launch_pinned(reactor->lcore, reactor_run, reactor);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to start reactor thread on core %u\n", reactor->lcore);
+ assert(false);
+ return;
+ }
+
+ /* For now, for each reactor spawn one thread. */
+ snprintf(thread_name, sizeof(thread_name), "reactor_%u", reactor->lcore);
+
+ spdk_cpuset_zero(&tmp_cpumask);
+ spdk_cpuset_set_cpu(&tmp_cpumask, i, true);
+
+ spdk_thread_create(thread_name, &tmp_cpumask);
+ }
+ spdk_cpuset_set_cpu(&g_reactor_core_mask, i, true);
+ }
+
+ /* Start the master reactor */
+ reactor = spdk_reactor_get(current_core);
+ assert(reactor != NULL);
+ reactor_run(reactor);
+
+ spdk_env_thread_wait_all();
+
+ g_reactor_state = SPDK_REACTOR_STATE_SHUTDOWN;
+}
+
+void
+spdk_reactors_stop(void *arg1)
+{
+ g_reactor_state = SPDK_REACTOR_STATE_EXITING;
+}
+
+static pthread_mutex_t g_scheduler_mtx = PTHREAD_MUTEX_INITIALIZER;
+static uint32_t g_next_core = UINT32_MAX;
+
+static void
+_schedule_thread(void *arg1, void *arg2)
+{
+ struct spdk_lw_thread *lw_thread = arg1;
+ struct spdk_thread *thread;
+ struct spdk_cpuset *cpumask;
+ struct spdk_reactor *reactor;
+ uint32_t current_core;
+
+ current_core = spdk_env_get_current_core();
+
+ thread = spdk_thread_get_from_ctx(lw_thread);
+ cpumask = spdk_thread_get_cpumask(thread);
+ if (!spdk_cpuset_get_cpu(cpumask, current_core)) {
+ SPDK_ERRLOG("Thread was scheduled to the wrong core %d\n", current_core);
+ assert(false);
+ }
+
+ reactor = spdk_reactor_get(current_core);
+ assert(reactor != NULL);
+
+ TAILQ_INSERT_TAIL(&reactor->threads, lw_thread, link);
+ reactor->thread_count++;
+}
+
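+/*
+ * Pick the next core allowed by the thread's cpumask (simple round-robin protected by
+ * g_scheduler_mtx) and send a _schedule_thread event there, attaching the lightweight
+ * thread to that core's reactor.
+ */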
+static int
+_reactor_schedule_thread(struct spdk_thread *thread)
+{
+ uint32_t core;
+ struct spdk_lw_thread *lw_thread;
+ struct spdk_event *evt = NULL;
+ struct spdk_cpuset *cpumask;
+ uint32_t i;
+
+ cpumask = spdk_thread_get_cpumask(thread);
+
+ lw_thread = spdk_thread_get_ctx(thread);
+ assert(lw_thread != NULL);
+ memset(lw_thread, 0, sizeof(*lw_thread));
+
+ pthread_mutex_lock(&g_scheduler_mtx);
+ for (i = 0; i < spdk_env_get_core_count(); i++) {
+ if (g_next_core > spdk_env_get_last_core()) {
+ g_next_core = spdk_env_get_first_core();
+ }
+ core = g_next_core;
+ g_next_core = spdk_env_get_next_core(g_next_core);
+
+ if (spdk_cpuset_get_cpu(cpumask, core)) {
+ evt = spdk_event_allocate(core, _schedule_thread, lw_thread, NULL);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_scheduler_mtx);
+
+ assert(evt != NULL);
+ if (evt == NULL) {
+ SPDK_ERRLOG("Unable to schedule thread on requested core mask.\n");
+ return -1;
+ }
+
+ lw_thread->tsc_start = spdk_get_ticks();
+
+ spdk_event_call(evt);
+
+ return 0;
+}
+
+static void
+_reactor_request_thread_reschedule(struct spdk_thread *thread)
+{
+ struct spdk_lw_thread *lw_thread;
+
+ assert(thread == spdk_get_thread());
+
+ lw_thread = spdk_thread_get_ctx(thread);
+
+ assert(lw_thread != NULL);
+
+ lw_thread->resched = true;
+}
+
+static int
+reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op)
+{
+ switch (op) {
+ case SPDK_THREAD_OP_NEW:
+ return _reactor_schedule_thread(thread);
+ case SPDK_THREAD_OP_RESCHED:
+ _reactor_request_thread_reschedule(thread);
+ return 0;
+ default:
+ return -ENOTSUP;
+ }
+}
+
+static bool
+reactor_thread_op_supported(enum spdk_thread_op op)
+{
+ switch (op) {
+ case SPDK_THREAD_OP_NEW:
+ case SPDK_THREAD_OP_RESCHED:
+ return true;
+ default:
+ return false;
+ }
+}
+
+struct call_reactor {
+ uint32_t cur_core;
+ spdk_event_fn fn;
+ void *arg1;
+ void *arg2;
+
+ uint32_t orig_core;
+ spdk_event_fn cpl;
+};
+
+static void
+on_reactor(void *arg1, void *arg2)
+{
+ struct call_reactor *cr = arg1;
+ struct spdk_event *evt;
+
+ cr->fn(cr->arg1, cr->arg2);
+
+ cr->cur_core = spdk_env_get_next_core(cr->cur_core);
+
+ if (cr->cur_core > spdk_env_get_last_core()) {
+ SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Completed reactor iteration\n");
+
+ evt = spdk_event_allocate(cr->orig_core, cr->cpl, cr->arg1, cr->arg2);
+ free(cr);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Continuing reactor iteration to %d\n",
+ cr->cur_core);
+
+ evt = spdk_event_allocate(cr->cur_core, on_reactor, arg1, NULL);
+ }
+ assert(evt != NULL);
+ spdk_event_call(evt);
+}
+
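+/*
+ * Execute 'fn' once on every reactor by chaining events core by core, starting from the
+ * first core. After the last core has run 'fn', the completion callback 'cpl' is sent
+ * back to the core that originated the iteration.
+ */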
+void
+spdk_for_each_reactor(spdk_event_fn fn, void *arg1, void *arg2, spdk_event_fn cpl)
+{
+ struct call_reactor *cr;
+ struct spdk_event *evt;
+
+ cr = calloc(1, sizeof(*cr));
+ if (!cr) {
+ SPDK_ERRLOG("Unable to perform reactor iteration\n");
+ cpl(arg1, arg2);
+ return;
+ }
+
+ cr->fn = fn;
+ cr->arg1 = arg1;
+ cr->arg2 = arg2;
+ cr->cpl = cpl;
+ cr->orig_core = spdk_env_get_current_core();
+ cr->cur_core = spdk_env_get_first_core();
+
+ SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "Starting reactor iteration from %d\n", cr->orig_core);
+
+ evt = spdk_event_allocate(cr->cur_core, on_reactor, cr, NULL);
+ assert(evt != NULL);
+
+ spdk_event_call(evt);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("reactor", SPDK_LOG_REACTOR)
diff --git a/src/spdk/lib/event/rpc.c b/src/spdk/lib/event/rpc.c
new file mode 100644
index 000000000..a42d5ebeb
--- /dev/null
+++ b/src/spdk/lib/event/rpc.c
@@ -0,0 +1,87 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/conf.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/log.h"
+#include "spdk/rpc.h"
+
+#include "spdk_internal/event.h"
+
+#define RPC_SELECT_INTERVAL 4000 /* 4ms */
+
+static struct spdk_poller *g_rpc_poller = NULL;
+
+static int
+rpc_subsystem_poll(void *arg)
+{
+ spdk_rpc_accept();
+ return SPDK_POLLER_BUSY;
+}
+
+void
+spdk_rpc_initialize(const char *listen_addr)
+{
+ int rc;
+
+ if (listen_addr == NULL) {
+ return;
+ }
+
+ if (!spdk_rpc_verify_methods()) {
+ spdk_app_stop(-EINVAL);
+ return;
+ }
+
+ /* Listen on the requested address */
+ rc = spdk_rpc_listen(listen_addr);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to start RPC service at %s\n", listen_addr);
+ return;
+ }
+
+ spdk_rpc_set_state(SPDK_RPC_STARTUP);
+
+ /* Register a poller to periodically check for RPCs */
+ g_rpc_poller = SPDK_POLLER_REGISTER(rpc_subsystem_poll, NULL, RPC_SELECT_INTERVAL);
+}
+
+void
+spdk_rpc_finish(void)
+{
+ spdk_rpc_close();
+ spdk_poller_unregister(&g_rpc_poller);
+}
diff --git a/src/spdk/lib/event/spdk_event.map b/src/spdk/lib/event/spdk_event.map
new file mode 100644
index 000000000..8208c5e1f
--- /dev/null
+++ b/src/spdk/lib/event/spdk_event.map
@@ -0,0 +1,46 @@
+{
+ global:
+
+ # Public functions
+ spdk_app_opts_init;
+ spdk_app_start;
+ spdk_app_fini;
+ spdk_app_start_shutdown;
+ spdk_app_stop;
+ spdk_app_get_running_config;
+ spdk_app_get_shm_id;
+ spdk_app_parse_core_mask;
+ spdk_app_get_core_mask;
+ spdk_app_parse_args;
+ spdk_app_usage;
+ spdk_event_allocate;
+ spdk_event_call;
+ spdk_framework_enable_context_switch_monitor;
+ spdk_framework_context_switch_monitor_enabled;
+
+ # Functions used by other SPDK libraries
+ spdk_reactors_init;
+ spdk_reactors_fini;
+ spdk_reactors_start;
+ spdk_reactors_stop;
+ spdk_reactor_get;
+ spdk_for_each_reactor;
+ spdk_subsystem_find;
+ spdk_subsystem_get_first;
+ spdk_subsystem_get_next;
+ spdk_subsystem_get_first_depend;
+ spdk_subsystem_get_next_depend;
+ spdk_add_subsystem;
+ spdk_add_subsystem_depend;
+ spdk_subsystem_init;
+ spdk_subsystem_fini;
+ spdk_subsystem_init_next;
+ spdk_subsystem_fini_next;
+ spdk_subsystem_config;
+ spdk_app_json_config_load;
+ spdk_subsystem_config_json;
+ spdk_rpc_initialize;
+ spdk_rpc_finish;
+
+ local: *;
+};
diff --git a/src/spdk/lib/event/subsystem.c b/src/spdk/lib/event/subsystem.c
new file mode 100644
index 000000000..2cff890b2
--- /dev/null
+++ b/src/spdk/lib/event/subsystem.c
@@ -0,0 +1,288 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/log.h"
+#include "spdk/thread.h"
+
+#include "spdk_internal/event.h"
+#include "spdk/env.h"
+
+TAILQ_HEAD(spdk_subsystem_list, spdk_subsystem);
+struct spdk_subsystem_list g_subsystems = TAILQ_HEAD_INITIALIZER(g_subsystems);
+
+TAILQ_HEAD(spdk_subsystem_depend_list, spdk_subsystem_depend);
+struct spdk_subsystem_depend_list g_subsystems_deps = TAILQ_HEAD_INITIALIZER(g_subsystems_deps);
+static struct spdk_subsystem *g_next_subsystem;
+static bool g_subsystems_initialized = false;
+static bool g_subsystems_init_interrupted = false;
+static spdk_subsystem_init_fn g_subsystem_start_fn = NULL;
+static void *g_subsystem_start_arg = NULL;
+static spdk_msg_fn g_subsystem_stop_fn = NULL;
+static void *g_subsystem_stop_arg = NULL;
+static struct spdk_thread *g_fini_thread = NULL;
+
+void
+spdk_add_subsystem(struct spdk_subsystem *subsystem)
+{
+ TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq);
+}
+
+void
+spdk_add_subsystem_depend(struct spdk_subsystem_depend *depend)
+{
+ TAILQ_INSERT_TAIL(&g_subsystems_deps, depend, tailq);
+}
+
+static struct spdk_subsystem *
+_subsystem_find(struct spdk_subsystem_list *list, const char *name)
+{
+ struct spdk_subsystem *iter;
+
+ TAILQ_FOREACH(iter, list, tailq) {
+ if (strcmp(name, iter->name) == 0) {
+ return iter;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_subsystem *
+spdk_subsystem_find(const char *name)
+{
+ return _subsystem_find(&g_subsystems, name);
+}
+
+struct spdk_subsystem *
+spdk_subsystem_get_first(void)
+{
+ return TAILQ_FIRST(&g_subsystems);
+}
+
+struct spdk_subsystem *
+spdk_subsystem_get_next(struct spdk_subsystem *cur_subsystem)
+{
+ return TAILQ_NEXT(cur_subsystem, tailq);
+}
+
+
+struct spdk_subsystem_depend *
+spdk_subsystem_get_first_depend(void)
+{
+ return TAILQ_FIRST(&g_subsystems_deps);
+}
+
+struct spdk_subsystem_depend *
+spdk_subsystem_get_next_depend(struct spdk_subsystem_depend *cur_depend)
+{
+ return TAILQ_NEXT(cur_depend, tailq);
+}
+
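+/*
+ * Sort g_subsystems so that every subsystem appears after the subsystems it depends on
+ * (as registered via spdk_add_subsystem_depend). Entries without unsatisfied
+ * dependencies are moved to a temporary list; the pass repeats until the original list
+ * is empty, then the sorted order is moved back.
+ */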
+static void
+subsystem_sort(void)
+{
+ bool depends_on, depends_on_sorted;
+ struct spdk_subsystem *subsystem, *subsystem_tmp;
+ struct spdk_subsystem_depend *subsystem_dep;
+
+ struct spdk_subsystem_list subsystems_list = TAILQ_HEAD_INITIALIZER(subsystems_list);
+
+ while (!TAILQ_EMPTY(&g_subsystems)) {
+ TAILQ_FOREACH_SAFE(subsystem, &g_subsystems, tailq, subsystem_tmp) {
+ depends_on = false;
+ TAILQ_FOREACH(subsystem_dep, &g_subsystems_deps, tailq) {
+ if (strcmp(subsystem->name, subsystem_dep->name) == 0) {
+ depends_on = true;
+ depends_on_sorted = !!_subsystem_find(&subsystems_list, subsystem_dep->depends_on);
+ if (depends_on_sorted) {
+ continue;
+ }
+ break;
+ }
+ }
+
+ if (depends_on == false) {
+ TAILQ_REMOVE(&g_subsystems, subsystem, tailq);
+ TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq);
+ } else {
+ if (depends_on_sorted == true) {
+ TAILQ_REMOVE(&g_subsystems, subsystem, tailq);
+ TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq);
+ }
+ }
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(subsystem, &subsystems_list, tailq, subsystem_tmp) {
+ TAILQ_REMOVE(&subsystems_list, subsystem, tailq);
+ TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq);
+ }
+}
+
+void
+spdk_subsystem_init_next(int rc)
+{
+ /* The initialization has been interrupted by spdk_subsystem_fini, so just return */
+ if (g_subsystems_init_interrupted) {
+ return;
+ }
+
+ if (rc) {
+ SPDK_ERRLOG("Init subsystem %s failed\n", g_next_subsystem->name);
+ g_subsystem_start_fn(rc, g_subsystem_start_arg);
+ return;
+ }
+
+ if (!g_next_subsystem) {
+ g_next_subsystem = TAILQ_FIRST(&g_subsystems);
+ } else {
+ g_next_subsystem = TAILQ_NEXT(g_next_subsystem, tailq);
+ }
+
+ if (!g_next_subsystem) {
+ g_subsystems_initialized = true;
+ g_subsystem_start_fn(0, g_subsystem_start_arg);
+ return;
+ }
+
+ if (g_next_subsystem->init) {
+ g_next_subsystem->init();
+ } else {
+ spdk_subsystem_init_next(0);
+ }
+}
+
+void
+spdk_subsystem_init(spdk_subsystem_init_fn cb_fn, void *cb_arg)
+{
+ struct spdk_subsystem_depend *dep;
+
+ g_subsystem_start_fn = cb_fn;
+ g_subsystem_start_arg = cb_arg;
+
+ /* Verify that the subsystems named in each dependency (both name and depends_on) are registered */
+ TAILQ_FOREACH(dep, &g_subsystems_deps, tailq) {
+ if (!spdk_subsystem_find(dep->name)) {
+ SPDK_ERRLOG("subsystem %s is missing\n", dep->name);
+ g_subsystem_start_fn(-1, g_subsystem_start_arg);
+ return;
+ }
+ if (!spdk_subsystem_find(dep->depends_on)) {
+ SPDK_ERRLOG("subsystem %s dependency %s is missing\n",
+ dep->name, dep->depends_on);
+ g_subsystem_start_fn(-1, g_subsystem_start_arg);
+ return;
+ }
+ }
+
+ subsystem_sort();
+
+ spdk_subsystem_init_next(0);
+}
+
+static void
+subsystem_fini_next(void *arg1)
+{
+ assert(g_fini_thread == spdk_get_thread());
+
+ if (!g_next_subsystem) {
+ /* If the initialized flag is false, then we've failed to initialize
+ * the very first subsystem and no de-init is needed
+ */
+ if (g_subsystems_initialized) {
+ g_next_subsystem = TAILQ_LAST(&g_subsystems, spdk_subsystem_list);
+ }
+ } else {
+ if (g_subsystems_initialized || g_subsystems_init_interrupted) {
+ g_next_subsystem = TAILQ_PREV(g_next_subsystem, spdk_subsystem_list, tailq);
+ } else {
+ g_subsystems_init_interrupted = true;
+ }
+ }
+
+ while (g_next_subsystem) {
+ if (g_next_subsystem->fini) {
+ g_next_subsystem->fini();
+ return;
+ }
+ g_next_subsystem = TAILQ_PREV(g_next_subsystem, spdk_subsystem_list, tailq);
+ }
+
+ g_subsystem_stop_fn(g_subsystem_stop_arg);
+ return;
+}
+
+void
+spdk_subsystem_fini_next(void)
+{
+ if (g_fini_thread != spdk_get_thread()) {
+ spdk_thread_send_msg(g_fini_thread, subsystem_fini_next, NULL);
+ } else {
+ subsystem_fini_next(NULL);
+ }
+}
+
+void
+spdk_subsystem_fini(spdk_msg_fn cb_fn, void *cb_arg)
+{
+ g_subsystem_stop_fn = cb_fn;
+ g_subsystem_stop_arg = cb_arg;
+
+ g_fini_thread = spdk_get_thread();
+
+ spdk_subsystem_fini_next();
+}
+
+void
+spdk_subsystem_config(FILE *fp)
+{
+ struct spdk_subsystem *subsystem;
+
+ TAILQ_FOREACH(subsystem, &g_subsystems, tailq) {
+ if (subsystem->config) {
+ subsystem->config(fp);
+ }
+ }
+}
+
+void
+spdk_subsystem_config_json(struct spdk_json_write_ctx *w, struct spdk_subsystem *subsystem)
+{
+ if (subsystem && subsystem->write_config_json) {
+ subsystem->write_config_json(w);
+ } else {
+ spdk_json_write_null(w);
+ }
+}
diff --git a/src/spdk/lib/ftl/Makefile b/src/spdk/lib/ftl/Makefile
new file mode 100644
index 000000000..c24274622
--- /dev/null
+++ b/src/spdk/lib/ftl/Makefile
@@ -0,0 +1,47 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = ftl_band.c ftl_core.c ftl_debug.c ftl_io.c ftl_reloc.c \
+ ftl_restore.c ftl_init.c ftl_trace.c
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ftl.map)
+
+LIBNAME = ftl
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/ftl/ftl_addr.h b/src/spdk/lib/ftl/ftl_addr.h
new file mode 100644
index 000000000..36d2ffb00
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_addr.h
@@ -0,0 +1,76 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FTL_ADDR_H
+#define FTL_ADDR_H
+
+#include "spdk/stdinc.h"
+
+/* Marks address as invalid */
+#define FTL_ADDR_INVALID (-1)
+/* Marks LBA as invalid */
+#define FTL_LBA_INVALID ((uint64_t)-1)
+/* Smallest data unit size */
+#define FTL_BLOCK_SIZE 4096
+
+/* This structure represents on-disk address. It can have one of the following */
+/* formats: */
+/* - offset inside the disk */
+/* - cache_offset inside the cache (indicated by the cached flag) */
+/* - packed version of the two formats above (can be only used when the */
+/* offset can be represented in less than 32 bits) */
+/* Packed format is used, when possible, to avoid wasting RAM on the L2P table. */
+struct ftl_addr {
+ union {
+ struct {
+ uint64_t cache_offset : 63;
+ uint64_t cached : 1;
+ };
+
+ struct {
+ union {
+ struct {
+ uint32_t cache_offset : 31;
+ uint32_t cached : 1;
+ };
+
+ uint32_t offset;
+ };
+ uint32_t rsvd;
+ } pack;
+
+ uint64_t offset;
+ };
+};
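+
+/*
+ * Note: in the packed form only the low 32 bits carry information (either 'offset' or
+ * 'cache_offset' plus the 'cached' flag); 'rsvd' only pads the union out to the full
+ * 64-bit address.
+ */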
+
+#endif /* FTL_ADDR_H */
diff --git a/src/spdk/lib/ftl/ftl_band.c b/src/spdk/lib/ftl/ftl_band.c
new file mode 100644
index 000000000..62221dcf6
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_band.c
@@ -0,0 +1,1097 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/crc32.h"
+#include "spdk/likely.h"
+#include "spdk/util.h"
+#include "spdk/ftl.h"
+
+#include "ftl_band.h"
+#include "ftl_io.h"
+#include "ftl_core.h"
+#include "ftl_reloc.h"
+#include "ftl_debug.h"
+
+/* TODO: define some signature for meta version */
+#define FTL_MD_VER 1
+
+struct __attribute__((packed)) ftl_md_hdr {
+ /* Device instance */
+ struct spdk_uuid uuid;
+
+ /* Meta version */
+ uint8_t ver;
+
+ /* Sequence number */
+ uint64_t seq;
+
+ /* CRC32 checksum */
+ uint32_t checksum;
+};
+
+/* End metadata layout stored on media (with all three being aligned to block size): */
+/* - header */
+/* - valid bitmap */
+/* - LBA map */
+struct __attribute__((packed)) ftl_tail_md {
+ struct ftl_md_hdr hdr;
+
+ /* Max number of blocks */
+ uint64_t num_blocks;
+
+ uint8_t reserved[4059];
+};
+SPDK_STATIC_ASSERT(sizeof(struct ftl_tail_md) == FTL_BLOCK_SIZE, "Incorrect metadata size");
+
+struct __attribute__((packed)) ftl_head_md {
+ struct ftl_md_hdr hdr;
+
+ /* Number of defrag cycles */
+ uint64_t wr_cnt;
+
+ /* Number of surfaced LBAs */
+ uint64_t lba_cnt;
+
+ /* Transfer size */
+ uint32_t xfer_size;
+};
+
+size_t
+ftl_tail_md_hdr_num_blocks(void)
+{
+ return spdk_divide_round_up(sizeof(struct ftl_tail_md), FTL_BLOCK_SIZE);
+}
+
+size_t
+ftl_vld_map_num_blocks(const struct spdk_ftl_dev *dev)
+{
+ return spdk_divide_round_up(ftl_vld_map_size(dev), FTL_BLOCK_SIZE);
+}
+
+size_t
+ftl_lba_map_num_blocks(const struct spdk_ftl_dev *dev)
+{
+ return spdk_divide_round_up(ftl_get_num_blocks_in_band(dev) * sizeof(uint64_t), FTL_BLOCK_SIZE);
+}
+
+size_t
+ftl_head_md_num_blocks(const struct spdk_ftl_dev *dev)
+{
+ return dev->xfer_size;
+}
+
+size_t
+ftl_tail_md_num_blocks(const struct spdk_ftl_dev *dev)
+{
+ return spdk_divide_round_up(ftl_tail_md_hdr_num_blocks() +
+ ftl_vld_map_num_blocks(dev) +
+ ftl_lba_map_num_blocks(dev),
+ dev->xfer_size) * dev->xfer_size;
+}
+
+static uint64_t
+ftl_band_tail_md_offset(const struct ftl_band *band)
+{
+ return ftl_band_num_usable_blocks(band) -
+ ftl_tail_md_num_blocks(band->dev);
+}
+
+int
+ftl_band_full(struct ftl_band *band, size_t offset)
+{
+ return offset == ftl_band_tail_md_offset(band);
+}
+
+void
+ftl_band_write_failed(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+
+ band->high_prio = 1;
+
+ ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 1, true);
+ ftl_band_set_state(band, FTL_BAND_STATE_CLOSED);
+}
+
+static void
+ftl_band_free_lba_map(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_lba_map *lba_map = &band->lba_map;
+
+ assert(band->state == FTL_BAND_STATE_CLOSED ||
+ band->state == FTL_BAND_STATE_FREE);
+ assert(lba_map->ref_cnt == 0);
+ assert(lba_map->map != NULL);
+ assert(!band->high_prio);
+
+ /* Verify that band's metadata is consistent with l2p */
+ if (band->num_zones) {
+ assert(ftl_band_validate_md(band) == true);
+ }
+
+ spdk_mempool_put(dev->lba_pool, lba_map->dma_buf);
+ lba_map->map = NULL;
+ lba_map->dma_buf = NULL;
+}
+
+static void
+_ftl_band_set_free(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_band *lband, *prev;
+
+ /* Remove the band from the closed band list */
+ LIST_REMOVE(band, list_entry);
+
+ /* Keep the list sorted by band's write count */
+ LIST_FOREACH(lband, &dev->free_bands, list_entry) {
+ if (lband->wr_cnt > band->wr_cnt) {
+ LIST_INSERT_BEFORE(lband, band, list_entry);
+ break;
+ }
+ prev = lband;
+ }
+
+ if (!lband) {
+ if (LIST_EMPTY(&dev->free_bands)) {
+ LIST_INSERT_HEAD(&dev->free_bands, band, list_entry);
+ } else {
+ LIST_INSERT_AFTER(prev, band, list_entry);
+ }
+ }
+
+#if defined(DEBUG)
+ prev = NULL;
+ LIST_FOREACH(lband, &dev->free_bands, list_entry) {
+ if (!prev) {
+ continue;
+ }
+ assert(prev->wr_cnt <= lband->wr_cnt);
+ }
+#endif
+ dev->num_free++;
+ ftl_apply_limits(dev);
+}
+
+static void
+_ftl_band_set_preparing(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+
+ /* Remove band from free list */
+ LIST_REMOVE(band, list_entry);
+
+ band->wr_cnt++;
+
+ assert(dev->num_free > 0);
+ dev->num_free--;
+
+ ftl_apply_limits(dev);
+}
+
+static void
+_ftl_band_set_closed(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+
+ /* Set the state as free_md() checks for that */
+ band->state = FTL_BAND_STATE_CLOSED;
+
+ /* Free the lba map if there are no outstanding IOs */
+ ftl_band_release_lba_map(band);
+
+ if (spdk_likely(band->num_zones)) {
+ LIST_INSERT_HEAD(&dev->shut_bands, band, list_entry);
+ } else {
+ LIST_REMOVE(band, list_entry);
+ }
+}
+
+static uint32_t
+ftl_md_calc_crc(const struct ftl_md_hdr *hdr, size_t size)
+{
+ size_t checkoff = offsetof(struct ftl_md_hdr, checksum);
+ size_t mdoff = checkoff + sizeof(hdr->checksum);
+ uint32_t crc;
+
+ crc = spdk_crc32c_update(hdr, checkoff, 0);
+ return spdk_crc32c_update((const char *)hdr + mdoff, size - mdoff, crc);
+}
+
+static void
+ftl_set_md_hdr(struct ftl_band *band, struct ftl_md_hdr *hdr, size_t size)
+{
+ hdr->seq = band->seq;
+ hdr->ver = FTL_MD_VER;
+ hdr->uuid = band->dev->uuid;
+ hdr->checksum = ftl_md_calc_crc(hdr, size);
+}
+
+static int
+ftl_pack_head_md(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_head_md *head = band->lba_map.dma_buf;
+
+ head->wr_cnt = band->wr_cnt;
+ head->lba_cnt = dev->num_lbas;
+ head->xfer_size = dev->xfer_size;
+ ftl_set_md_hdr(band, &head->hdr, sizeof(struct ftl_head_md));
+
+ return FTL_MD_SUCCESS;
+}
+
+static int
+ftl_pack_tail_md(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_lba_map *lba_map = &band->lba_map;
+ struct ftl_tail_md *tail = lba_map->dma_buf;
+ void *vld_offset;
+
+ vld_offset = (char *)tail + ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE;
+
+ /* Clear out the buffer */
+ memset(tail, 0, ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE);
+ tail->num_blocks = ftl_get_num_blocks_in_band(dev);
+
+ pthread_spin_lock(&lba_map->lock);
+ spdk_bit_array_store_mask(lba_map->vld, vld_offset);
+ pthread_spin_unlock(&lba_map->lock);
+
+ ftl_set_md_hdr(band, &tail->hdr, ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE);
+
+ return FTL_MD_SUCCESS;
+}
+
+static int
+ftl_md_hdr_vld(struct spdk_ftl_dev *dev, const struct ftl_md_hdr *hdr, size_t size)
+{
+ if (spdk_uuid_compare(&dev->uuid, &hdr->uuid) != 0) {
+ return FTL_MD_NO_MD;
+ }
+
+ if (hdr->ver != FTL_MD_VER) {
+ return FTL_MD_INVALID_VER;
+ }
+
+ if (ftl_md_calc_crc(hdr, size) != hdr->checksum) {
+ return FTL_MD_INVALID_CRC;
+ }
+
+ return FTL_MD_SUCCESS;
+}
+
+static int
+ftl_unpack_tail_md(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ void *vld_offset;
+ struct ftl_lba_map *lba_map = &band->lba_map;
+ struct ftl_tail_md *tail = lba_map->dma_buf;
+ int rc;
+
+ vld_offset = (char *)tail + ftl_tail_md_hdr_num_blocks() * FTL_BLOCK_SIZE;
+
+ rc = ftl_md_hdr_vld(dev, &tail->hdr, ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE);
+ if (rc) {
+ return rc;
+ }
+
+ /*
+ * When restoring from a dirty shutdown it's possible that the old tail metadata wasn't
+ * yet cleared - the band had saved its head metadata, but didn't manage to send an
+ * erase to all zones. The tail md header that was found is valid, but inconsistent
+ * with the head metadata. Treat such a band as open/without valid tail md.
+ */
+ if (band->seq != tail->hdr.seq) {
+ return FTL_MD_NO_MD;
+ }
+
+ if (tail->num_blocks != ftl_get_num_blocks_in_band(dev)) {
+ return FTL_MD_INVALID_SIZE;
+ }
+
+ spdk_bit_array_load_mask(lba_map->vld, vld_offset);
+
+ return FTL_MD_SUCCESS;
+}
+
+static int
+ftl_unpack_head_md(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_head_md *head = band->lba_map.dma_buf;
+ int rc;
+
+ rc = ftl_md_hdr_vld(dev, &head->hdr, sizeof(struct ftl_head_md));
+ if (rc) {
+ return rc;
+ }
+
+ band->seq = head->hdr.seq;
+ band->wr_cnt = head->wr_cnt;
+
+ if (dev->global_md.num_lbas == 0) {
+ dev->global_md.num_lbas = head->lba_cnt;
+ }
+
+ if (dev->global_md.num_lbas != head->lba_cnt) {
+ return FTL_MD_INVALID_SIZE;
+ }
+
+ if (dev->xfer_size != head->xfer_size) {
+ return FTL_MD_INVALID_SIZE;
+ }
+
+ return FTL_MD_SUCCESS;
+}
+
+struct ftl_addr
+ftl_band_tail_md_addr(struct ftl_band *band)
+{
+ struct ftl_addr addr = {};
+ struct ftl_zone *zone;
+ struct spdk_ftl_dev *dev = band->dev;
+ size_t xfer_size = dev->xfer_size;
+ size_t num_req = ftl_band_tail_md_offset(band) / xfer_size;
+ size_t i;
+
+ if (spdk_unlikely(!band->num_zones)) {
+ return ftl_to_addr(FTL_ADDR_INVALID);
+ }
+
+ /* Metadata should be aligned to xfer size */
+ assert(ftl_band_tail_md_offset(band) % xfer_size == 0);
+
+ zone = CIRCLEQ_FIRST(&band->zones);
+ for (i = 0; i < num_req % band->num_zones; ++i) {
+ zone = ftl_band_next_zone(band, zone);
+ }
+
+ addr.offset = (num_req / band->num_zones) * xfer_size;
+ addr.offset += zone->info.zone_id;
+
+ return addr;
+}
+
+struct ftl_addr
+ftl_band_head_md_addr(struct ftl_band *band)
+{
+ if (spdk_unlikely(!band->num_zones)) {
+ return ftl_to_addr(FTL_ADDR_INVALID);
+ }
+
+ return ftl_to_addr(CIRCLEQ_FIRST(&band->zones)->info.zone_id);
+}
+
+void
+ftl_band_set_state(struct ftl_band *band, enum ftl_band_state state)
+{
+ switch (state) {
+ case FTL_BAND_STATE_FREE:
+ assert(band->state == FTL_BAND_STATE_CLOSED);
+ _ftl_band_set_free(band);
+ break;
+
+ case FTL_BAND_STATE_PREP:
+ assert(band->state == FTL_BAND_STATE_FREE);
+ _ftl_band_set_preparing(band);
+ break;
+
+ case FTL_BAND_STATE_CLOSED:
+ if (band->state != FTL_BAND_STATE_CLOSED) {
+ assert(band->state == FTL_BAND_STATE_CLOSING || band->high_prio);
+ _ftl_band_set_closed(band);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ band->state = state;
+}
+
+void
+ftl_band_set_addr(struct ftl_band *band, uint64_t lba, struct ftl_addr addr)
+{
+ struct ftl_lba_map *lba_map = &band->lba_map;
+ uint64_t offset;
+
+ assert(lba != FTL_LBA_INVALID);
+
+ offset = ftl_band_block_offset_from_addr(band, addr);
+ pthread_spin_lock(&lba_map->lock);
+
+ lba_map->num_vld++;
+ lba_map->map[offset] = lba;
+ spdk_bit_array_set(lba_map->vld, offset);
+
+ pthread_spin_unlock(&lba_map->lock);
+}
+
+size_t
+ftl_band_age(const struct ftl_band *band)
+{
+ return (size_t)(band->dev->seq - band->seq);
+}
+
+size_t
+ftl_band_num_usable_blocks(const struct ftl_band *band)
+{
+ return band->num_zones * ftl_get_num_blocks_in_zone(band->dev);
+}
+
+size_t
+ftl_band_user_blocks_left(const struct ftl_band *band, size_t offset)
+{
+ size_t tail_md_offset = ftl_band_tail_md_offset(band);
+
+ if (spdk_unlikely(offset <= ftl_head_md_num_blocks(band->dev))) {
+ return ftl_band_user_blocks(band);
+ }
+
+ if (spdk_unlikely(offset > tail_md_offset)) {
+ return 0;
+ }
+
+ return tail_md_offset - offset;
+}
+
+size_t
+ftl_band_user_blocks(const struct ftl_band *band)
+{
+ return ftl_band_num_usable_blocks(band) -
+ ftl_head_md_num_blocks(band->dev) -
+ ftl_tail_md_num_blocks(band->dev);
+}
+
+struct ftl_band *
+ftl_band_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ size_t band_id = ftl_addr_get_band(dev, addr);
+
+ assert(band_id < ftl_get_num_bands(dev));
+ return &dev->bands[band_id];
+}
+
+struct ftl_zone *
+ftl_band_zone_from_addr(struct ftl_band *band, struct ftl_addr addr)
+{
+ size_t pu_id = ftl_addr_get_punit(band->dev, addr);
+
+ assert(pu_id < ftl_get_num_punits(band->dev));
+ return &band->zone_buf[pu_id];
+}
+
+uint64_t
+ftl_band_block_offset_from_addr(struct ftl_band *band, struct ftl_addr addr)
+{
+ assert(ftl_addr_get_band(band->dev, addr) == band->id);
+ assert(ftl_addr_get_punit(band->dev, addr) < ftl_get_num_punits(band->dev));
+ return addr.offset % ftl_get_num_blocks_in_band(band->dev);
+}
+
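+/*
+ * Advance 'addr' by 'num_blocks' in write (transfer) order within the band. Data is
+ * striped across the band's zones in xfer_size chunks, so whole stripes advance the
+ * in-zone offset by xfer_size, while the remainder walks through the operational zones
+ * one transfer at a time.
+ */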
+struct ftl_addr
+ftl_band_next_xfer_addr(struct ftl_band *band, struct ftl_addr addr, size_t num_blocks)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_zone *zone;
+ size_t num_xfers, num_stripes;
+ uint64_t offset;
+
+ assert(ftl_addr_get_band(dev, addr) == band->id);
+
+ offset = ftl_addr_get_zone_offset(dev, addr);
+ zone = ftl_band_zone_from_addr(band, addr);
+
+ num_blocks += (offset % dev->xfer_size);
+ offset -= (offset % dev->xfer_size);
+
+#if defined(DEBUG)
+ /* Check that the number of zones has not been changed */
+ struct ftl_zone *_zone;
+ size_t _num_zones = 0;
+ CIRCLEQ_FOREACH(_zone, &band->zones, circleq) {
+ if (spdk_likely(_zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE)) {
+ _num_zones++;
+ }
+ }
+ assert(band->num_zones == _num_zones);
+#endif
+ assert(band->num_zones != 0);
+ num_stripes = (num_blocks / dev->xfer_size) / band->num_zones;
+ offset += num_stripes * dev->xfer_size;
+ num_blocks -= num_stripes * dev->xfer_size * band->num_zones;
+
+ if (offset > ftl_get_num_blocks_in_zone(dev)) {
+ return ftl_to_addr(FTL_ADDR_INVALID);
+ }
+
+ num_xfers = num_blocks / dev->xfer_size;
+ for (size_t i = 0; i < num_xfers; ++i) {
+ /* When the last zone is reached the block part of the address */
+ /* needs to be increased by xfer_size */
+ if (ftl_band_zone_is_last(band, zone)) {
+ offset += dev->xfer_size;
+ if (offset > ftl_get_num_blocks_in_zone(dev)) {
+ return ftl_to_addr(FTL_ADDR_INVALID);
+ }
+ }
+
+ zone = ftl_band_next_operational_zone(band, zone);
+ assert(zone);
+
+ num_blocks -= dev->xfer_size;
+ }
+
+ if (num_blocks) {
+ offset += num_blocks;
+ if (offset > ftl_get_num_blocks_in_zone(dev)) {
+ return ftl_to_addr(FTL_ADDR_INVALID);
+ }
+ }
+
+ addr.offset = zone->info.zone_id + offset;
+ return addr;
+}
+
+static size_t
+ftl_xfer_offset_from_addr(struct ftl_band *band, struct ftl_addr addr)
+{
+ struct ftl_zone *zone, *current_zone;
+ unsigned int punit_offset = 0;
+ size_t num_stripes, xfer_size = band->dev->xfer_size;
+ uint64_t offset;
+
+ assert(ftl_addr_get_band(band->dev, addr) == band->id);
+
+ offset = ftl_addr_get_zone_offset(band->dev, addr);
+ num_stripes = (offset / xfer_size) * band->num_zones;
+
+ current_zone = ftl_band_zone_from_addr(band, addr);
+ CIRCLEQ_FOREACH(zone, &band->zones, circleq) {
+ if (current_zone == zone) {
+ break;
+ }
+ punit_offset++;
+ }
+
+ return xfer_size * (num_stripes + punit_offset) + offset % xfer_size;
+}
+
+struct ftl_addr
+ftl_band_addr_from_block_offset(struct ftl_band *band, uint64_t block_off)
+{
+ struct ftl_addr addr = { .offset = 0 };
+
+ addr.offset = block_off + band->id * ftl_get_num_blocks_in_band(band->dev);
+ return addr;
+}
+
+struct ftl_addr
+ftl_band_next_addr(struct ftl_band *band, struct ftl_addr addr, size_t offset)
+{
+ uint64_t block_off = ftl_band_block_offset_from_addr(band, addr);
+ return ftl_band_addr_from_block_offset(band, block_off + offset);
+}
+
+void
+ftl_band_acquire_lba_map(struct ftl_band *band)
+{
+ assert(band->lba_map.map != NULL);
+ band->lba_map.ref_cnt++;
+}
+
+int
+ftl_band_alloc_lba_map(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_lba_map *lba_map = &band->lba_map;
+
+ assert(lba_map->ref_cnt == 0);
+ assert(lba_map->map == NULL);
+
+ lba_map->dma_buf = spdk_mempool_get(dev->lba_pool);
+
+ if (!lba_map->dma_buf) {
+ return -1;
+ }
+
+ memset(lba_map->dma_buf, 0, ftl_lba_map_pool_elem_size(band->dev));
+
+ lba_map->map = (uint64_t *)((char *)lba_map->dma_buf + FTL_BLOCK_SIZE *
+ (ftl_tail_md_hdr_num_blocks() + ftl_vld_map_num_blocks(dev)));
+
+ lba_map->segments = (char *)lba_map->dma_buf + ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE;
+
+ ftl_band_acquire_lba_map(band);
+ return 0;
+}
+
+void
+ftl_band_release_lba_map(struct ftl_band *band)
+{
+ struct ftl_lba_map *lba_map = &band->lba_map;
+
+ assert(lba_map->map != NULL);
+ assert(lba_map->ref_cnt > 0);
+ lba_map->ref_cnt--;
+
+ if (lba_map->ref_cnt == 0) {
+ ftl_band_free_lba_map(band);
+ }
+}
+
+static void
+ftl_read_md_cb(struct ftl_io *io, void *arg, int status)
+{
+ struct ftl_md_io *md_io = (struct ftl_md_io *)io;
+
+ if (!status) {
+ status = md_io->pack_fn(md_io->io.band);
+ } else {
+ status = FTL_MD_IO_FAILURE;
+ }
+
+ md_io->cb_fn(io, md_io->cb_ctx, status);
+}
+
+static struct ftl_md_io *
+ftl_io_init_md_read(struct spdk_ftl_dev *dev, struct ftl_addr addr,
+ struct ftl_band *band, size_t num_blocks, void *buf,
+ ftl_io_fn fn, ftl_md_pack_fn pack_fn, ftl_io_fn cb_fn, void *cb_ctx)
+{
+ struct ftl_md_io *io;
+ struct ftl_io_init_opts opts = {
+ .dev = dev,
+ .io = NULL,
+ .band = band,
+ .size = sizeof(*io),
+ .flags = FTL_IO_MD | FTL_IO_PHYSICAL_MODE,
+ .type = FTL_IO_READ,
+ .num_blocks = num_blocks,
+ .cb_fn = fn,
+ .iovs = {
+ {
+ .iov_base = buf,
+ .iov_len = num_blocks * FTL_BLOCK_SIZE,
+ }
+ },
+ .iovcnt = 1,
+ };
+
+ io = (struct ftl_md_io *)ftl_io_init_internal(&opts);
+ if (!io) {
+ return NULL;
+ }
+
+ io->io.addr = addr;
+ io->pack_fn = pack_fn;
+ io->cb_fn = cb_fn;
+ io->cb_ctx = cb_ctx;
+
+ return io;
+}
+
+static struct ftl_io *
+ftl_io_init_md_write(struct spdk_ftl_dev *dev, struct ftl_band *band,
+ void *data, size_t num_blocks, ftl_io_fn cb)
+{
+ struct ftl_io_init_opts opts = {
+ .dev = dev,
+ .io = NULL,
+ .band = band,
+ .size = sizeof(struct ftl_io),
+ .flags = FTL_IO_MD | FTL_IO_PHYSICAL_MODE,
+ .type = FTL_IO_WRITE,
+ .num_blocks = num_blocks,
+ .cb_fn = cb,
+ .iovs = {
+ {
+ .iov_base = data,
+ .iov_len = num_blocks * FTL_BLOCK_SIZE,
+ }
+ },
+ .iovcnt = 1,
+ .md = NULL,
+ };
+
+ return ftl_io_init_internal(&opts);
+}
+
+static int
+ftl_band_write_md(struct ftl_band *band, size_t num_blocks,
+ ftl_md_pack_fn md_fn, ftl_io_fn cb)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_io *io;
+
+ io = ftl_io_init_md_write(dev, band, band->lba_map.dma_buf, num_blocks, cb);
+ if (!io) {
+ return -ENOMEM;
+ }
+
+ md_fn(band);
+
+ ftl_io_write(io);
+ return 0;
+}
+
+void
+ftl_band_md_clear(struct ftl_band *band)
+{
+ band->seq = 0;
+ band->wr_cnt = 0;
+ band->lba_map.num_vld = 0;
+ band->lba_map.map = NULL;
+}
+
+int
+ftl_band_write_head_md(struct ftl_band *band, ftl_io_fn cb)
+{
+ return ftl_band_write_md(band, ftl_head_md_num_blocks(band->dev),
+ ftl_pack_head_md, cb);
+}
+
+int
+ftl_band_write_tail_md(struct ftl_band *band, ftl_io_fn cb)
+{
+ return ftl_band_write_md(band, ftl_tail_md_num_blocks(band->dev),
+ ftl_pack_tail_md, cb);
+}
+
+static struct ftl_addr
+ftl_band_lba_map_addr(struct ftl_band *band, size_t offset)
+{
+ return ftl_band_next_xfer_addr(band, band->tail_md_addr,
+ ftl_tail_md_hdr_num_blocks() +
+ ftl_vld_map_num_blocks(band->dev) +
+ offset);
+}
+
+static int
+ftl_band_read_md(struct ftl_band *band, size_t num_blocks, struct ftl_addr start_addr,
+ void *buf, ftl_io_fn fn, ftl_md_pack_fn pack_fn, ftl_io_fn cb_fn, void *cb_ctx)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_md_io *io;
+
+ if (spdk_unlikely(!band->num_zones)) {
+ return -ENOENT;
+ }
+
+ io = ftl_io_init_md_read(dev, start_addr, band, num_blocks, buf, fn, pack_fn, cb_fn, cb_ctx);
+ if (!io) {
+ return -ENOMEM;
+ }
+
+ ftl_io_read((struct ftl_io *)io);
+ return 0;
+}
+
+int
+ftl_band_read_tail_md(struct ftl_band *band, struct ftl_addr addr, ftl_io_fn cb_fn, void *cb_ctx)
+{
+ return ftl_band_read_md(band, ftl_tail_md_num_blocks(band->dev), addr, band->lba_map.dma_buf,
+ ftl_read_md_cb, ftl_unpack_tail_md, cb_fn, cb_ctx);
+}
+
+static size_t
+ftl_lba_map_request_segment_done(struct ftl_lba_map_request *request, size_t offset,
+ size_t num_segments)
+{
+ size_t i, num_done = 0;
+
+ for (i = offset; i < offset + num_segments; ++i) {
+ if (spdk_bit_array_get(request->segments, i)) {
+ spdk_bit_array_clear(request->segments, i);
+ num_done++;
+ }
+ }
+
+ assert(request->num_pending >= num_done);
+ request->num_pending -= num_done;
+
+ return num_done;
+}
+
+static void
+ftl_lba_map_set_segment_state(struct ftl_lba_map *lba_map, size_t offset, size_t num_segments,
+ enum ftl_lba_map_seg_state state)
+{
+ size_t i;
+
+ for (i = offset; i < offset + num_segments; ++i) {
+ lba_map->segments[i] = state;
+ }
+}
+
+static void
+ftl_lba_map_request_free(struct spdk_ftl_dev *dev, struct ftl_lba_map_request *request)
+{
+ spdk_bit_array_clear_mask(request->segments);
+ spdk_mempool_put(dev->lba_request_pool, request);
+}
+
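+/*
+ * Mark the given segment range as done on all queued LBA map requests and
+ * complete those that have no pending segments left or that overlap a failed read.
+ */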
+static void
+ftl_process_lba_map_requests(struct spdk_ftl_dev *dev, struct ftl_lba_map *lba_map, size_t offset,
+ size_t num_segments, int status)
+{
+ struct ftl_lba_map_request *request, *trequest;
+ size_t num_done;
+
+ LIST_FOREACH_SAFE(request, &lba_map->request_list, list_entry, trequest) {
+ num_done = ftl_lba_map_request_segment_done(request, offset, num_segments);
+ if (request->num_pending == 0 || (status && num_done)) {
+ request->cb(NULL, request->cb_ctx, status);
+ LIST_REMOVE(request, list_entry);
+ ftl_lba_map_request_free(dev, request);
+ }
+ }
+}
+
+static size_t
+ftl_lba_map_offset_from_addr(struct ftl_band *band, struct ftl_addr addr)
+{
+ size_t offset;
+ struct ftl_addr start_addr = ftl_band_lba_map_addr(band, 0);
+
+ offset = ftl_xfer_offset_from_addr(band, addr) - ftl_xfer_offset_from_addr(band, start_addr);
+ assert(offset < ftl_lba_map_num_blocks(band->dev));
+
+ return offset;
+}
+
+static void
+ftl_read_lba_map_cb(struct ftl_io *io, void *arg, int status)
+{
+ struct ftl_lba_map *lba_map = &io->band->lba_map;
+ uint64_t block_off;
+
+ block_off = ftl_lba_map_offset_from_addr(io->band, io->addr);
+ assert(block_off + io->num_blocks <= ftl_lba_map_num_blocks(io->dev));
+
+ if (!status) {
+ ftl_lba_map_set_segment_state(lba_map, block_off, io->num_blocks,
+ FTL_LBA_MAP_SEG_CACHED);
+ }
+
+ ftl_process_lba_map_requests(io->dev, lba_map, block_off, io->num_blocks, status);
+}
+
+static struct ftl_lba_map_request *
+ftl_lba_map_alloc_request(struct ftl_band *band, size_t offset, size_t num_segments,
+ ftl_io_fn cb, void *cb_ctx)
+{
+ struct ftl_lba_map_request *request;
+ struct spdk_ftl_dev *dev = band->dev;
+ size_t i;
+
+ request = spdk_mempool_get(dev->lba_request_pool);
+ if (!request) {
+ return NULL;
+ }
+
+ request->cb = cb;
+ request->cb_ctx = cb_ctx;
+ request->num_pending = num_segments;
+
+ for (i = offset; i < offset + num_segments; ++i) {
+ spdk_bit_array_set(request->segments, i);
+ }
+
+ return request;
+}
+
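+/* Count the consecutive run of clear segments starting at offset (at most num_segments) */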
+static size_t
+ftl_lba_map_num_clear_segments(struct ftl_lba_map *lba_map,
+ size_t offset, size_t num_segments)
+{
+ size_t i, cnt = 0;
+
+ for (i = offset; i < offset + num_segments; ++i) {
+ if (lba_map->segments[i] != FTL_LBA_MAP_SEG_CLEAR) {
+ break;
+ }
+ cnt++;
+ }
+
+ return cnt;
+}
+
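+/*
+ * Read the LBA map segments covering lba_cnt LBAs starting at offset.  Cached
+ * segments complete immediately, clear segments are read from disk and marked
+ * pending, and the request stays queued until all of its segments are done.
+ */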
+int
+ftl_band_read_lba_map(struct ftl_band *band, size_t offset, size_t lba_cnt,
+ ftl_io_fn cb_fn, void *cb_ctx)
+{
+ size_t num_blocks, block_off, num_read, num_segments;
+ struct ftl_lba_map *lba_map = &band->lba_map;
+ struct ftl_lba_map_request *request;
+ int rc = 0;
+
+ block_off = offset / FTL_NUM_LBA_IN_BLOCK;
+ num_segments = spdk_divide_round_up(offset + lba_cnt, FTL_NUM_LBA_IN_BLOCK);
+ num_blocks = num_segments - block_off;
+ assert(block_off + num_blocks <= ftl_lba_map_num_blocks(band->dev));
+
+ request = ftl_lba_map_alloc_request(band, block_off, num_blocks, cb_fn, cb_ctx);
+ if (!request) {
+ return -ENOMEM;
+ }
+
+ while (num_blocks) {
+ if (lba_map->segments[block_off] != FTL_LBA_MAP_SEG_CLEAR) {
+ if (lba_map->segments[block_off] == FTL_LBA_MAP_SEG_CACHED) {
+ ftl_lba_map_request_segment_done(request, block_off, 1);
+ }
+ num_blocks--;
+ block_off++;
+ continue;
+ }
+
+ num_read = ftl_lba_map_num_clear_segments(lba_map, block_off, num_blocks);
+ ftl_lba_map_set_segment_state(lba_map, block_off, num_read,
+ FTL_LBA_MAP_SEG_PENDING);
+
+ rc = ftl_band_read_md(band, num_read, ftl_band_lba_map_addr(band, block_off),
+ (char *)band->lba_map.map + block_off * FTL_BLOCK_SIZE,
+ ftl_read_lba_map_cb, NULL, cb_fn, cb_ctx);
+ if (rc) {
+ ftl_lba_map_request_free(band->dev, request);
+ return rc;
+ }
+
+ assert(num_blocks >= num_read);
+ num_blocks -= num_read;
+ block_off += num_read;
+ }
+
+ if (request->num_pending) {
+ LIST_INSERT_HEAD(&lba_map->request_list, request, list_entry);
+ } else {
+ cb_fn(NULL, cb_ctx, 0);
+ ftl_lba_map_request_free(band->dev, request);
+ }
+
+ return rc;
+}
+
+int
+ftl_band_read_head_md(struct ftl_band *band, ftl_io_fn cb_fn, void *cb_ctx)
+{
+ return ftl_band_read_md(band,
+ ftl_head_md_num_blocks(band->dev),
+ ftl_band_head_md_addr(band),
+ band->lba_map.dma_buf,
+ ftl_read_md_cb,
+ ftl_unpack_head_md,
+ cb_fn,
+ cb_ctx);
+}
+
+void
+ftl_band_remove_zone(struct ftl_band *band, struct ftl_zone *zone)
+{
+ CIRCLEQ_REMOVE(&band->zones, zone, circleq);
+ band->num_zones--;
+}
+
+int
+ftl_band_write_prep(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+
+ if (ftl_band_alloc_lba_map(band)) {
+ return -1;
+ }
+
+ band->seq = ++dev->seq;
+ return 0;
+}
+
+struct ftl_zone *
+ftl_band_next_operational_zone(struct ftl_band *band, struct ftl_zone *zone)
+{
+ struct ftl_zone *result = NULL;
+ struct ftl_zone *entry;
+
+ if (spdk_unlikely(!band->num_zones)) {
+ return NULL;
+ }
+
+ /* Erasing a band may fail after it was assigned to a wptr. */
+ /* In such a case the zone is no longer on the band->zones queue. */
+ if (spdk_likely(zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE)) {
+ result = ftl_band_next_zone(band, zone);
+ } else {
+ CIRCLEQ_FOREACH_REVERSE(entry, &band->zones, circleq) {
+ if (entry->info.zone_id > zone->info.zone_id) {
+ result = entry;
+ } else {
+ if (!result) {
+ result = CIRCLEQ_FIRST(&band->zones);
+ }
+ break;
+ }
+ }
+ }
+
+ return result;
+}
+
+void
+ftl_band_clear_lba_map(struct ftl_band *band)
+{
+ struct ftl_lba_map *lba_map = &band->lba_map;
+ size_t num_segments;
+
+ spdk_bit_array_clear_mask(lba_map->vld);
+ memset(lba_map->map, 0, ftl_lba_map_num_blocks(band->dev) * FTL_BLOCK_SIZE);
+
+ /* For an open band all LBA map segments are already cached */
+ assert(band->state == FTL_BAND_STATE_PREP);
+ num_segments = spdk_divide_round_up(ftl_get_num_blocks_in_band(band->dev), FTL_NUM_LBA_IN_BLOCK);
+ ftl_lba_map_set_segment_state(&band->lba_map, 0, num_segments, FTL_LBA_MAP_SEG_CACHED);
+
+ lba_map->num_vld = 0;
+}
+
+size_t
+ftl_lba_map_pool_elem_size(struct spdk_ftl_dev *dev)
+{
+ /* Map pool element holds the whole tail md + segments map */
+ return ftl_tail_md_num_blocks(dev) * FTL_BLOCK_SIZE +
+ spdk_divide_round_up(ftl_get_num_blocks_in_band(dev), FTL_NUM_LBA_IN_BLOCK);
+}
diff --git a/src/spdk/lib/ftl/ftl_band.h b/src/spdk/lib/ftl/ftl_band.h
new file mode 100644
index 000000000..109b369a5
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_band.h
@@ -0,0 +1,287 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FTL_BAND_H
+#define FTL_BAND_H
+
+#include "spdk/stdinc.h"
+#include "spdk/bit_array.h"
+#include "spdk/queue.h"
+#include "spdk/bdev_zone.h"
+
+#include "ftl_io.h"
+#include "ftl_addr.h"
+#include "ftl_core.h"
+
+/* Number of LBAs that could be stored in a single block */
+#define FTL_NUM_LBA_IN_BLOCK (FTL_BLOCK_SIZE / sizeof(uint64_t))
+
+struct spdk_ftl_dev;
+struct ftl_lba_map_request;
+
+struct ftl_zone {
+ struct spdk_bdev_zone_info info;
+
+ /* Indicates that there is an inflight write */
+ bool busy;
+
+ CIRCLEQ_ENTRY(ftl_zone) circleq;
+};
+
+enum ftl_md_status {
+ FTL_MD_SUCCESS,
+ /* Metadata read failure */
+ FTL_MD_IO_FAILURE,
+ /* Invalid version */
+ FTL_MD_INVALID_VER,
+ /* UUID doesn't match */
+ FTL_MD_NO_MD,
+ /* UUID and version matches but CRC doesn't */
+ FTL_MD_INVALID_CRC,
+ /* Valid map or LBA map size doesn't match */
+ FTL_MD_INVALID_SIZE
+};
+
+enum ftl_lba_map_seg_state {
+ FTL_LBA_MAP_SEG_CLEAR,
+ FTL_LBA_MAP_SEG_PENDING,
+ FTL_LBA_MAP_SEG_CACHED
+};
+
+struct ftl_lba_map {
+ /* LBA/vld map lock */
+ pthread_spinlock_t lock;
+
+ /* Number of valid LBAs */
+ size_t num_vld;
+
+ /* LBA map's reference count */
+ size_t ref_cnt;
+
+ /* Bitmap of valid LBAs */
+ struct spdk_bit_array *vld;
+
+ /* LBA map (only valid for open/relocating bands) */
+ uint64_t *map;
+
+ /* LBA map segment state map (clear, pending, cached) */
+ uint8_t *segments;
+
+ LIST_HEAD(, ftl_lba_map_request) request_list;
+
+ /* Metadata DMA buffer (only valid for open/relocating bands) */
+ void *dma_buf;
+};
+
+enum ftl_band_state {
+ FTL_BAND_STATE_FREE,
+ FTL_BAND_STATE_PREP,
+ FTL_BAND_STATE_OPENING,
+ FTL_BAND_STATE_OPEN,
+ FTL_BAND_STATE_FULL,
+ FTL_BAND_STATE_CLOSING,
+ FTL_BAND_STATE_CLOSED,
+ FTL_BAND_STATE_MAX
+};
+
+struct ftl_lba_map_request {
+ /* Completion callback */
+ ftl_io_fn cb;
+
+ /* Completion callback context */
+ void *cb_ctx;
+
+ /* Bit array of requested segments */
+ struct spdk_bit_array *segments;
+
+ /* Number of pending segments to read */
+ size_t num_pending;
+
+ LIST_ENTRY(ftl_lba_map_request) list_entry;
+};
+
+struct ftl_band {
+ /* Device this band belongs to */
+ struct spdk_ftl_dev *dev;
+
+ /* Number of operational zones */
+ size_t num_zones;
+
+ /* Array of zones */
+ struct ftl_zone *zone_buf;
+
+ /* List of operational zones */
+ CIRCLEQ_HEAD(, ftl_zone) zones;
+
+ /* LBA map */
+ struct ftl_lba_map lba_map;
+
+ /* Band's state */
+ enum ftl_band_state state;
+
+ /* Band's index */
+ unsigned int id;
+
+ /* Latest merit calculation */
+ double merit;
+
+ /* High defrag priority - means that the metadata should be copied and */
+ /* the band should be defragged immediately */
+ int high_prio;
+
+ /* Sequence number */
+ uint64_t seq;
+
+ /* Number of defrag cycles */
+ uint64_t wr_cnt;
+
+ /* End metadata start addr */
+ struct ftl_addr tail_md_addr;
+
+ /* Bitmap of all bands that have their data moved onto this band */
+ struct spdk_bit_array *reloc_bitmap;
+ /* Number of open bands containing data moved from this band */
+ size_t num_reloc_bands;
+ /* Number of blocks currently being moved from this band */
+ size_t num_reloc_blocks;
+
+ /* Free/shut bands' lists */
+ LIST_ENTRY(ftl_band) list_entry;
+
+ /* High priority queue link */
+ STAILQ_ENTRY(ftl_band) prio_stailq;
+};
+
+uint64_t ftl_band_block_offset_from_addr(struct ftl_band *band, struct ftl_addr addr);
+struct ftl_addr ftl_band_addr_from_block_offset(struct ftl_band *band, uint64_t block_off);
+void ftl_band_set_state(struct ftl_band *band, enum ftl_band_state state);
+size_t ftl_band_age(const struct ftl_band *band);
+void ftl_band_acquire_lba_map(struct ftl_band *band);
+int ftl_band_alloc_lba_map(struct ftl_band *band);
+void ftl_band_clear_lba_map(struct ftl_band *band);
+void ftl_band_release_lba_map(struct ftl_band *band);
+int ftl_band_read_lba_map(struct ftl_band *band,
+ size_t offset, size_t lba_cnt,
+ ftl_io_fn cb_fn, void *cb_ctx);
+struct ftl_addr ftl_band_next_xfer_addr(struct ftl_band *band, struct ftl_addr addr,
+ size_t num_blocks);
+struct ftl_addr ftl_band_next_addr(struct ftl_band *band, struct ftl_addr addr,
+ size_t offset);
+size_t ftl_band_num_usable_blocks(const struct ftl_band *band);
+size_t ftl_band_user_blocks_left(const struct ftl_band *band, size_t offset);
+size_t ftl_band_user_blocks(const struct ftl_band *band);
+void ftl_band_set_addr(struct ftl_band *band, uint64_t lba,
+ struct ftl_addr addr);
+struct ftl_band *ftl_band_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr);
+struct ftl_zone *ftl_band_zone_from_addr(struct ftl_band *band, struct ftl_addr);
+void ftl_band_md_clear(struct ftl_band *band);
+int ftl_band_read_tail_md(struct ftl_band *band, struct ftl_addr,
+ ftl_io_fn cb_fn, void *cb_ctx);
+int ftl_band_read_head_md(struct ftl_band *band, ftl_io_fn cb_fn, void *cb_ctx);
+int ftl_band_write_tail_md(struct ftl_band *band, ftl_io_fn cb);
+int ftl_band_write_head_md(struct ftl_band *band, ftl_io_fn cb);
+struct ftl_addr ftl_band_tail_md_addr(struct ftl_band *band);
+struct ftl_addr ftl_band_head_md_addr(struct ftl_band *band);
+void ftl_band_write_failed(struct ftl_band *band);
+int ftl_band_full(struct ftl_band *band, size_t offset);
+int ftl_band_write_prep(struct ftl_band *band);
+struct ftl_zone *ftl_band_next_operational_zone(struct ftl_band *band,
+ struct ftl_zone *zone);
+size_t ftl_lba_map_pool_elem_size(struct spdk_ftl_dev *dev);
+void ftl_band_remove_zone(struct ftl_band *band, struct ftl_zone *zone);
+
+
+static inline int
+ftl_band_empty(const struct ftl_band *band)
+{
+ return band->lba_map.num_vld == 0;
+}
+
+static inline struct ftl_zone *
+ftl_band_next_zone(struct ftl_band *band, struct ftl_zone *zone)
+{
+ assert(zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE);
+ return CIRCLEQ_LOOP_NEXT(&band->zones, zone, circleq);
+}
+
+static inline void
+ftl_band_set_next_state(struct ftl_band *band)
+{
+ ftl_band_set_state(band, (band->state + 1) % FTL_BAND_STATE_MAX);
+}
+
+static inline int
+ftl_band_state_changing(struct ftl_band *band)
+{
+ return band->state == FTL_BAND_STATE_OPENING ||
+ band->state == FTL_BAND_STATE_CLOSING;
+}
+
+static inline int
+ftl_band_block_offset_valid(struct ftl_band *band, size_t block_off)
+{
+ struct ftl_lba_map *lba_map = &band->lba_map;
+
+ pthread_spin_lock(&lba_map->lock);
+ if (spdk_bit_array_get(lba_map->vld, block_off)) {
+ pthread_spin_unlock(&lba_map->lock);
+ return 1;
+ }
+
+ pthread_spin_unlock(&lba_map->lock);
+ return 0;
+}
+
+static inline int
+ftl_band_zone_is_last(struct ftl_band *band, struct ftl_zone *zone)
+{
+ return zone == CIRCLEQ_LAST(&band->zones);
+}
+
+static inline int
+ftl_band_zone_is_first(struct ftl_band *band, struct ftl_zone *zone)
+{
+ return zone == CIRCLEQ_FIRST(&band->zones);
+}
+
+static inline int
+ftl_zone_is_writable(const struct spdk_ftl_dev *dev, const struct ftl_zone *zone)
+{
+ bool busy = ftl_is_append_supported(dev) ? false : zone->busy;
+
+ return (zone->info.state == SPDK_BDEV_ZONE_STATE_OPEN ||
+ zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) &&
+ !busy;
+}
+
+#endif /* FTL_BAND_H */
diff --git a/src/spdk/lib/ftl/ftl_core.c b/src/spdk/lib/ftl/ftl_core.c
new file mode 100644
index 000000000..b0b448806
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_core.c
@@ -0,0 +1,2460 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/likely.h"
+#include "spdk/stdinc.h"
+#include "spdk/nvme.h"
+#include "spdk/thread.h"
+#include "spdk/bdev_module.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+#include "spdk/ftl.h"
+#include "spdk/crc32.h"
+
+#include "ftl_core.h"
+#include "ftl_band.h"
+#include "ftl_io.h"
+#include "ftl_debug.h"
+#include "ftl_reloc.h"
+
+struct ftl_band_flush {
+ struct spdk_ftl_dev *dev;
+ /* Number of bands left to be flushed */
+ size_t num_bands;
+ /* User callback */
+ spdk_ftl_fn cb_fn;
+ /* Callback's argument */
+ void *cb_arg;
+ /* List link */
+ LIST_ENTRY(ftl_band_flush) list_entry;
+};
+
+struct ftl_wptr {
+ /* Owner device */
+ struct spdk_ftl_dev *dev;
+
+ /* Current address */
+ struct ftl_addr addr;
+
+ /* Band currently being written to */
+ struct ftl_band *band;
+
+ /* Current logical block's offset */
+ uint64_t offset;
+
+ /* Current zone */
+ struct ftl_zone *zone;
+
+ /* Pending IO queue */
+ TAILQ_HEAD(, ftl_io) pending_queue;
+
+ /* List link */
+ LIST_ENTRY(ftl_wptr) list_entry;
+
+ /*
+ * If set up in direct mode, there will be no offset or band state update after IO.
+ * The zoned bdev address is not assigned by wptr, and is instead taken directly
+ * from the request.
+ */
+ bool direct_mode;
+
+ /* Number of outstanding write requests */
+ uint32_t num_outstanding;
+
+ /* Marks that the band related to this wptr needs to be closed as soon as possible */
+ bool flush;
+};
+
+struct ftl_flush {
+ /* Owner device */
+ struct spdk_ftl_dev *dev;
+
+ /* Number of batches to wait for */
+ size_t num_req;
+
+ /* Callback */
+ struct {
+ spdk_ftl_fn fn;
+ void *ctx;
+ } cb;
+
+ /* Batch bitmap */
+ struct spdk_bit_array *bmap;
+
+ /* List link */
+ LIST_ENTRY(ftl_flush) list_entry;
+};
+
+static void
+ftl_wptr_free(struct ftl_wptr *wptr)
+{
+ if (!wptr) {
+ return;
+ }
+
+ free(wptr);
+}
+
+static void
+ftl_remove_wptr(struct ftl_wptr *wptr)
+{
+ struct spdk_ftl_dev *dev = wptr->dev;
+ struct ftl_band_flush *flush, *tmp;
+
+ if (spdk_unlikely(wptr->flush)) {
+ LIST_FOREACH_SAFE(flush, &dev->band_flush_list, list_entry, tmp) {
+ assert(flush->num_bands > 0);
+ if (--flush->num_bands == 0) {
+ flush->cb_fn(flush->cb_arg, 0);
+ LIST_REMOVE(flush, list_entry);
+ free(flush);
+ }
+ }
+ }
+
+ LIST_REMOVE(wptr, list_entry);
+ ftl_wptr_free(wptr);
+}
+
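+/*
+ * Take a free write buffer entry from the IO channel, enforcing the per-channel
+ * queue depth limit for user (non-internal) IO.
+ */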
+static struct ftl_wbuf_entry *
+ftl_acquire_wbuf_entry(struct ftl_io_channel *io_channel, int io_flags)
+{
+ struct ftl_wbuf_entry *entry = NULL;
+ uint32_t qdepth;
+
+ if (!(io_flags & FTL_IO_INTERNAL)) {
+ qdepth = __atomic_fetch_add(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST);
+ if (qdepth >= io_channel->qdepth_limit) {
+ __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST);
+ return NULL;
+ }
+ }
+
+ if (spdk_ring_dequeue(io_channel->free_queue, (void **)&entry, 1) != 1) {
+ if (!(io_flags & FTL_IO_INTERNAL)) {
+ __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST);
+ }
+
+ return NULL;
+ }
+
+ assert(entry != NULL);
+
+ ftl_evict_cache_entry(io_channel->dev, entry);
+
+ entry->io_flags = io_flags;
+ entry->addr.offset = FTL_ADDR_INVALID;
+ entry->lba = FTL_LBA_INVALID;
+ entry->band = NULL;
+ entry->valid = false;
+
+ return entry;
+}
+
+static void
+ftl_release_wbuf_entry(struct ftl_wbuf_entry *entry)
+{
+ struct ftl_io_channel *io_channel = entry->ioch;
+
+ if (!(entry->io_flags & FTL_IO_INTERNAL)) {
+ __atomic_fetch_sub(&io_channel->qdepth_current, 1, __ATOMIC_SEQ_CST);
+ }
+
+ spdk_ring_enqueue(io_channel->free_queue, (void **)&entry, 1, NULL);
+}
+
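+/*
+ * Fill the current batch with entries dequeued from the IO channels' submit
+ * queues (the channel list is rotated to keep the selection fair).  A batch is
+ * returned only once it holds xfer_size entries or a pending batch is available;
+ * otherwise it is kept as dev->current_batch and NULL is returned.
+ */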
+static struct ftl_batch *
+ftl_get_next_batch(struct spdk_ftl_dev *dev)
+{
+ struct ftl_batch *batch = dev->current_batch;
+ struct ftl_io_channel *ioch;
+#define FTL_DEQUEUE_ENTRIES 128
+ struct ftl_wbuf_entry *entries[FTL_DEQUEUE_ENTRIES];
+ TAILQ_HEAD(, ftl_io_channel) ioch_queue;
+ size_t i, num_dequeued, num_remaining;
+ uint64_t *metadata;
+
+ if (batch == NULL) {
+ batch = TAILQ_FIRST(&dev->pending_batches);
+ if (batch != NULL) {
+ TAILQ_REMOVE(&dev->pending_batches, batch, tailq);
+ return batch;
+ }
+
+ batch = TAILQ_FIRST(&dev->free_batches);
+ if (spdk_unlikely(batch == NULL)) {
+ return NULL;
+ }
+
+ assert(TAILQ_EMPTY(&batch->entries));
+ assert(batch->num_entries == 0);
+ TAILQ_REMOVE(&dev->free_batches, batch, tailq);
+ }
+
+ /*
+ * Keep shifting the queue to ensure fairness in IO channel selection. Each time
+ * ftl_get_next_batch() is called, we're starting to dequeue write buffer entries from a
+ * different IO channel.
+ */
+ TAILQ_INIT(&ioch_queue);
+ while (!TAILQ_EMPTY(&dev->ioch_queue)) {
+ ioch = TAILQ_FIRST(&dev->ioch_queue);
+ TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq);
+ TAILQ_INSERT_TAIL(&ioch_queue, ioch, tailq);
+
+ num_remaining = dev->xfer_size - batch->num_entries;
+ while (num_remaining > 0) {
+ num_dequeued = spdk_ring_dequeue(ioch->submit_queue, (void **)entries,
+ spdk_min(num_remaining,
+ FTL_DEQUEUE_ENTRIES));
+ if (num_dequeued == 0) {
+ break;
+ }
+
+ for (i = 0; i < num_dequeued; ++i) {
+ batch->iov[batch->num_entries + i].iov_base = entries[i]->payload;
+ batch->iov[batch->num_entries + i].iov_len = FTL_BLOCK_SIZE;
+
+ if (batch->metadata != NULL) {
+ metadata = (uint64_t *)((char *)batch->metadata +
+ (batch->num_entries + i) * dev->md_size);
+ *metadata = entries[i]->lba;
+ }
+
+ TAILQ_INSERT_TAIL(&batch->entries, entries[i], tailq);
+ }
+
+ batch->num_entries += num_dequeued;
+ num_remaining -= num_dequeued;
+ }
+
+ if (num_remaining == 0) {
+ break;
+ }
+ }
+
+ TAILQ_CONCAT(&dev->ioch_queue, &ioch_queue, tailq);
+
+ if (batch->num_entries == dev->xfer_size) {
+ dev->current_batch = NULL;
+ } else {
+ dev->current_batch = batch;
+ batch = NULL;
+ }
+
+ return batch;
+}
+
+static void
+ftl_release_batch(struct spdk_ftl_dev *dev, struct ftl_batch *batch)
+{
+ struct ftl_wbuf_entry *entry;
+
+ while (!TAILQ_EMPTY(&batch->entries)) {
+ entry = TAILQ_FIRST(&batch->entries);
+ TAILQ_REMOVE(&batch->entries, entry, tailq);
+ ftl_release_wbuf_entry(entry);
+ }
+
+ batch->num_entries = 0;
+ TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq);
+}
+
+static struct ftl_wbuf_entry *
+ftl_get_entry_from_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ struct ftl_io_channel *ioch;
+ uint64_t ioch_offset, entry_offset;
+
+ ioch_offset = addr.cache_offset & ((1 << dev->ioch_shift) - 1);
+ entry_offset = addr.cache_offset >> dev->ioch_shift;
+ ioch = dev->ioch_array[ioch_offset];
+
+ assert(ioch_offset < dev->conf.max_io_channels);
+ assert(entry_offset < ioch->num_entries);
+ assert(addr.cached == 1);
+
+ return &ioch->wbuf_entries[entry_offset];
+}
+
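+/*
+ * Encode a write buffer entry's location (IO channel index and entry index) as
+ * a cached address; the inverse of ftl_get_entry_from_addr().
+ */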
+static struct ftl_addr
+ftl_get_addr_from_entry(struct ftl_wbuf_entry *entry)
+{
+ struct ftl_io_channel *ioch = entry->ioch;
+ struct ftl_addr addr = {};
+
+ addr.cached = 1;
+ addr.cache_offset = (uint64_t)entry->index << ioch->dev->ioch_shift | ioch->index;
+
+ return addr;
+}
+
+static void
+ftl_io_cmpl_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_io *io = cb_arg;
+ struct spdk_ftl_dev *dev = io->dev;
+
+ if (spdk_unlikely(!success)) {
+ io->status = -EIO;
+ }
+
+ ftl_trace_completion(dev, io, FTL_TRACE_COMPLETION_DISK);
+
+ if (io->type == FTL_IO_WRITE && ftl_is_append_supported(dev)) {
+ assert(io->parent);
+ io->parent->addr.offset = spdk_bdev_io_get_append_location(bdev_io);
+ }
+
+ ftl_io_dec_req(io);
+ if (ftl_io_done(io)) {
+ ftl_io_complete(io);
+ }
+
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band)
+{
+ struct ftl_wptr *wptr = NULL;
+
+ LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
+ if (wptr->band == band) {
+ break;
+ }
+ }
+
+ /* If the band already has the high_prio flag set, other writes must */
+ /* have failed earlier, so it's already taken care of. */
+ if (band->high_prio) {
+ assert(wptr == NULL);
+ return;
+ }
+
+ ftl_band_write_failed(band);
+ ftl_remove_wptr(wptr);
+}
+
+static struct ftl_wptr *
+ftl_wptr_from_band(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_wptr *wptr = NULL;
+
+ LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
+ if (wptr->band == band) {
+ return wptr;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+ftl_md_write_fail(struct ftl_io *io, int status)
+{
+ struct ftl_band *band = io->band;
+ struct ftl_wptr *wptr;
+ char buf[128];
+
+ wptr = ftl_wptr_from_band(band);
+ assert(wptr);
+
+ SPDK_ERRLOG("Metadata write failed @addr: %s, status: %d\n",
+ ftl_addr2str(wptr->addr, buf, sizeof(buf)), status);
+
+ ftl_halt_writes(io->dev, band);
+}
+
+static void
+ftl_md_write_cb(struct ftl_io *io, void *arg, int status)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_nv_cache *nv_cache = &dev->nv_cache;
+ struct ftl_band *band = io->band;
+ struct ftl_wptr *wptr;
+ size_t id;
+
+ wptr = ftl_wptr_from_band(band);
+ assert(wptr);
+
+ if (status) {
+ ftl_md_write_fail(io, status);
+ return;
+ }
+
+ ftl_band_set_next_state(band);
+ if (band->state == FTL_BAND_STATE_CLOSED) {
+ if (ftl_dev_has_nv_cache(dev)) {
+ pthread_spin_lock(&nv_cache->lock);
+ nv_cache->num_available += ftl_band_user_blocks(band);
+
+ if (spdk_unlikely(nv_cache->num_available > nv_cache->num_data_blocks)) {
+ nv_cache->num_available = nv_cache->num_data_blocks;
+ }
+ pthread_spin_unlock(&nv_cache->lock);
+ }
+
+ /*
+ * Go through the reloc_bitmap, checking for all the bands that had their data moved
+ * onto the current band, and update their counters to allow them to be used for writing
+ * (once they're closed and empty).
+ */
+ for (id = 0; id < ftl_get_num_bands(dev); ++id) {
+ if (spdk_bit_array_get(band->reloc_bitmap, id)) {
+ assert(dev->bands[id].num_reloc_bands > 0);
+ dev->bands[id].num_reloc_bands--;
+
+ spdk_bit_array_clear(band->reloc_bitmap, id);
+ }
+ }
+
+ ftl_remove_wptr(wptr);
+ }
+}
+
+static int
+ftl_read_next_physical_addr(struct ftl_io *io, struct ftl_addr *addr)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ size_t num_blocks, max_blocks;
+
+ assert(ftl_io_mode_physical(io));
+ assert(io->iov_pos < io->iov_cnt);
+
+ if (io->pos == 0) {
+ *addr = io->addr;
+ } else {
+ *addr = ftl_band_next_xfer_addr(io->band, io->addr, io->pos);
+ }
+
+ assert(!ftl_addr_invalid(*addr));
+
+ /* Metadata has to be read in the way it's written (jumping across */
+ /* the zones in xfer_size increments) */
+ if (io->flags & FTL_IO_MD) {
+ max_blocks = dev->xfer_size - (addr->offset % dev->xfer_size);
+ num_blocks = spdk_min(ftl_io_iovec_len_left(io), max_blocks);
+ assert(addr->offset / dev->xfer_size ==
+ (addr->offset + num_blocks - 1) / dev->xfer_size);
+ } else {
+ num_blocks = ftl_io_iovec_len_left(io);
+ }
+
+ return num_blocks;
+}
+
+static int
+ftl_wptr_close_band(struct ftl_wptr *wptr)
+{
+ struct ftl_band *band = wptr->band;
+
+ ftl_band_set_state(band, FTL_BAND_STATE_CLOSING);
+
+ return ftl_band_write_tail_md(band, ftl_md_write_cb);
+}
+
+static int
+ftl_wptr_open_band(struct ftl_wptr *wptr)
+{
+ struct ftl_band *band = wptr->band;
+
+ assert(ftl_band_zone_is_first(band, wptr->zone));
+ assert(band->lba_map.num_vld == 0);
+
+ ftl_band_clear_lba_map(band);
+
+ assert(band->state == FTL_BAND_STATE_PREP);
+ ftl_band_set_state(band, FTL_BAND_STATE_OPENING);
+
+ return ftl_band_write_head_md(band, ftl_md_write_cb);
+}
+
+static int
+ftl_submit_erase(struct ftl_io *io)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_band *band = io->band;
+ struct ftl_addr addr = io->addr;
+ struct ftl_io_channel *ioch;
+ struct ftl_zone *zone;
+ int rc = 0;
+ size_t i;
+
+ ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev));
+
+ for (i = 0; i < io->num_blocks; ++i) {
+ if (i != 0) {
+ zone = ftl_band_next_zone(band, ftl_band_zone_from_addr(band, addr));
+ assert(zone->info.state == SPDK_BDEV_ZONE_STATE_FULL);
+ addr.offset = zone->info.zone_id;
+ }
+
+ assert(ftl_addr_get_zone_offset(dev, addr) == 0);
+
+ ftl_trace_submission(dev, io, addr, 1);
+ rc = spdk_bdev_zone_management(dev->base_bdev_desc, ioch->base_ioch, addr.offset,
+ SPDK_BDEV_ZONE_RESET, ftl_io_cmpl_cb, io);
+ if (spdk_unlikely(rc)) {
+ ftl_io_fail(io, rc);
+ SPDK_ERRLOG("Vector reset failed with status: %d\n", rc);
+ break;
+ }
+
+ ftl_io_inc_req(io);
+ ftl_io_advance(io, 1);
+ }
+
+ if (ftl_io_done(io)) {
+ ftl_io_complete(io);
+ }
+
+ return rc;
+}
+
+static bool
+ftl_check_core_thread(const struct spdk_ftl_dev *dev)
+{
+ return dev->core_thread == spdk_get_thread();
+}
+
+struct spdk_io_channel *
+ftl_get_io_channel(const struct spdk_ftl_dev *dev)
+{
+ if (ftl_check_core_thread(dev)) {
+ return dev->ioch;
+ }
+
+ return NULL;
+}
+
+static void
+ftl_erase_fail(struct ftl_io *io, int status)
+{
+ struct ftl_zone *zone;
+ struct ftl_band *band = io->band;
+ char buf[128];
+
+ SPDK_ERRLOG("Erase failed at address: %s, status: %d\n",
+ ftl_addr2str(io->addr, buf, sizeof(buf)), status);
+
+ zone = ftl_band_zone_from_addr(band, io->addr);
+ zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE;
+ ftl_band_remove_zone(band, zone);
+ band->tail_md_addr = ftl_band_tail_md_addr(band);
+}
+
+static void
+ftl_zone_erase_cb(struct ftl_io *io, void *ctx, int status)
+{
+ struct ftl_zone *zone;
+
+ zone = ftl_band_zone_from_addr(io->band, io->addr);
+ zone->busy = false;
+
+ if (spdk_unlikely(status)) {
+ ftl_erase_fail(io, status);
+ return;
+ }
+
+ zone->info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
+ zone->info.write_pointer = zone->info.zone_id;
+}
+
+static int
+ftl_band_erase(struct ftl_band *band)
+{
+ struct ftl_zone *zone;
+ struct ftl_io *io;
+ int rc = 0;
+
+ assert(band->state == FTL_BAND_STATE_CLOSED ||
+ band->state == FTL_BAND_STATE_FREE);
+
+ ftl_band_set_state(band, FTL_BAND_STATE_PREP);
+
+ CIRCLEQ_FOREACH(zone, &band->zones, circleq) {
+ if (zone->info.state == SPDK_BDEV_ZONE_STATE_EMPTY) {
+ continue;
+ }
+
+ io = ftl_io_erase_init(band, 1, ftl_zone_erase_cb);
+ if (!io) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ zone->busy = true;
+ io->addr.offset = zone->info.zone_id;
+ rc = ftl_submit_erase(io);
+ if (rc) {
+ zone->busy = false;
+ assert(0);
+ /* TODO: change band's state back to close? */
+ break;
+ }
+ }
+
+ return rc;
+}
+
+static struct ftl_band *
+ftl_next_write_band(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band;
+
+ /* Find a free band that has all of its data moved onto other closed bands */
+ LIST_FOREACH(band, &dev->free_bands, list_entry) {
+ assert(band->state == FTL_BAND_STATE_FREE);
+ if (band->num_reloc_bands == 0 && band->num_reloc_blocks == 0) {
+ break;
+ }
+ }
+
+ if (spdk_unlikely(!band)) {
+ return NULL;
+ }
+
+ if (ftl_band_erase(band)) {
+ /* TODO: handle erase failure */
+ return NULL;
+ }
+
+ return band;
+}
+
+static struct ftl_band *
+ftl_next_wptr_band(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band;
+
+ if (!dev->next_band) {
+ band = ftl_next_write_band(dev);
+ } else {
+ assert(dev->next_band->state == FTL_BAND_STATE_PREP);
+ band = dev->next_band;
+ dev->next_band = NULL;
+ }
+
+ return band;
+}
+
+static struct ftl_wptr *
+ftl_wptr_init(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_wptr *wptr;
+
+ wptr = calloc(1, sizeof(*wptr));
+ if (!wptr) {
+ return NULL;
+ }
+
+ wptr->dev = dev;
+ wptr->band = band;
+ wptr->zone = CIRCLEQ_FIRST(&band->zones);
+ wptr->addr.offset = wptr->zone->info.zone_id;
+ TAILQ_INIT(&wptr->pending_queue);
+
+ return wptr;
+}
+
+static int
+ftl_add_direct_wptr(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_wptr *wptr;
+
+ assert(band->state == FTL_BAND_STATE_OPEN);
+
+ wptr = ftl_wptr_init(band);
+ if (!wptr) {
+ return -1;
+ }
+
+ wptr->direct_mode = true;
+
+ if (ftl_band_alloc_lba_map(band)) {
+ ftl_wptr_free(wptr);
+ return -1;
+ }
+
+ LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry);
+
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: direct band %u\n", band->id);
+ ftl_trace_write_band(dev, band);
+ return 0;
+}
+
+static void
+ftl_close_direct_wptr(struct ftl_band *band)
+{
+ struct ftl_wptr *wptr = ftl_wptr_from_band(band);
+
+ assert(wptr);
+ assert(wptr->direct_mode);
+ assert(band->state == FTL_BAND_STATE_CLOSED);
+
+ ftl_band_release_lba_map(band);
+
+ ftl_remove_wptr(wptr);
+}
+
+int
+ftl_band_set_direct_access(struct ftl_band *band, bool access)
+{
+ if (access) {
+ return ftl_add_direct_wptr(band);
+ } else {
+ ftl_close_direct_wptr(band);
+ return 0;
+ }
+}
+
+static int
+ftl_add_wptr(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band;
+ struct ftl_wptr *wptr;
+
+ band = ftl_next_wptr_band(dev);
+ if (!band) {
+ return -1;
+ }
+
+ wptr = ftl_wptr_init(band);
+ if (!wptr) {
+ return -1;
+ }
+
+ if (ftl_band_write_prep(band)) {
+ ftl_wptr_free(wptr);
+ return -1;
+ }
+
+ LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry);
+
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id);
+ ftl_trace_write_band(dev, band);
+ return 0;
+}
+
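+/*
+ * Advance the write pointer by xfer_size blocks: mark the band full once its
+ * usable blocks are exhausted, move to the next operational zone, and prepare
+ * the next band when the configured band threshold is crossed.
+ */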
+static void
+ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size)
+{
+ struct ftl_band *band = wptr->band;
+ struct spdk_ftl_dev *dev = wptr->dev;
+ struct spdk_ftl_conf *conf = &dev->conf;
+ size_t next_thld;
+
+ if (spdk_unlikely(wptr->direct_mode)) {
+ return;
+ }
+
+ wptr->offset += xfer_size;
+ next_thld = (ftl_band_num_usable_blocks(band) * conf->band_thld) / 100;
+
+ if (ftl_band_full(band, wptr->offset)) {
+ ftl_band_set_state(band, FTL_BAND_STATE_FULL);
+ }
+
+ wptr->zone->busy = true;
+ wptr->addr = ftl_band_next_xfer_addr(band, wptr->addr, xfer_size);
+ wptr->zone = ftl_band_next_operational_zone(band, wptr->zone);
+
+ assert(!ftl_addr_invalid(wptr->addr));
+
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: pu:%lu band:%lu, offset:%lu\n",
+ ftl_addr_get_punit(dev, wptr->addr),
+ ftl_addr_get_band(dev, wptr->addr),
+ wptr->addr.offset);
+
+ if (wptr->offset >= next_thld && !dev->next_band) {
+ dev->next_band = ftl_next_write_band(dev);
+ }
+}
+
+static size_t
+ftl_wptr_user_blocks_left(const struct ftl_wptr *wptr)
+{
+ return ftl_band_user_blocks_left(wptr->band, wptr->offset);
+}
+
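+/*
+ * Check whether the write pointer can accept data: advance past offline zones,
+ * wait for band state changes to finish, and close or open the band as needed.
+ */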
+static bool
+ftl_wptr_ready(struct ftl_wptr *wptr)
+{
+ struct ftl_band *band = wptr->band;
+
+ /* TODO: add handling of empty bands */
+
+ if (spdk_unlikely(!ftl_zone_is_writable(wptr->dev, wptr->zone))) {
+ /* Erasing band may fail after it was assigned to wptr. */
+ if (spdk_unlikely(wptr->zone->info.state == SPDK_BDEV_ZONE_STATE_OFFLINE)) {
+ ftl_wptr_advance(wptr, wptr->dev->xfer_size);
+ }
+ return false;
+ }
+
+ /* If we're in the process of writing metadata, wait till it is */
+ /* completed. */
+ /* TODO: we should probably change bands once we're writing tail md */
+ if (ftl_band_state_changing(band)) {
+ return false;
+ }
+
+ if (band->state == FTL_BAND_STATE_FULL) {
+ if (wptr->num_outstanding == 0) {
+ if (ftl_wptr_close_band(wptr)) {
+ /* TODO: need recovery here */
+ assert(false);
+ }
+ }
+
+ return false;
+ }
+
+ if (band->state != FTL_BAND_STATE_OPEN) {
+ if (ftl_wptr_open_band(wptr)) {
+ /* TODO: need recovery here */
+ assert(false);
+ }
+
+ return false;
+ }
+
+ return true;
+}
+
+int
+ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
+{
+ struct ftl_wptr *wptr;
+ struct ftl_band_flush *flush;
+
+ assert(ftl_get_core_thread(dev) == spdk_get_thread());
+
+ flush = calloc(1, sizeof(*flush));
+ if (spdk_unlikely(!flush)) {
+ return -ENOMEM;
+ }
+
+ LIST_INSERT_HEAD(&dev->band_flush_list, flush, list_entry);
+
+ flush->cb_fn = cb_fn;
+ flush->cb_arg = cb_arg;
+ flush->dev = dev;
+
+ LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
+ wptr->flush = true;
+ flush->num_bands++;
+ }
+
+ return 0;
+}
+
+static const struct spdk_ftl_limit *
+ftl_get_limit(const struct spdk_ftl_dev *dev, int type)
+{
+ assert(type < SPDK_FTL_LIMIT_MAX);
+ return &dev->conf.limits[type];
+}
+
+static bool
+ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry)
+{
+ struct ftl_addr addr;
+
+ /* If the LBA is invalid don't bother checking the md and l2p */
+ if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) {
+ return false;
+ }
+
+ addr = ftl_l2p_get(dev, entry->lba);
+ if (!(ftl_addr_cached(addr) && entry == ftl_get_entry_from_addr(dev, addr))) {
+ return false;
+ }
+
+ return true;
+}
+
+void
+ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry)
+{
+ pthread_spin_lock(&entry->lock);
+
+ if (!entry->valid) {
+ goto unlock;
+ }
+
+ /* If the l2p wasn't updated and still points at the entry, fill it with the */
+ /* on-disk address and clear the cache status bit. Otherwise, skip the l2p update */
+ /* and just clear the cache status. */
+ if (!ftl_cache_lba_valid(dev, entry)) {
+ goto clear;
+ }
+
+ ftl_l2p_set(dev, entry->lba, entry->addr);
+clear:
+ entry->valid = false;
+unlock:
+ pthread_spin_unlock(&entry->lock);
+}
+
+static void
+ftl_pad_wbuf(struct spdk_ftl_dev *dev, size_t size)
+{
+ struct ftl_wbuf_entry *entry;
+ struct ftl_io_channel *ioch;
+ int flags = FTL_IO_PAD | FTL_IO_INTERNAL;
+
+ ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev));
+
+ for (size_t i = 0; i < size; ++i) {
+ entry = ftl_acquire_wbuf_entry(ioch, flags);
+ if (!entry) {
+ break;
+ }
+
+ entry->lba = FTL_LBA_INVALID;
+ entry->addr = ftl_to_addr(FTL_ADDR_INVALID);
+ memset(entry->payload, 0, FTL_BLOCK_SIZE);
+
+ spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL);
+ }
+}
+
+static void
+ftl_remove_free_bands(struct spdk_ftl_dev *dev)
+{
+ while (!LIST_EMPTY(&dev->free_bands)) {
+ LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry);
+ }
+
+ dev->next_band = NULL;
+}
+
+static void
+ftl_wptr_pad_band(struct ftl_wptr *wptr)
+{
+ struct spdk_ftl_dev *dev = wptr->dev;
+ struct ftl_batch *batch = dev->current_batch;
+ struct ftl_io_channel *ioch;
+ size_t size, pad_size, blocks_left;
+
+ size = batch != NULL ? batch->num_entries : 0;
+ TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) {
+ size += spdk_ring_count(ioch->submit_queue);
+ }
+
+ ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev));
+
+ blocks_left = ftl_wptr_user_blocks_left(wptr);
+ assert(size <= blocks_left);
+ assert(blocks_left % dev->xfer_size == 0);
+ pad_size = spdk_min(blocks_left - size, spdk_ring_count(ioch->free_queue));
+
+ ftl_pad_wbuf(dev, pad_size);
+}
+
+static void
+ftl_wptr_process_shutdown(struct ftl_wptr *wptr)
+{
+ struct spdk_ftl_dev *dev = wptr->dev;
+ struct ftl_batch *batch = dev->current_batch;
+ struct ftl_io_channel *ioch;
+ size_t size;
+
+ size = batch != NULL ? batch->num_entries : 0;
+ TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) {
+ size += spdk_ring_count(ioch->submit_queue);
+ }
+
+ if (size >= dev->xfer_size) {
+ return;
+ }
+
+ /* If we reach this point we need to remove free bands */
+ /* and pad current wptr band to the end */
+ ftl_remove_free_bands(dev);
+ ftl_wptr_pad_band(wptr);
+}
+
+static int
+ftl_shutdown_complete(struct spdk_ftl_dev *dev)
+{
+ struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(dev->ioch);
+
+ return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) &&
+ dev->num_io_channels == 1 && LIST_EMPTY(&dev->wptr_list) &&
+ TAILQ_EMPTY(&ioch->retry_queue);
+}
+
+void
+ftl_apply_limits(struct spdk_ftl_dev *dev)
+{
+ const struct spdk_ftl_limit *limit;
+ struct ftl_io_channel *ioch;
+ struct ftl_stats *stats = &dev->stats;
+ uint32_t qdepth_limit = 100;
+ int i;
+
+ /* Clear existing limit */
+ dev->limit = SPDK_FTL_LIMIT_MAX;
+
+ for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) {
+ limit = ftl_get_limit(dev, i);
+
+ if (dev->num_free <= limit->thld) {
+ qdepth_limit = limit->limit;
+ stats->limits[i]++;
+ dev->limit = i;
+ break;
+ }
+ }
+
+ ftl_trace_limits(dev, dev->limit, dev->num_free);
+ TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) {
+ __atomic_store_n(&ioch->qdepth_limit, (qdepth_limit * ioch->num_entries) / 100,
+ __ATOMIC_SEQ_CST);
+ }
+}
+
+static int
+ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ struct ftl_band *band = ftl_band_from_addr(dev, addr);
+ struct ftl_lba_map *lba_map = &band->lba_map;
+ uint64_t offset;
+
+ offset = ftl_band_block_offset_from_addr(band, addr);
+
+ /* The bit might be already cleared if two writes are scheduled to the */
+ /* same LBA at the same time */
+ if (spdk_bit_array_get(lba_map->vld, offset)) {
+ assert(lba_map->num_vld > 0);
+ spdk_bit_array_clear(lba_map->vld, offset);
+ lba_map->num_vld--;
+ return 1;
+ }
+
+ return 0;
+}
+
+int
+ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ struct ftl_band *band;
+ int rc;
+
+ assert(!ftl_addr_cached(addr));
+ band = ftl_band_from_addr(dev, addr);
+
+ pthread_spin_lock(&band->lba_map.lock);
+ rc = ftl_invalidate_addr_unlocked(dev, addr);
+ pthread_spin_unlock(&band->lba_map.lock);
+
+ return rc;
+}
+
+static int
+ftl_read_retry(int rc)
+{
+ return rc == -EAGAIN;
+}
+
+static int
+ftl_read_canceled(int rc)
+{
+ return rc == -EFAULT || rc == 0;
+}
+
+static int
+ftl_cache_read(struct ftl_io *io, uint64_t lba,
+ struct ftl_addr addr, void *buf)
+{
+ struct ftl_wbuf_entry *entry;
+ struct ftl_addr naddr;
+ int rc = 0;
+
+ entry = ftl_get_entry_from_addr(io->dev, addr);
+ pthread_spin_lock(&entry->lock);
+
+ naddr = ftl_l2p_get(io->dev, lba);
+ if (addr.offset != naddr.offset) {
+ rc = -1;
+ goto out;
+ }
+
+ memcpy(buf, entry->payload, FTL_BLOCK_SIZE);
+out:
+ pthread_spin_unlock(&entry->lock);
+ return rc;
+}
+
+static int
+ftl_read_next_logical_addr(struct ftl_io *io, struct ftl_addr *addr)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_addr next_addr;
+ size_t i;
+
+ *addr = ftl_l2p_get(dev, ftl_io_current_lba(io));
+
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read addr:%lx, lba:%lu\n",
+ addr->offset, ftl_io_current_lba(io));
+
+ /* If the address is invalid, skip it (the buffer should already be zero'ed) */
+ if (ftl_addr_invalid(*addr)) {
+ return -EFAULT;
+ }
+
+ if (ftl_addr_cached(*addr)) {
+ if (!ftl_cache_read(io, ftl_io_current_lba(io), *addr, ftl_io_iovec_addr(io))) {
+ return 0;
+ }
+
+ /* If the state changed, we have to re-read the l2p */
+ return -EAGAIN;
+ }
+
+ for (i = 1; i < ftl_io_iovec_len_left(io); ++i) {
+ next_addr = ftl_l2p_get(dev, ftl_io_get_lba(io, io->pos + i));
+
+ if (ftl_addr_invalid(next_addr) || ftl_addr_cached(next_addr)) {
+ break;
+ }
+
+ if (addr->offset + i != next_addr.offset) {
+ break;
+ }
+ }
+
+ return i;
+}
+
+static int
+ftl_submit_read(struct ftl_io *io)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_io_channel *ioch;
+ struct ftl_addr addr;
+ int rc = 0, num_blocks;
+
+ ioch = ftl_io_channel_get_ctx(io->ioch);
+
+ assert(LIST_EMPTY(&io->children));
+
+ while (io->pos < io->num_blocks) {
+ if (ftl_io_mode_physical(io)) {
+ num_blocks = rc = ftl_read_next_physical_addr(io, &addr);
+ } else {
+ num_blocks = rc = ftl_read_next_logical_addr(io, &addr);
+ }
+
+ /* We might need to retry the read from scratch (e.g. */
+ /* because a write was under way and completed before */
+ /* we could read it from the write buffer) */
+ if (ftl_read_retry(rc)) {
+ continue;
+ }
+
+ /* We don't have to schedule the read, as it was read from cache */
+ if (ftl_read_canceled(rc)) {
+ ftl_io_advance(io, 1);
+ ftl_trace_completion(io->dev, io, rc ? FTL_TRACE_COMPLETION_INVALID :
+ FTL_TRACE_COMPLETION_CACHE);
+ rc = 0;
+ continue;
+ }
+
+ assert(num_blocks > 0);
+
+ ftl_trace_submission(dev, io, addr, num_blocks);
+ rc = spdk_bdev_read_blocks(dev->base_bdev_desc, ioch->base_ioch,
+ ftl_io_iovec_addr(io),
+ addr.offset,
+ num_blocks, ftl_io_cmpl_cb, io);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry);
+ rc = 0;
+ } else {
+ ftl_io_fail(io, rc);
+ }
+ break;
+ }
+
+ ftl_io_inc_req(io);
+ ftl_io_advance(io, num_blocks);
+ }
+
+ /* If we didn't have to read anything from the device, */
+ /* complete the request right away */
+ if (ftl_io_done(io)) {
+ ftl_io_complete(io);
+ }
+
+ return rc;
+}
+
+static void
+ftl_complete_flush(struct ftl_flush *flush)
+{
+ assert(flush->num_req == 0);
+ LIST_REMOVE(flush, list_entry);
+
+ flush->cb.fn(flush->cb.ctx, 0);
+
+ spdk_bit_array_free(&flush->bmap);
+ free(flush);
+}
+
+static void
+ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_batch *batch)
+{
+ struct ftl_flush *flush, *tflush;
+ size_t offset;
+
+ LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) {
+ offset = batch->index;
+
+ if (spdk_bit_array_get(flush->bmap, offset)) {
+ spdk_bit_array_clear(flush->bmap, offset);
+ if (!(--flush->num_req)) {
+ ftl_complete_flush(flush);
+ }
+ }
+ }
+}
+
+static void
+ftl_nv_cache_wrap_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_nv_cache *nv_cache = cb_arg;
+
+ if (!success) {
+ SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n");
+ /* TODO: go into read-only mode */
+ assert(0);
+ }
+
+ pthread_spin_lock(&nv_cache->lock);
+ nv_cache->ready = true;
+ pthread_spin_unlock(&nv_cache->lock);
+
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+ftl_nv_cache_wrap(void *ctx)
+{
+ struct ftl_nv_cache *nv_cache = ctx;
+ int rc;
+
+ rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_wrap_cb, nv_cache);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n",
+ spdk_strerror(-rc));
+ /* TODO: go into read-only mode */
+ assert(0);
+ }
+}
+
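+/*
+ * Reserve up to *num_blocks blocks of the non-volatile cache (the reserved count
+ * is written back to *num_blocks), returning the starting cache address or
+ * FTL_LBA_INVALID if the cache is unavailable.  Wraps back to the data offset
+ * and advances the phase once the end of the cache bdev is reached.
+ */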
+static uint64_t
+ftl_reserve_nv_cache(struct ftl_nv_cache *nv_cache, size_t *num_blocks, unsigned int *phase)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
+ struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache);
+ uint64_t num_available, cache_size, cache_addr = FTL_LBA_INVALID;
+
+ cache_size = spdk_bdev_get_num_blocks(bdev);
+
+ pthread_spin_lock(&nv_cache->lock);
+ if (spdk_unlikely(nv_cache->num_available == 0 || !nv_cache->ready)) {
+ goto out;
+ }
+
+ num_available = spdk_min(nv_cache->num_available, *num_blocks);
+ num_available = spdk_min(num_available, dev->conf.nv_cache.max_request_cnt);
+
+ if (spdk_unlikely(nv_cache->current_addr + num_available > cache_size)) {
+ *num_blocks = cache_size - nv_cache->current_addr;
+ } else {
+ *num_blocks = num_available;
+ }
+
+ cache_addr = nv_cache->current_addr;
+ nv_cache->current_addr += *num_blocks;
+ nv_cache->num_available -= *num_blocks;
+ *phase = nv_cache->phase;
+
+ if (nv_cache->current_addr == spdk_bdev_get_num_blocks(bdev)) {
+ nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET;
+ nv_cache->phase = ftl_nv_cache_next_phase(nv_cache->phase);
+ nv_cache->ready = false;
+ spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_nv_cache_wrap, nv_cache);
+ }
+out:
+ pthread_spin_unlock(&nv_cache->lock);
+ return cache_addr;
+}
+
+static struct ftl_io *
+ftl_alloc_io_nv_cache(struct ftl_io *parent, size_t num_blocks)
+{
+ struct ftl_io_init_opts opts = {
+ .dev = parent->dev,
+ .parent = parent,
+ .iovcnt = 0,
+ .num_blocks = num_blocks,
+ .flags = parent->flags | FTL_IO_CACHE,
+ };
+
+ return ftl_io_init_internal(&opts);
+}
+
+static void
+ftl_nv_cache_submit_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_io *io = cb_arg;
+ struct ftl_nv_cache *nv_cache = &io->dev->nv_cache;
+
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Non-volatile cache write failed at %"PRIx64"\n", io->addr.offset);
+ io->status = -EIO;
+ }
+
+ ftl_io_dec_req(io);
+ if (ftl_io_done(io)) {
+ spdk_mempool_put(nv_cache->md_pool, io->md);
+ ftl_io_complete(io);
+ }
+
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+ftl_submit_nv_cache(void *ctx)
+{
+ struct ftl_io *io = ctx;
+ struct spdk_ftl_dev *dev = io->dev;
+ struct spdk_thread *thread;
+ struct ftl_nv_cache *nv_cache = &dev->nv_cache;
+ struct ftl_io_channel *ioch;
+ int rc;
+
+ ioch = ftl_io_channel_get_ctx(io->ioch);
+ thread = spdk_io_channel_get_thread(io->ioch);
+
+ rc = spdk_bdev_write_blocks_with_md(nv_cache->bdev_desc, ioch->cache_ioch,
+ ftl_io_iovec_addr(io), io->md, io->addr.offset,
+ io->num_blocks, ftl_nv_cache_submit_cb, io);
+ if (rc == -ENOMEM) {
+ spdk_thread_send_msg(thread, ftl_submit_nv_cache, io);
+ return;
+ } else if (rc) {
+ SPDK_ERRLOG("Write to persistent cache failed: %s (%"PRIu64", %"PRIu64")\n",
+ spdk_strerror(-rc), io->addr.offset, io->num_blocks);
+ spdk_mempool_put(nv_cache->md_pool, io->md);
+ io->status = -EIO;
+ ftl_io_complete(io);
+ return;
+ }
+
+ ftl_io_advance(io, io->num_blocks);
+ ftl_io_inc_req(io);
+}
+
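+/* Pack each block's LBA together with the given phase into the IO's metadata buffer */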
+static void
+ftl_nv_cache_fill_md(struct ftl_io *io, unsigned int phase)
+{
+ struct spdk_bdev *bdev;
+ struct ftl_nv_cache *nv_cache = &io->dev->nv_cache;
+ uint64_t block_off, lba;
+ void *md_buf = io->md;
+
+ bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
+
+ for (block_off = 0; block_off < io->num_blocks; ++block_off) {
+ lba = ftl_nv_cache_pack_lba(ftl_io_get_lba(io, block_off), phase);
+ memcpy(md_buf, &lba, sizeof(lba));
+ md_buf += spdk_bdev_get_md_size(bdev);
+ }
+}
+
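+/*
+ * Split the IO into child IOs backed by reserved regions of the non-volatile
+ * cache; when buffers or cache space run out, resubmit via a thread message.
+ */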
+static void
+_ftl_write_nv_cache(void *ctx)
+{
+ struct ftl_io *child, *io = ctx;
+ struct spdk_ftl_dev *dev = io->dev;
+ struct spdk_thread *thread;
+ unsigned int phase;
+ uint64_t num_blocks;
+
+ thread = spdk_io_channel_get_thread(io->ioch);
+
+ while (io->pos < io->num_blocks) {
+ num_blocks = ftl_io_iovec_len_left(io);
+
+ child = ftl_alloc_io_nv_cache(io, num_blocks);
+ if (spdk_unlikely(!child)) {
+ spdk_thread_send_msg(thread, _ftl_write_nv_cache, io);
+ return;
+ }
+
+ child->md = spdk_mempool_get(dev->nv_cache.md_pool);
+ if (spdk_unlikely(!child->md)) {
+ ftl_io_free(child);
+ spdk_thread_send_msg(thread, _ftl_write_nv_cache, io);
+ break;
+ }
+
+ /* Reserve an area in the non-volatile cache */
+ child->addr.offset = ftl_reserve_nv_cache(&dev->nv_cache, &num_blocks, &phase);
+ if (child->addr.offset == FTL_LBA_INVALID) {
+ spdk_mempool_put(dev->nv_cache.md_pool, child->md);
+ ftl_io_free(child);
+ spdk_thread_send_msg(thread, _ftl_write_nv_cache, io);
+ break;
+ }
+
+ /* Shrink the IO if there isn't enough room in the cache to fill the whole iovec */
+ if (spdk_unlikely(num_blocks != ftl_io_iovec_len_left(io))) {
+ ftl_io_shrink_iovec(child, num_blocks);
+ }
+
+ ftl_nv_cache_fill_md(child, phase);
+ ftl_submit_nv_cache(child);
+ }
+
+ if (ftl_io_done(io)) {
+ ftl_io_complete(io);
+ }
+}
+
+static void
+ftl_write_nv_cache(struct ftl_io *parent)
+{
+ ftl_io_reset(parent);
+ parent->flags |= FTL_IO_CACHE;
+ _ftl_write_nv_cache(parent);
+}
+
+int
+ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown,
+ spdk_bdev_io_completion_cb cb_fn, void *cb_arg)
+{
+ struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache);
+ struct ftl_nv_cache_header *hdr = nv_cache->dma_buf;
+ struct spdk_bdev *bdev;
+ struct ftl_io_channel *ioch;
+
+ bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
+ ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev));
+
+ memset(hdr, 0, spdk_bdev_get_block_size(bdev));
+
+ hdr->phase = (uint8_t)nv_cache->phase;
+ hdr->size = spdk_bdev_get_num_blocks(bdev);
+ hdr->uuid = dev->uuid;
+ hdr->version = FTL_NV_CACHE_HEADER_VERSION;
+ hdr->current_addr = shutdown ? nv_cache->current_addr : FTL_LBA_INVALID;
+ hdr->checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0);
+
+ return spdk_bdev_write_blocks(nv_cache->bdev_desc, ioch->cache_ioch, hdr, 0, 1,
+ cb_fn, cb_arg);
+}
+
+int
+ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn, void *cb_arg)
+{
+ struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache);
+ struct ftl_io_channel *ioch;
+ struct spdk_bdev *bdev;
+
+ ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev));
+ bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
+
+ return spdk_bdev_write_zeroes_blocks(nv_cache->bdev_desc, ioch->cache_ioch, 1,
+ spdk_bdev_get_num_blocks(bdev) - 1,
+ cb_fn, cb_arg);
+}
+
+static void
+ftl_write_fail(struct ftl_io *io, int status)
+{
+ struct ftl_batch *batch = io->batch;
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_wbuf_entry *entry;
+ struct ftl_band *band;
+ char buf[128];
+
+ entry = TAILQ_FIRST(&batch->entries);
+
+ band = ftl_band_from_addr(io->dev, entry->addr);
+ SPDK_ERRLOG("Write failed @addr: %s, status: %d\n",
+ ftl_addr2str(entry->addr, buf, sizeof(buf)), status);
+
+ /* Close the band, halt the wptr and defrag */
+ ftl_halt_writes(dev, band);
+
+ TAILQ_FOREACH(entry, &batch->entries, tailq) {
+ /* Invalidate meta set by process_writes() */
+ ftl_invalidate_addr(dev, entry->addr);
+ }
+
+ /* Reset the batch back to the write buffer to resend it later */
+ TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq);
+}
+
+static void
+ftl_write_cb(struct ftl_io *io, void *arg, int status)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_batch *batch = io->batch;
+ struct ftl_wbuf_entry *entry;
+ struct ftl_band *band;
+ struct ftl_addr prev_addr, addr = io->addr;
+
+ if (status) {
+ ftl_write_fail(io, status);
+ return;
+ }
+
+ assert(io->num_blocks == dev->xfer_size);
+ assert(!(io->flags & FTL_IO_MD));
+
+ TAILQ_FOREACH(entry, &batch->entries, tailq) {
+ band = entry->band;
+ if (!(entry->io_flags & FTL_IO_PAD)) {
+ /* Verify that the LBA is set for user blocks */
+ assert(entry->lba != FTL_LBA_INVALID);
+ }
+
+ if (band != NULL) {
+ assert(band->num_reloc_blocks > 0);
+ band->num_reloc_blocks--;
+ }
+
+ entry->addr = addr;
+ if (entry->lba != FTL_LBA_INVALID) {
+ pthread_spin_lock(&entry->lock);
+ prev_addr = ftl_l2p_get(dev, entry->lba);
+
+ /* If the l2p was updated in the meantime, don't update band's metadata */
+ if (ftl_addr_cached(prev_addr) &&
+ entry == ftl_get_entry_from_addr(dev, prev_addr)) {
+ /* Setting entry's cache bit needs to be done after metadata */
+ /* within the band is updated to make sure that writes */
+ /* invalidating the entry clear the metadata as well */
+ ftl_band_set_addr(io->band, entry->lba, entry->addr);
+ entry->valid = true;
+ }
+ pthread_spin_unlock(&entry->lock);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lu, lba:%lu\n",
+ entry->addr.offset, entry->lba);
+
+ addr = ftl_band_next_addr(io->band, addr, 1);
+ }
+
+ ftl_process_flush(dev, batch);
+ ftl_release_batch(dev, batch);
+}
+
+static void
+ftl_update_stats(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry)
+{
+ if (!(entry->io_flags & FTL_IO_INTERNAL)) {
+ dev->stats.write_user++;
+ }
+ dev->stats.write_total++;
+}
+
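+/*
+ * Update the L2P mapping for a write buffer entry, handling weak (relocation)
+ * writes and previous addresses that still point into the write buffer cache.
+ */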
+static void
+ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry,
+ struct ftl_addr addr)
+{
+ struct ftl_addr prev_addr;
+ struct ftl_wbuf_entry *prev;
+ struct ftl_band *band;
+ int valid;
+ bool io_weak = entry->io_flags & FTL_IO_WEAK;
+
+ prev_addr = ftl_l2p_get(dev, entry->lba);
+ if (ftl_addr_invalid(prev_addr)) {
+ ftl_l2p_set(dev, entry->lba, addr);
+ return;
+ }
+
+ if (ftl_addr_cached(prev_addr)) {
+ prev = ftl_get_entry_from_addr(dev, prev_addr);
+ pthread_spin_lock(&prev->lock);
+
+ /* Re-read the L2P under the lock to protect against updates */
+ /* to this LBA from other threads */
+ prev_addr = ftl_l2p_get(dev, entry->lba);
+
+ /* If the entry is no longer in cache, another write has been */
+ /* scheduled in the meantime, so we can return to evicted path */
+ if (!ftl_addr_cached(prev_addr)) {
+ pthread_spin_unlock(&prev->lock);
+ goto evicted;
+ }
+
+ /*
+ * The block being relocated could still reside in the cache, because write
+ * buffers are independent for each IO channel and a sufficient amount of data
+ * (the write unit size) must be collected before it is submitted to the lower
+ * layer.
+ * If the previous entry wasn't overwritten, invalidate the old address and entry.
+ * Otherwise skip relocating the block.
+ */
+ if (io_weak &&
+ /* Check if prev_addr was updated in the meantime */
+ !(ftl_addr_cmp(prev_addr, ftl_get_addr_from_entry(prev)) &&
+ /* Check if the relocated address is the same as in the previous entry */
+ ftl_addr_cmp(prev->addr, entry->addr))) {
+ pthread_spin_unlock(&prev->lock);
+ return;
+ }
+
+ /*
+ * If the previous entry is part of the cache and has already been written
+ * to disk, remove and invalidate it
+ */
+ if (prev->valid) {
+ ftl_invalidate_addr(dev, prev->addr);
+ prev->valid = false;
+ }
+
+ ftl_l2p_set(dev, entry->lba, addr);
+ pthread_spin_unlock(&prev->lock);
+ return;
+ }
+
+evicted:
+ /*
+ * If the L2P's physical address is different than what we expected we don't need to
+ * do anything (someone's already overwritten our data).
+ */
+ if (io_weak && !ftl_addr_cmp(prev_addr, entry->addr)) {
+ return;
+ }
+
+ /* Lock the band containing the previous physical address. This ensures atomic changes */
+ /* to the L2P as well as to the metadata. The valid bits in the metadata are used to */
+ /* check the validity of weak writes. */
+ band = ftl_band_from_addr(dev, prev_addr);
+ pthread_spin_lock(&band->lba_map.lock);
+
+ valid = ftl_invalidate_addr_unlocked(dev, prev_addr);
+
+ /* If the address has been invalidated already, we don't want to update */
+ /* the L2P for weak writes, as it means the write is no longer valid. */
+ if (!io_weak || valid) {
+ ftl_l2p_set(dev, entry->lba, addr);
+ }
+
+ pthread_spin_unlock(&band->lba_map.lock);
+}
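+
+/*
+ * Rough summary of the three paths above: if the LBA was unmapped, the L2P is simply
+ * pointed at the new address; if it pointed at a write buffer entry, the entry lock is
+ * taken, the L2P is re-read, and weak (relocation) writes only proceed when the cached
+ * entry still refers to the block being relocated; if it pointed at media, the owning
+ * band's LBA map lock is taken, the old block is invalidated, and weak writes whose
+ * target was already invalidated skip the L2P update.
+ */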
+
+static struct ftl_io *
+ftl_io_init_child_write(struct ftl_io *parent, struct ftl_addr addr, ftl_io_fn cb)
+{
+ struct ftl_io *io;
+ struct spdk_ftl_dev *dev = parent->dev;
+ struct ftl_io_init_opts opts = {
+ .dev = dev,
+ .io = NULL,
+ .parent = parent,
+ .band = parent->band,
+ .size = sizeof(struct ftl_io),
+ .flags = 0,
+ .type = parent->type,
+ .num_blocks = dev->xfer_size,
+ .cb_fn = cb,
+ .iovcnt = 0,
+ };
+
+ io = ftl_io_init_internal(&opts);
+ if (!io) {
+ return NULL;
+ }
+
+ io->addr = addr;
+
+ return io;
+}
+
+static void
+ftl_io_child_write_cb(struct ftl_io *io, void *ctx, int status)
+{
+ struct ftl_zone *zone;
+ struct ftl_wptr *wptr;
+
+ zone = ftl_band_zone_from_addr(io->band, io->addr);
+ wptr = ftl_wptr_from_band(io->band);
+
+ zone->busy = false;
+ zone->info.write_pointer += io->num_blocks;
+
+ if (zone->info.write_pointer == zone->info.zone_id + zone->info.capacity) {
+ zone->info.state = SPDK_BDEV_ZONE_STATE_FULL;
+ }
+
+ /* If some other write on the same band failed, the write pointer would already be freed */
+ if (spdk_likely(wptr)) {
+ wptr->num_outstanding--;
+ }
+}
+
+static int
+ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_io_channel *ioch;
+ struct ftl_io *child;
+ struct ftl_addr addr;
+ int rc;
+
+ ioch = ftl_io_channel_get_ctx(io->ioch);
+
+ if (spdk_likely(!wptr->direct_mode)) {
+ addr = wptr->addr;
+ } else {
+ assert(io->flags & FTL_IO_DIRECT_ACCESS);
+ assert(ftl_addr_get_band(dev, io->addr) == wptr->band->id);
+ addr = io->addr;
+ }
+
+ /* Split the IO into child requests and release the zone immediately after each child completes */
+ child = ftl_io_init_child_write(io, addr, ftl_io_child_write_cb);
+ if (!child) {
+ return -EAGAIN;
+ }
+
+ wptr->num_outstanding++;
+
+ if (ftl_is_append_supported(dev)) {
+ rc = spdk_bdev_zone_appendv(dev->base_bdev_desc, ioch->base_ioch,
+ child->iov, child->iov_cnt,
+ ftl_addr_get_zone_slba(dev, addr),
+ dev->xfer_size, ftl_io_cmpl_cb, child);
+ } else {
+ rc = spdk_bdev_writev_blocks(dev->base_bdev_desc, ioch->base_ioch,
+ child->iov, child->iov_cnt, addr.offset,
+ dev->xfer_size, ftl_io_cmpl_cb, child);
+ }
+
+ if (rc) {
+ wptr->num_outstanding--;
+ ftl_io_fail(child, rc);
+ ftl_io_complete(child);
+ SPDK_ERRLOG("spdk_bdev_write_blocks_with_md failed with status:%d, addr:%lu\n",
+ rc, addr.offset);
+ return -EIO;
+ }
+
+ ftl_io_inc_req(child);
+ ftl_io_advance(child, dev->xfer_size);
+
+ return 0;
+}
+
+static int
+ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ int rc = 0;
+
+ assert(io->num_blocks % dev->xfer_size == 0);
+
+ while (io->iov_pos < io->iov_cnt) {
+ /* The NVMe submission queue gives no guarantees about the order of completions, */
+ /* so wait until the zone is no longer busy before submitting another write */
+ if (!ftl_is_append_supported(dev) && wptr->zone->busy) {
+ TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry);
+ rc = -EAGAIN;
+ break;
+ }
+
+ rc = ftl_submit_child_write(wptr, io);
+ if (spdk_unlikely(rc)) {
+ if (rc == -EAGAIN) {
+ TAILQ_INSERT_TAIL(&wptr->pending_queue, io, ioch_entry);
+ } else {
+ ftl_io_fail(io, rc);
+ }
+ break;
+ }
+
+ ftl_trace_submission(dev, io, wptr->addr, dev->xfer_size);
+ ftl_wptr_advance(wptr, dev->xfer_size);
+ }
+
+ if (ftl_io_done(io)) {
+ /* Parent IO will complete after all children are completed */
+ ftl_io_complete(io);
+ }
+
+ return rc;
+}
+
+static void
+ftl_flush_pad_batch(struct spdk_ftl_dev *dev)
+{
+ struct ftl_batch *batch = dev->current_batch;
+ struct ftl_io_channel *ioch;
+ size_t size = 0, num_entries = 0;
+
+ assert(batch != NULL);
+ assert(batch->num_entries < dev->xfer_size);
+
+ TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) {
+ size += spdk_ring_count(ioch->submit_queue);
+ }
+
+ num_entries = dev->xfer_size - batch->num_entries;
+ if (size < num_entries) {
+ ftl_pad_wbuf(dev, num_entries - size);
+ }
+}
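+
+/*
+ * For illustration (numbers hypothetical): with xfer_size = 16, a current batch holding
+ * 10 entries and 2 entries already sitting in the submit queues, the function above pads
+ * the write buffer with 16 - 10 - 2 = 4 entries so the batch can be completed.
+ */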
+
+static bool
+ftl_check_io_channel_flush(struct spdk_ftl_dev *dev)
+{
+ struct ftl_io_channel *ioch;
+
+ TAILQ_FOREACH(ioch, &dev->ioch_queue, tailq) {
+ if (ioch->flush && spdk_ring_count(ioch->free_queue) != ioch->num_entries) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static int
+ftl_wptr_process_writes(struct ftl_wptr *wptr)
+{
+ struct spdk_ftl_dev *dev = wptr->dev;
+ struct ftl_batch *batch;
+ struct ftl_wbuf_entry *entry;
+ struct ftl_io *io;
+
+ if (spdk_unlikely(!TAILQ_EMPTY(&wptr->pending_queue))) {
+ io = TAILQ_FIRST(&wptr->pending_queue);
+ TAILQ_REMOVE(&wptr->pending_queue, io, ioch_entry);
+
+ if (ftl_submit_write(wptr, io) == -EAGAIN) {
+ return 0;
+ }
+ }
+
+ /* Make sure the band is prepared for writing */
+ if (!ftl_wptr_ready(wptr)) {
+ return 0;
+ }
+
+ if (dev->halt) {
+ ftl_wptr_process_shutdown(wptr);
+ }
+
+ if (spdk_unlikely(wptr->flush)) {
+ ftl_wptr_pad_band(wptr);
+ }
+
+ batch = ftl_get_next_batch(dev);
+ if (!batch) {
+ /* If there are queued flush requests we need to pad the write buffer to */
+ /* force out remaining entries */
+ if (!LIST_EMPTY(&dev->flush_list) || ftl_check_io_channel_flush(dev)) {
+ ftl_flush_pad_batch(dev);
+ }
+
+ return 0;
+ }
+
+ io = ftl_io_wbuf_init(dev, wptr->addr, wptr->band, batch, ftl_write_cb);
+ if (!io) {
+ goto error;
+ }
+
+ TAILQ_FOREACH(entry, &batch->entries, tailq) {
+ /* Update band's relocation stats if the IO comes from reloc */
+ if (entry->io_flags & FTL_IO_WEAK) {
+ if (!spdk_bit_array_get(wptr->band->reloc_bitmap, entry->band->id)) {
+ spdk_bit_array_set(wptr->band->reloc_bitmap, entry->band->id);
+ entry->band->num_reloc_bands++;
+ }
+ }
+
+ ftl_trace_wbuf_pop(dev, entry);
+ ftl_update_stats(dev, entry);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write addr:%lx\n", wptr->addr.offset);
+
+ if (ftl_submit_write(wptr, io)) {
+ /* TODO: we need some recovery here */
+ assert(0 && "Write submit failed");
+ if (ftl_io_done(io)) {
+ ftl_io_free(io);
+ }
+ }
+
+ return dev->xfer_size;
+error:
+ TAILQ_INSERT_TAIL(&dev->pending_batches, batch, tailq);
+ return 0;
+}
+
+static int
+ftl_process_writes(struct spdk_ftl_dev *dev)
+{
+ struct ftl_wptr *wptr, *twptr;
+ size_t num_active = 0;
+ enum ftl_band_state state;
+
+ LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) {
+ ftl_wptr_process_writes(wptr);
+ state = wptr->band->state;
+
+ if (state != FTL_BAND_STATE_FULL &&
+ state != FTL_BAND_STATE_CLOSING &&
+ state != FTL_BAND_STATE_CLOSED) {
+ num_active++;
+ }
+ }
+
+ if (num_active < 1) {
+ ftl_add_wptr(dev);
+ }
+
+ return 0;
+}
+
+static void
+ftl_fill_wbuf_entry(struct ftl_wbuf_entry *entry, struct ftl_io *io)
+{
+ memcpy(entry->payload, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE);
+
+ if (entry->io_flags & FTL_IO_WEAK) {
+ entry->band = ftl_band_from_addr(io->dev, io->addr);
+ entry->addr = ftl_band_next_addr(entry->band, io->addr, io->pos);
+ entry->band->num_reloc_blocks++;
+ }
+
+ entry->trace = io->trace;
+ entry->lba = ftl_io_current_lba(io);
+}
+
+static int
+ftl_wbuf_fill(struct ftl_io *io)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_io_channel *ioch;
+ struct ftl_wbuf_entry *entry;
+
+ ioch = ftl_io_channel_get_ctx(io->ioch);
+
+ while (io->pos < io->num_blocks) {
+ if (ftl_io_current_lba(io) == FTL_LBA_INVALID) {
+ ftl_io_advance(io, 1);
+ continue;
+ }
+
+ entry = ftl_acquire_wbuf_entry(ioch, io->flags);
+ if (!entry) {
+ TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry);
+ return 0;
+ }
+
+ ftl_fill_wbuf_entry(entry, io);
+
+ ftl_trace_wbuf_fill(dev, io);
+ ftl_update_l2p(dev, entry, ftl_get_addr_from_entry(entry));
+ ftl_io_advance(io, 1);
+
+ /* This needs to be done after the L2P is updated to avoid a race with the */
+ /* write completion callback in case it is processed before the L2P is set */
+ /* in ftl_update_l2p(). */
+ spdk_ring_enqueue(ioch->submit_queue, (void **)&entry, 1, NULL);
+ }
+
+ if (ftl_io_done(io)) {
+ if (ftl_dev_has_nv_cache(dev) && !(io->flags & FTL_IO_BYPASS_CACHE)) {
+ ftl_write_nv_cache(io);
+ } else {
+ TAILQ_INSERT_TAIL(&ioch->write_cmpl_queue, io, ioch_entry);
+ }
+ }
+
+ return 0;
+}
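+
+/*
+ * Sketch of the user write data path as implemented above and in ftl_wptr_process_writes()
+ * and ftl_write_cb(): each block of a user IO is copied into a write buffer entry, the L2P
+ * is pointed at the entry's cache address and the entry is pushed onto the IO channel's
+ * submit queue; the core poller groups entries into xfer_size batches and submits them to
+ * the base bdev; on completion ftl_write_cb() records the final media address in the band
+ * metadata.
+ */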
+
+static bool
+ftl_dev_needs_defrag(struct spdk_ftl_dev *dev)
+{
+ const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START);
+
+ if (ftl_reloc_is_halted(dev->reloc)) {
+ return false;
+ }
+
+ if (ftl_reloc_is_defrag_active(dev->reloc)) {
+ return false;
+ }
+
+ if (dev->num_free <= limit->thld) {
+ return true;
+ }
+
+ return false;
+}
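+
+/*
+ * Defrag kicks in once the number of free bands drops to the SPDK_FTL_LIMIT_START
+ * threshold (40 free bands with the defaults defined in ftl_init.c later in this patch),
+ * provided relocation is running and no defrag is already active.
+ */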
+
+static double
+ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid)
+{
+ size_t usable, valid, invalid;
+ double vld_ratio;
+
+ /* If the band doesn't have any usable blocks it's of no use */
+ usable = ftl_band_num_usable_blocks(band);
+ if (usable == 0) {
+ return 0.0;
+ }
+
+ valid = threshold_valid ? (usable - *threshold_valid) : band->lba_map.num_vld;
+ invalid = usable - valid;
+
+ /* Add one to avoid division by 0 */
+ vld_ratio = (double)invalid / (double)(valid + 1);
+ return vld_ratio * ftl_band_age(band);
+}
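+
+/*
+ * Example (hypothetical numbers): for a band with 1000 usable blocks of which 100 are
+ * still valid, invalid = 900 and the ratio is 900 / 101 ~= 8.9; the merit is that ratio
+ * scaled by the band's age, so older, mostly-invalid bands are preferred for defrag.
+ */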
+
+static bool
+ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev)
+{
+ struct spdk_ftl_conf *conf = &dev->conf;
+ size_t thld_vld;
+
+ /* If we're in dire need of free bands, every band is worth defragging */
+ if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) {
+ return true;
+ }
+
+ thld_vld = (ftl_band_num_usable_blocks(band) * conf->invalid_thld) / 100;
+
+ return band->merit > ftl_band_calc_merit(band, &thld_vld);
+}
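+
+/*
+ * Since both sides of the comparison above use the same band age, the check effectively
+ * compares the band's invalid ratio against conf->invalid_thld percent of its usable
+ * blocks. Hypothetically, 1000 usable blocks and invalid_thld = 10 give thld_vld = 100,
+ * i.e. the reference merit of a band with 100 invalid blocks.
+ */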
+
+static struct ftl_band *
+ftl_select_defrag_band(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band, *mband = NULL;
+ double merit = 0;
+
+ LIST_FOREACH(band, &dev->shut_bands, list_entry) {
+ assert(band->state == FTL_BAND_STATE_CLOSED);
+ band->merit = ftl_band_calc_merit(band, NULL);
+ if (band->merit > merit) {
+ merit = band->merit;
+ mband = band;
+ }
+ }
+
+ if (mband && !ftl_band_needs_defrag(mband, dev)) {
+ mband = NULL;
+ }
+
+ return mband;
+}
+
+static void
+ftl_process_relocs(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band;
+
+ if (ftl_dev_needs_defrag(dev)) {
+ band = ftl_select_defrag_band(dev);
+ if (band) {
+ ftl_reloc_add(dev->reloc, band, 0, ftl_get_num_blocks_in_band(dev), 0, true);
+ ftl_trace_defrag_band(dev, band);
+ }
+ }
+
+ ftl_reloc(dev->reloc);
+}
+
+int
+ftl_current_limit(const struct spdk_ftl_dev *dev)
+{
+ return dev->limit;
+}
+
+void
+spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs)
+{
+ attrs->uuid = dev->uuid;
+ attrs->num_blocks = dev->num_lbas;
+ attrs->block_size = FTL_BLOCK_SIZE;
+ attrs->num_zones = ftl_get_num_zones(dev);
+ attrs->zone_size = ftl_get_num_blocks_in_zone(dev);
+ attrs->conf = dev->conf;
+ attrs->base_bdev = spdk_bdev_get_name(spdk_bdev_desc_get_bdev(dev->base_bdev_desc));
+
+ attrs->cache_bdev = NULL;
+ if (dev->nv_cache.bdev_desc) {
+ attrs->cache_bdev = spdk_bdev_get_name(
+ spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc));
+ }
+}
+
+static void
+_ftl_io_write(void *ctx)
+{
+ ftl_io_write((struct ftl_io *)ctx);
+}
+
+static int
+ftl_submit_write_leaf(struct ftl_io *io)
+{
+ int rc;
+
+ rc = ftl_submit_write(ftl_wptr_from_band(io->band), io);
+ if (rc == -EAGAIN) {
+ /* EAGAIN means that the request was put on the pending queue */
+ return 0;
+ }
+
+ return rc;
+}
+
+void
+ftl_io_write(struct ftl_io *io)
+{
+ struct spdk_ftl_dev *dev = io->dev;
+ struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(io->ioch);
+
+ /* Put the IO on the retry queue if the IO channel is not yet initialized */
+ if (spdk_unlikely(ioch->index == FTL_IO_CHANNEL_INDEX_INVALID)) {
+ TAILQ_INSERT_TAIL(&ioch->retry_queue, io, ioch_entry);
+ return;
+ }
+
+ /* For normal IOs we just need to copy the data onto the write buffer */
+ if (!(io->flags & FTL_IO_MD)) {
+ ftl_io_call_foreach_child(io, ftl_wbuf_fill);
+ } else {
+ /* Metadata has its own buffer, so it doesn't have to be copied; just */
+ /* send it to the core thread and schedule the write immediately */
+ if (ftl_check_core_thread(dev)) {
+ ftl_io_call_foreach_child(io, ftl_submit_write_leaf);
+ } else {
+ spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io);
+ }
+ }
+}
+
+int
+spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
+ struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
+{
+ struct ftl_io *io;
+
+ if (iov_cnt == 0) {
+ return -EINVAL;
+ }
+
+ if (lba_cnt == 0) {
+ return -EINVAL;
+ }
+
+ if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) {
+ return -EINVAL;
+ }
+
+ if (!dev->initialized) {
+ return -EBUSY;
+ }
+
+ io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE);
+ if (!io) {
+ return -ENOMEM;
+ }
+
+ ftl_io_write(io);
+
+ return 0;
+}
+
+void
+ftl_io_read(struct ftl_io *io)
+{
+ ftl_io_call_foreach_child(io, ftl_submit_read);
+}
+
+int
+spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
+ struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
+{
+ struct ftl_io *io;
+
+ if (iov_cnt == 0) {
+ return -EINVAL;
+ }
+
+ if (lba_cnt == 0) {
+ return -EINVAL;
+ }
+
+ if (lba_cnt != ftl_iovec_num_blocks(iov, iov_cnt)) {
+ return -EINVAL;
+ }
+
+ if (!dev->initialized) {
+ return -EBUSY;
+ }
+
+ io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ);
+ if (!io) {
+ return -ENOMEM;
+ }
+
+ ftl_io_read(io);
+ return 0;
+}
+
+static struct ftl_flush *
+ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
+{
+ struct ftl_flush *flush;
+
+ flush = calloc(1, sizeof(*flush));
+ if (!flush) {
+ return NULL;
+ }
+
+ flush->bmap = spdk_bit_array_create(FTL_BATCH_COUNT);
+ if (!flush->bmap) {
+ goto error;
+ }
+
+ flush->dev = dev;
+ flush->cb.fn = cb_fn;
+ flush->cb.ctx = cb_arg;
+
+ return flush;
+error:
+ free(flush);
+ return NULL;
+}
+
+static void
+_ftl_flush(void *ctx)
+{
+ struct ftl_flush *flush = ctx;
+ struct spdk_ftl_dev *dev = flush->dev;
+ uint32_t i;
+
+ /* Attach flush object to all non-empty batches */
+ for (i = 0; i < FTL_BATCH_COUNT; ++i) {
+ if (dev->batch_array[i].num_entries > 0) {
+ spdk_bit_array_set(flush->bmap, i);
+ flush->num_req++;
+ }
+ }
+
+ LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry);
+
+ /* If the write buffer was already empty, the flush can be completed right away */
+ if (!flush->num_req) {
+ ftl_complete_flush(flush);
+ }
+}
+
+int
+ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
+{
+ struct ftl_flush *flush;
+
+ flush = ftl_flush_init(dev, cb_fn, cb_arg);
+ if (!flush) {
+ return -ENOMEM;
+ }
+
+ spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush);
+ return 0;
+}
+
+int
+spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
+{
+ if (!dev->initialized) {
+ return -EBUSY;
+ }
+
+ return ftl_flush_wbuf(dev, cb_fn, cb_arg);
+}
+
+bool
+ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr)
+{
+ struct ftl_zone *zone = ftl_band_zone_from_addr(band, addr);
+
+ return addr.offset < zone->info.write_pointer;
+}
+
+static void ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event);
+
+static void
+_ftl_process_media_event(void *ctx)
+{
+ struct ftl_media_event *event = ctx;
+ struct spdk_ftl_dev *dev = event->dev;
+
+ ftl_process_media_event(dev, event->event);
+ spdk_mempool_put(dev->media_events_pool, event);
+}
+
+static void
+ftl_process_media_event(struct spdk_ftl_dev *dev, struct spdk_bdev_media_event event)
+{
+ struct ftl_band *band;
+ struct ftl_addr addr = { .offset = event.offset };
+ size_t block_off;
+
+ if (!ftl_check_core_thread(dev)) {
+ struct ftl_media_event *media_event;
+
+ media_event = spdk_mempool_get(dev->media_events_pool);
+ if (!media_event) {
+ SPDK_ERRLOG("Media event lost due to lack of memory");
+ return;
+ }
+
+ media_event->dev = dev;
+ media_event->event = event;
+ spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_process_media_event,
+ media_event);
+ return;
+ }
+
+ band = ftl_band_from_addr(dev, addr);
+ block_off = ftl_band_block_offset_from_addr(band, addr);
+
+ ftl_reloc_add(dev->reloc, band, block_off, event.num_blocks, 0, false);
+}
+
+void
+ftl_get_media_events(struct spdk_ftl_dev *dev)
+{
+#define FTL_MAX_MEDIA_EVENTS 128
+ struct spdk_bdev_media_event events[FTL_MAX_MEDIA_EVENTS];
+ size_t num_events, i;
+
+ if (!dev->initialized) {
+ return;
+ }
+
+ do {
+ num_events = spdk_bdev_get_media_events(dev->base_bdev_desc,
+ events, FTL_MAX_MEDIA_EVENTS);
+
+ for (i = 0; i < num_events; ++i) {
+ ftl_process_media_event(dev, events[i]);
+ }
+
+ } while (num_events);
+}
+
+int
+ftl_io_channel_poll(void *arg)
+{
+ struct ftl_io_channel *ch = arg;
+ struct ftl_io *io;
+ TAILQ_HEAD(, ftl_io) retry_queue;
+
+ if (TAILQ_EMPTY(&ch->write_cmpl_queue) && TAILQ_EMPTY(&ch->retry_queue)) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ while (!TAILQ_EMPTY(&ch->write_cmpl_queue)) {
+ io = TAILQ_FIRST(&ch->write_cmpl_queue);
+ TAILQ_REMOVE(&ch->write_cmpl_queue, io, ioch_entry);
+ ftl_io_complete(io);
+ }
+
+ /*
+ * Create a local copy of the retry queue to prevent infinite retries in case an IO is
+ * inserted back onto the retry queue
+ */
+ TAILQ_INIT(&retry_queue);
+ TAILQ_SWAP(&ch->retry_queue, &retry_queue, ftl_io, ioch_entry);
+
+ while (!TAILQ_EMPTY(&retry_queue)) {
+ io = TAILQ_FIRST(&retry_queue);
+ TAILQ_REMOVE(&retry_queue, io, ioch_entry);
+ if (io->type == FTL_IO_WRITE) {
+ ftl_io_write(io);
+ } else {
+ ftl_io_read(io);
+ }
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+int
+ftl_task_core(void *ctx)
+{
+ struct spdk_ftl_dev *dev = ctx;
+
+ if (dev->halt) {
+ if (ftl_shutdown_complete(dev)) {
+ spdk_poller_unregister(&dev->core_poller);
+ return SPDK_POLLER_IDLE;
+ }
+ }
+
+ ftl_process_writes(dev);
+ ftl_process_relocs(dev);
+
+ return SPDK_POLLER_BUSY;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE)
diff --git a/src/spdk/lib/ftl/ftl_core.h b/src/spdk/lib/ftl/ftl_core.h
new file mode 100644
index 000000000..b782ba731
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_core.h
@@ -0,0 +1,552 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FTL_CORE_H
+#define FTL_CORE_H
+
+#include "spdk/stdinc.h"
+#include "spdk/uuid.h"
+#include "spdk/thread.h"
+#include "spdk/util.h"
+#include "spdk_internal/log.h"
+#include "spdk/likely.h"
+#include "spdk/queue.h"
+#include "spdk/ftl.h"
+#include "spdk/bdev.h"
+#include "spdk/bdev_zone.h"
+
+#include "ftl_addr.h"
+#include "ftl_io.h"
+#include "ftl_trace.h"
+
+#ifdef SPDK_CONFIG_PMDK
+#include "libpmem.h"
+#endif /* SPDK_CONFIG_PMDK */
+
+struct spdk_ftl_dev;
+struct ftl_band;
+struct ftl_zone;
+struct ftl_io;
+struct ftl_restore;
+struct ftl_wptr;
+struct ftl_flush;
+struct ftl_reloc;
+struct ftl_anm_event;
+struct ftl_band_flush;
+
+struct ftl_stats {
+ /* Number of writes scheduled directly by the user */
+ uint64_t write_user;
+
+ /* Total number of writes */
+ uint64_t write_total;
+
+ /* Traces */
+ struct ftl_trace trace;
+
+ /* Number of limits applied */
+ uint64_t limits[SPDK_FTL_LIMIT_MAX];
+};
+
+struct ftl_global_md {
+ /* Device instance */
+ struct spdk_uuid uuid;
+ /* Size of the l2p table */
+ uint64_t num_lbas;
+};
+
+struct ftl_nv_cache {
+ /* Write buffer cache bdev */
+ struct spdk_bdev_desc *bdev_desc;
+ /* Write pointer */
+ uint64_t current_addr;
+ /* Number of available blocks left */
+ uint64_t num_available;
+ /* Maximum number of blocks */
+ uint64_t num_data_blocks;
+ /*
+ * Phase of the current cycle of writes. Each time the whole cache area is filled, the phase
+ * is advanced. The current phase is saved in every IO's metadata, as well as in the header saved
+ * in the first sector. By looking at the phase of each block, it's possible to find the
+ * oldest block and replay the order of the writes when recovering the data from the cache.
+ */
+ unsigned int phase;
+ /* Indicates that the data can be written to the cache */
+ bool ready;
+ /* Metadata pool */
+ struct spdk_mempool *md_pool;
+ /* DMA buffer for writing the header */
+ void *dma_buf;
+ /* Cache lock */
+ pthread_spinlock_t lock;
+};
+
+struct ftl_batch {
+ /* Queue of write buffer entries, can reach up to xfer_size entries */
+ TAILQ_HEAD(, ftl_wbuf_entry) entries;
+ /* Number of entries in the queue above */
+ uint32_t num_entries;
+ /* Index within spdk_ftl_dev.batch_array */
+ uint32_t index;
+ struct iovec *iov;
+ void *metadata;
+ TAILQ_ENTRY(ftl_batch) tailq;
+};
+
+struct spdk_ftl_dev {
+ /* Device instance */
+ struct spdk_uuid uuid;
+ /* Device name */
+ char *name;
+ /* Configuration */
+ struct spdk_ftl_conf conf;
+
+ /* Indicates the device is fully initialized */
+ int initialized;
+ /* Indicates the device is about to be stopped */
+ int halt;
+ /* Indicates the device is about to start stopping - used to handle multiple stop requests */
+ bool halt_started;
+
+ /* Underlying device */
+ struct spdk_bdev_desc *base_bdev_desc;
+
+ /* Non-volatile write buffer cache */
+ struct ftl_nv_cache nv_cache;
+
+ /* LBA map memory pool */
+ struct spdk_mempool *lba_pool;
+
+ /* LBA map requests pool */
+ struct spdk_mempool *lba_request_pool;
+
+ /* Media management events pool */
+ struct spdk_mempool *media_events_pool;
+
+ /* Statistics */
+ struct ftl_stats stats;
+
+ /* Current sequence number */
+ uint64_t seq;
+
+ /* Array of bands */
+ struct ftl_band *bands;
+ /* Number of operational bands */
+ size_t num_bands;
+ /* Next write band */
+ struct ftl_band *next_band;
+ /* Free band list */
+ LIST_HEAD(, ftl_band) free_bands;
+ /* Closed bands list */
+ LIST_HEAD(, ftl_band) shut_bands;
+ /* Number of free bands */
+ size_t num_free;
+
+ /* List of write pointers */
+ LIST_HEAD(, ftl_wptr) wptr_list;
+
+ /* Logical -> physical table */
+ void *l2p;
+ /* Size of the l2p table */
+ uint64_t num_lbas;
+ /* Size of pages mmapped for l2p, valid only for mapping on persistent memory */
+ size_t l2p_pmem_len;
+
+ /* Address size */
+ size_t addr_len;
+
+ /* Flush list */
+ LIST_HEAD(, ftl_flush) flush_list;
+ /* List of band flush requests */
+ LIST_HEAD(, ftl_band_flush) band_flush_list;
+
+ /* Device specific md buffer */
+ struct ftl_global_md global_md;
+
+ /* Metadata size */
+ size_t md_size;
+ void *md_buf;
+
+ /* Transfer unit size */
+ size_t xfer_size;
+
+ /* Current user write limit */
+ int limit;
+
+ /* Inflight IO operations */
+ uint32_t num_inflight;
+
+ /* Manages data relocation */
+ struct ftl_reloc *reloc;
+
+ /* Thread on which the poller is running */
+ struct spdk_thread *core_thread;
+ /* IO channel */
+ struct spdk_io_channel *ioch;
+ /* Poller */
+ struct spdk_poller *core_poller;
+
+ /* The IO channel array provides a means of retrieving write buffer entries
+ * from their address stored in the L2P. The address is divided into two
+ * parts - an IO channel offset pointing at a specific IO channel (within
+ * this array) and an entry offset pointing at a specific entry within that
+ * IO channel.
+ */
+ struct ftl_io_channel **ioch_array;
+ TAILQ_HEAD(, ftl_io_channel) ioch_queue;
+ uint64_t num_io_channels;
+ /* Value required to shift address of a write buffer entry to retrieve
+ * the IO channel it's part of. The other part of the address describes
+ * the offset of an entry within the IO channel's entry array.
+ */
+ uint64_t ioch_shift;
+
+ /* Write buffer batches */
+#define FTL_BATCH_COUNT 4096
+ struct ftl_batch batch_array[FTL_BATCH_COUNT];
+ /* Iovec buffer used by batches */
+ struct iovec *iov_buf;
+ /* Batch currently being filled */
+ struct ftl_batch *current_batch;
+ /* Batches that are full and ready to be sent. A batch is put on this
+ * queue when it is already filled but cannot be sent yet.
+ */
+ TAILQ_HEAD(, ftl_batch) pending_batches;
+ TAILQ_HEAD(, ftl_batch) free_batches;
+
+ /* Devices' list */
+ STAILQ_ENTRY(spdk_ftl_dev) stailq;
+};
+
+struct ftl_nv_cache_header {
+ /* Version of the header */
+ uint32_t version;
+ /* UUID of the FTL device */
+ struct spdk_uuid uuid;
+ /* Size of the non-volatile cache (in blocks) */
+ uint64_t size;
+ /* Contains the next address to be written after clean shutdown, invalid LBA otherwise */
+ uint64_t current_addr;
+ /* Current phase */
+ uint8_t phase;
+ /* Checksum of the header, needs to be last element */
+ uint32_t checksum;
+} __attribute__((packed));
+
+struct ftl_media_event {
+ /* Owner */
+ struct spdk_ftl_dev *dev;
+ /* Media event */
+ struct spdk_bdev_media_event event;
+};
+
+typedef void (*ftl_restore_fn)(struct ftl_restore *, int, void *cb_arg);
+
+void ftl_apply_limits(struct spdk_ftl_dev *dev);
+void ftl_io_read(struct ftl_io *io);
+void ftl_io_write(struct ftl_io *io);
+int ftl_flush_wbuf(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg);
+int ftl_current_limit(const struct spdk_ftl_dev *dev);
+int ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_addr addr);
+int ftl_task_core(void *ctx);
+int ftl_task_read(void *ctx);
+void ftl_process_anm_event(struct ftl_anm_event *event);
+size_t ftl_tail_md_num_blocks(const struct spdk_ftl_dev *dev);
+size_t ftl_tail_md_hdr_num_blocks(void);
+size_t ftl_vld_map_num_blocks(const struct spdk_ftl_dev *dev);
+size_t ftl_lba_map_num_blocks(const struct spdk_ftl_dev *dev);
+size_t ftl_head_md_num_blocks(const struct spdk_ftl_dev *dev);
+int ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg);
+int ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg);
+void ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg);
+int ftl_band_set_direct_access(struct ftl_band *band, bool access);
+bool ftl_addr_is_written(struct ftl_band *band, struct ftl_addr addr);
+int ftl_flush_active_bands(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg);
+int ftl_nv_cache_write_header(struct ftl_nv_cache *nv_cache, bool shutdown,
+ spdk_bdev_io_completion_cb cb_fn, void *cb_arg);
+int ftl_nv_cache_scrub(struct ftl_nv_cache *nv_cache, spdk_bdev_io_completion_cb cb_fn,
+ void *cb_arg);
+void ftl_get_media_events(struct spdk_ftl_dev *dev);
+int ftl_io_channel_poll(void *arg);
+void ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_wbuf_entry *entry);
+struct spdk_io_channel *ftl_get_io_channel(const struct spdk_ftl_dev *dev);
+struct ftl_io_channel *ftl_io_channel_get_ctx(struct spdk_io_channel *ioch);
+
+
+#define ftl_to_addr(address) \
+ (struct ftl_addr) { .offset = (uint64_t)(address) }
+
+#define ftl_to_addr_packed(address) \
+ (struct ftl_addr) { .pack.offset = (uint32_t)(address) }
+
+static inline struct spdk_thread *
+ftl_get_core_thread(const struct spdk_ftl_dev *dev)
+{
+ return dev->core_thread;
+}
+
+static inline size_t
+ftl_get_num_bands(const struct spdk_ftl_dev *dev)
+{
+ return dev->num_bands;
+}
+
+static inline size_t
+ftl_get_num_punits(const struct spdk_ftl_dev *dev)
+{
+ return spdk_bdev_get_optimal_open_zones(spdk_bdev_desc_get_bdev(dev->base_bdev_desc));
+}
+
+static inline size_t
+ftl_get_num_zones(const struct spdk_ftl_dev *dev)
+{
+ return ftl_get_num_bands(dev) * ftl_get_num_punits(dev);
+}
+
+static inline size_t
+ftl_get_num_blocks_in_zone(const struct spdk_ftl_dev *dev)
+{
+ return spdk_bdev_get_zone_size(spdk_bdev_desc_get_bdev(dev->base_bdev_desc));
+}
+
+static inline uint64_t
+ftl_get_num_blocks_in_band(const struct spdk_ftl_dev *dev)
+{
+ return ftl_get_num_punits(dev) * ftl_get_num_blocks_in_zone(dev);
+}
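+
+/*
+ * Illustrative geometry (hypothetical numbers): with 8 optimal open zones (punits),
+ * 4096-block zones and 1000 bands, a band spans 8 * 4096 = 32768 blocks and the device
+ * exposes 1000 * 8 = 8000 zones in total.
+ */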
+
+static inline uint64_t
+ftl_addr_get_zone_slba(const struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ return addr.offset - (addr.offset % ftl_get_num_blocks_in_zone(dev));
+}
+
+static inline uint64_t
+ftl_addr_get_band(const struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ return addr.offset / ftl_get_num_blocks_in_band(dev);
+}
+
+static inline uint64_t
+ftl_addr_get_punit(const struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ return (addr.offset / ftl_get_num_blocks_in_zone(dev)) % ftl_get_num_punits(dev);
+}
+
+static inline uint64_t
+ftl_addr_get_zone_offset(const struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ return addr.offset % ftl_get_num_blocks_in_zone(dev);
+}
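+
+/*
+ * Worked example of the address decomposition above (hypothetical geometry: 4096-block
+ * zones, 8 punits, hence 32768-block bands): addr.offset = 100000 maps to band 3,
+ * punit (100000 / 4096) % 8 = 0, zone offset 100000 % 4096 = 1696 and zone slba 98304.
+ */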
+
+static inline size_t
+ftl_vld_map_size(const struct spdk_ftl_dev *dev)
+{
+ return (size_t)spdk_divide_round_up(ftl_get_num_blocks_in_band(dev), CHAR_BIT);
+}
+
+static inline int
+ftl_addr_packed(const struct spdk_ftl_dev *dev)
+{
+ return dev->addr_len < 32;
+}
+
+static inline void
+ftl_l2p_lba_persist(const struct spdk_ftl_dev *dev, uint64_t lba)
+{
+#ifdef SPDK_CONFIG_PMDK
+ size_t ftl_addr_size = ftl_addr_packed(dev) ? 4 : 8;
+ pmem_persist((char *)dev->l2p + (lba * ftl_addr_size), ftl_addr_size);
+#else /* SPDK_CONFIG_PMDK */
+ SPDK_ERRLOG("Libpmem not available, cannot flush l2p to pmem\n");
+ assert(0);
+#endif /* SPDK_CONFIG_PMDK */
+}
+
+static inline int
+ftl_addr_invalid(struct ftl_addr addr)
+{
+ return addr.offset == ftl_to_addr(FTL_ADDR_INVALID).offset;
+}
+
+static inline int
+ftl_addr_cached(struct ftl_addr addr)
+{
+ return !ftl_addr_invalid(addr) && addr.cached;
+}
+
+static inline struct ftl_addr
+ftl_addr_to_packed(const struct spdk_ftl_dev *dev, struct ftl_addr addr)
+{
+ struct ftl_addr p = {};
+
+ if (ftl_addr_invalid(addr)) {
+ p = ftl_to_addr_packed(FTL_ADDR_INVALID);
+ } else if (ftl_addr_cached(addr)) {
+ p.pack.cached = 1;
+ p.pack.cache_offset = (uint32_t) addr.cache_offset;
+ } else {
+ p.pack.offset = (uint32_t) addr.offset;
+ }
+
+ return p;
+}
+
+static inline struct ftl_addr
+ftl_addr_from_packed(const struct spdk_ftl_dev *dev, struct ftl_addr p)
+{
+ struct ftl_addr addr = {};
+
+ if (p.pack.offset == (uint32_t)FTL_ADDR_INVALID) {
+ addr = ftl_to_addr(FTL_ADDR_INVALID);
+ } else if (p.pack.cached) {
+ addr.cached = 1;
+ addr.cache_offset = p.pack.cache_offset;
+ } else {
+ addr = p;
+ }
+
+ return addr;
+}
+
+#define _ftl_l2p_set(l2p, off, val, bits) \
+ __atomic_store_n(((uint##bits##_t *)(l2p)) + (off), val, __ATOMIC_SEQ_CST)
+
+#define _ftl_l2p_set32(l2p, off, val) \
+ _ftl_l2p_set(l2p, off, val, 32)
+
+#define _ftl_l2p_set64(l2p, off, val) \
+ _ftl_l2p_set(l2p, off, val, 64)
+
+#define _ftl_l2p_get(l2p, off, bits) \
+ __atomic_load_n(((uint##bits##_t *)(l2p)) + (off), __ATOMIC_SEQ_CST)
+
+#define _ftl_l2p_get32(l2p, off) \
+ _ftl_l2p_get(l2p, off, 32)
+
+#define _ftl_l2p_get64(l2p, off) \
+ _ftl_l2p_get(l2p, off, 64)
+
+#define ftl_addr_cmp(p1, p2) \
+ ((p1).offset == (p2).offset)
+
+static inline void
+ftl_l2p_set(struct spdk_ftl_dev *dev, uint64_t lba, struct ftl_addr addr)
+{
+ assert(dev->num_lbas > lba);
+
+ if (ftl_addr_packed(dev)) {
+ _ftl_l2p_set32(dev->l2p, lba, ftl_addr_to_packed(dev, addr).offset);
+ } else {
+ _ftl_l2p_set64(dev->l2p, lba, addr.offset);
+ }
+
+ if (dev->l2p_pmem_len != 0) {
+ ftl_l2p_lba_persist(dev, lba);
+ }
+}
+
+static inline struct ftl_addr
+ftl_l2p_get(struct spdk_ftl_dev *dev, uint64_t lba)
+{
+ assert(dev->num_lbas > lba);
+
+ if (ftl_addr_packed(dev)) {
+ return ftl_addr_from_packed(dev, ftl_to_addr_packed(
+ _ftl_l2p_get32(dev->l2p, lba)));
+ } else {
+ return ftl_to_addr(_ftl_l2p_get64(dev->l2p, lba));
+ }
+}
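+
+/*
+ * With dev->addr_len below 32 bits the L2P stores packed 32-bit entries (4 bytes per LBA),
+ * otherwise full 64-bit offsets; both variants are accessed with sequentially consistent
+ * atomics so readers on other threads never observe a torn entry, and each update is
+ * additionally persisted when the table is mmapped on persistent memory.
+ */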
+
+static inline bool
+ftl_dev_has_nv_cache(const struct spdk_ftl_dev *dev)
+{
+ return dev->nv_cache.bdev_desc != NULL;
+}
+
+#define FTL_NV_CACHE_HEADER_VERSION (1)
+#define FTL_NV_CACHE_DATA_OFFSET (1)
+#define FTL_NV_CACHE_PHASE_OFFSET (62)
+#define FTL_NV_CACHE_PHASE_COUNT (4)
+#define FTL_NV_CACHE_PHASE_MASK (3ULL << FTL_NV_CACHE_PHASE_OFFSET)
+#define FTL_NV_CACHE_LBA_INVALID (FTL_LBA_INVALID & ~FTL_NV_CACHE_PHASE_MASK)
+
+static inline bool
+ftl_nv_cache_phase_is_valid(unsigned int phase)
+{
+ return phase > 0 && phase <= 3;
+}
+
+static inline unsigned int
+ftl_nv_cache_next_phase(unsigned int current)
+{
+ static const unsigned int phases[] = { 0, 2, 3, 1 };
+ assert(ftl_nv_cache_phase_is_valid(current));
+ return phases[current];
+}
+
+static inline unsigned int
+ftl_nv_cache_prev_phase(unsigned int current)
+{
+ static const unsigned int phases[] = { 0, 3, 1, 2 };
+ assert(ftl_nv_cache_phase_is_valid(current));
+ return phases[current];
+}
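+
+/*
+ * The phase cycles 1 -> 2 -> 3 -> 1 (phase 0 is reserved as invalid), so
+ * ftl_nv_cache_next_phase(3) == 1 and ftl_nv_cache_prev_phase(1) == 3.
+ */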
+
+static inline uint64_t
+ftl_nv_cache_pack_lba(uint64_t lba, unsigned int phase)
+{
+ assert(ftl_nv_cache_phase_is_valid(phase));
+ return (lba & ~FTL_NV_CACHE_PHASE_MASK) | ((uint64_t)phase << FTL_NV_CACHE_PHASE_OFFSET);
+}
+
+static inline void
+ftl_nv_cache_unpack_lba(uint64_t in_lba, uint64_t *out_lba, unsigned int *phase)
+{
+ *out_lba = in_lba & ~FTL_NV_CACHE_PHASE_MASK;
+ *phase = (in_lba & FTL_NV_CACHE_PHASE_MASK) >> FTL_NV_CACHE_PHASE_OFFSET;
+
+ /* If the phase is invalid the block wasn't written yet, so treat the LBA as invalid too */
+ if (!ftl_nv_cache_phase_is_valid(*phase) || *out_lba == FTL_NV_CACHE_LBA_INVALID) {
+ *out_lba = FTL_LBA_INVALID;
+ }
+}
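+
+/*
+ * Example of the packing above: the phase occupies the top two bits of the per-block
+ * metadata LBA (FTL_NV_CACHE_PHASE_OFFSET = 62), so packing lba = 0x1000 with phase = 2
+ * yields 0x1000 | (2ULL << 62) = 0x8000000000001000; unpacking masks the top bits off
+ * to recover the LBA and shifts them down to recover the phase.
+ */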
+
+static inline bool
+ftl_is_append_supported(const struct spdk_ftl_dev *dev)
+{
+ return dev->conf.use_append;
+}
+
+#endif /* FTL_CORE_H */
diff --git a/src/spdk/lib/ftl/ftl_debug.c b/src/spdk/lib/ftl/ftl_debug.c
new file mode 100644
index 000000000..9fbb43810
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_debug.c
@@ -0,0 +1,169 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk_internal/log.h"
+#include "spdk/ftl.h"
+#include "ftl_debug.h"
+#include "ftl_band.h"
+
+#if defined(DEBUG)
+#if defined(FTL_META_DEBUG)
+
+static const char *ftl_band_state_str[] = {
+ "free",
+ "prep",
+ "opening",
+ "open",
+ "full",
+ "closing",
+ "closed",
+ "max"
+};
+
+bool
+ftl_band_validate_md(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_lba_map *lba_map = &band->lba_map;
+ struct ftl_addr addr_md, addr_l2p;
+ size_t i, size, seg_off;
+ bool valid = true;
+
+ size = ftl_get_num_blocks_in_band(dev);
+
+ pthread_spin_lock(&lba_map->lock);
+ for (i = 0; i < size; ++i) {
+ if (!spdk_bit_array_get(lba_map->vld, i)) {
+ continue;
+ }
+
+ seg_off = i / FTL_NUM_LBA_IN_BLOCK;
+ if (lba_map->segments[seg_off] != FTL_LBA_MAP_SEG_CACHED) {
+ continue;
+ }
+
+ addr_md = ftl_band_addr_from_block_offset(band, i);
+ addr_l2p = ftl_l2p_get(dev, lba_map->map[i]);
+
+ if (addr_l2p.cached) {
+ continue;
+ }
+
+ if (addr_l2p.offset != addr_md.offset) {
+ valid = false;
+ break;
+ }
+
+ }
+
+ pthread_spin_unlock(&lba_map->lock);
+
+ return valid;
+}
+
+void
+ftl_dev_dump_bands(struct spdk_ftl_dev *dev)
+{
+ size_t i, total = 0;
+
+ if (!dev->bands) {
+ return;
+ }
+
+ ftl_debug("Bands validity:\n");
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ if (dev->bands[i].state == FTL_BAND_STATE_FREE &&
+ dev->bands[i].wr_cnt == 0) {
+ continue;
+ }
+
+ if (!dev->bands[i].num_zones) {
+ ftl_debug(" Band %3zu: all zones are offline\n", i + 1);
+ continue;
+ }
+
+ total += dev->bands[i].lba_map.num_vld;
+ ftl_debug(" Band %3zu: %8zu / %zu \tnum_zones: %zu \twr_cnt: %"PRIu64"\tmerit:"
+ "%10.3f\tstate: %s\n",
+ i + 1, dev->bands[i].lba_map.num_vld,
+ ftl_band_user_blocks(&dev->bands[i]),
+ dev->bands[i].num_zones,
+ dev->bands[i].wr_cnt,
+ dev->bands[i].merit,
+ ftl_band_state_str[dev->bands[i].state]);
+ }
+}
+
+#endif /* defined(FTL_META_DEBUG) */
+
+#if defined(FTL_DUMP_STATS)
+
+void
+ftl_dev_dump_stats(const struct spdk_ftl_dev *dev)
+{
+ size_t i, total = 0;
+ char uuid[SPDK_UUID_STRING_LEN];
+ double waf;
+ const char *limits[] = {
+ [SPDK_FTL_LIMIT_CRIT] = "crit",
+ [SPDK_FTL_LIMIT_HIGH] = "high",
+ [SPDK_FTL_LIMIT_LOW] = "low",
+ [SPDK_FTL_LIMIT_START] = "start"
+ };
+
+ if (!dev->bands) {
+ return;
+ }
+
+ /* Count the number of valid LBAs */
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ total += dev->bands[i].lba_map.num_vld;
+ }
+
+ waf = (double)dev->stats.write_total / (double)dev->stats.write_user;
+
+ spdk_uuid_fmt_lower(uuid, sizeof(uuid), &dev->uuid);
+ ftl_debug("\n");
+ ftl_debug("device UUID: %s\n", uuid);
+ ftl_debug("total valid LBAs: %zu\n", total);
+ ftl_debug("total writes: %"PRIu64"\n", dev->stats.write_total);
+ ftl_debug("user writes: %"PRIu64"\n", dev->stats.write_user);
+ ftl_debug("WAF: %.4lf\n", waf);
+ ftl_debug("limits:\n");
+ for (i = 0; i < SPDK_FTL_LIMIT_MAX; ++i) {
+ ftl_debug(" %5s: %"PRIu64"\n", limits[i], dev->stats.limits[i]);
+ }
+}
+
+#endif /* defined(FTL_DUMP_STATS) */
+#endif /* defined(DEBUG) */
diff --git a/src/spdk/lib/ftl/ftl_debug.h b/src/spdk/lib/ftl/ftl_debug.h
new file mode 100644
index 000000000..c90c92ef2
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_debug.h
@@ -0,0 +1,73 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FTL_DEBUG_H
+#define FTL_DEBUG_H
+
+#include "ftl_addr.h"
+#include "ftl_band.h"
+#include "ftl_core.h"
+
+#if defined(DEBUG)
+/* Debug flags - enabled when defined */
+#define FTL_META_DEBUG 1
+#define FTL_DUMP_STATS 1
+
+#define ftl_debug(msg, ...) \
+ SPDK_ERRLOG(msg, ## __VA_ARGS__)
+#else
+#define ftl_debug(msg, ...)
+#endif
+
+static inline const char *
+ftl_addr2str(struct ftl_addr addr, char *buf, size_t size)
+{
+ snprintf(buf, size, "(%"PRIu64")", addr.offset);
+ return buf;
+}
+
+#if defined(FTL_META_DEBUG)
+bool ftl_band_validate_md(struct ftl_band *band);
+void ftl_dev_dump_bands(struct spdk_ftl_dev *dev);
+#else
+#define ftl_band_validate_md(band)
+#define ftl_dev_dump_bands(dev)
+#endif
+
+#if defined(FTL_DUMP_STATS)
+void ftl_dev_dump_stats(const struct spdk_ftl_dev *dev);
+#else
+#define ftl_dev_dump_stats(dev)
+#endif
+
+#endif /* FTL_DEBUG_H */
diff --git a/src/spdk/lib/ftl/ftl_init.c b/src/spdk/lib/ftl/ftl_init.c
new file mode 100644
index 000000000..15a8c21c9
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_init.c
@@ -0,0 +1,1688 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/nvme.h"
+#include "spdk/thread.h"
+#include "spdk/string.h"
+#include "spdk/likely.h"
+#include "spdk_internal/log.h"
+#include "spdk/ftl.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/bdev_zone.h"
+#include "spdk/bdev_module.h"
+#include "spdk/config.h"
+
+#include "ftl_core.h"
+#include "ftl_io.h"
+#include "ftl_reloc.h"
+#include "ftl_band.h"
+#include "ftl_debug.h"
+
+#ifdef SPDK_CONFIG_PMDK
+#include "libpmem.h"
+#endif /* SPDK_CONFIG_PMDK */
+
+#define FTL_CORE_RING_SIZE 4096
+#define FTL_INIT_TIMEOUT 30
+#define FTL_NSID 1
+#define FTL_ZONE_INFO_COUNT 64
+
+/* Dummy bdev module used to claim bdevs. */
+static struct spdk_bdev_module g_ftl_bdev_module = {
+ .name = "ftl_lib",
+};
+
+struct ftl_dev_init_ctx {
+ /* Owner */
+ struct spdk_ftl_dev *dev;
+ /* Initial arguments */
+ struct spdk_ftl_dev_init_opts opts;
+ /* IO channel for retrieving zone info */
+ struct spdk_io_channel *ioch;
+ /* Buffer for reading zone info */
+ struct spdk_bdev_zone_info info[FTL_ZONE_INFO_COUNT];
+ /* Currently read zone */
+ size_t zone_id;
+ /* User's callback */
+ spdk_ftl_init_fn cb_fn;
+ /* Callback's argument */
+ void *cb_arg;
+ /* Thread to call the callback on */
+ struct spdk_thread *thread;
+ /* Poller to check if the device has been destroyed/initialized */
+ struct spdk_poller *poller;
+ /* Status to return for halt completion callback */
+ int halt_complete_status;
+};
+
+static STAILQ_HEAD(, spdk_ftl_dev) g_ftl_queue = STAILQ_HEAD_INITIALIZER(g_ftl_queue);
+static pthread_mutex_t g_ftl_queue_lock = PTHREAD_MUTEX_INITIALIZER;
+static const struct spdk_ftl_conf g_default_conf = {
+ .limits = {
+ /* 5 free bands / 0 % host writes */
+ [SPDK_FTL_LIMIT_CRIT] = { .thld = 5, .limit = 0 },
+ /* 10 free bands / 5 % host writes */
+ [SPDK_FTL_LIMIT_HIGH] = { .thld = 10, .limit = 5 },
+ /* 20 free bands / 40 % host writes */
+ [SPDK_FTL_LIMIT_LOW] = { .thld = 20, .limit = 40 },
+ /* 40 free bands / 100 % host writes - defrag starts running */
+ [SPDK_FTL_LIMIT_START] = { .thld = 40, .limit = 100 },
+ },
+ /* 10 percent valid blocks */
+ .invalid_thld = 10,
+ /* 20% spare blocks */
+ .lba_rsvd = 20,
+ /* 6MB write buffer per IO channel */
+ .write_buffer_size = 6 * 1024 * 1024,
+ /* 90% band fill threshold */
+ .band_thld = 90,
+ /* Max 32 IO depth per band relocate */
+ .max_reloc_qdepth = 32,
+ /* Max 3 active band relocates */
+ .max_active_relocs = 3,
+ /* IO pool size per user thread (this should be adjusted to thread IO qdepth) */
+ .user_io_pool_size = 2048,
+ /*
+ * If clear, FTL will return an error when restoring after a dirty shutdown.
+ * If set, the last band will be padded and FTL will restore based only on closed bands -
+ * this will result in lost data after recovery.
+ */
+ .allow_open_bands = false,
+ .max_io_channels = 128,
+ .nv_cache = {
+ /* Maximum number of concurrent requests */
+ .max_request_cnt = 2048,
+ /* Maximum number of blocks per request */
+ .max_request_size = 16,
+ }
+};
+
+static int
+ftl_band_init_md(struct ftl_band *band)
+{
+ struct ftl_lba_map *lba_map = &band->lba_map;
+ int rc;
+
+ lba_map->vld = spdk_bit_array_create(ftl_get_num_blocks_in_band(band->dev));
+ if (!lba_map->vld) {
+ return -ENOMEM;
+ }
+
+ rc = pthread_spin_init(&lba_map->lock, PTHREAD_PROCESS_PRIVATE);
+ if (rc) {
+ spdk_bit_array_free(&lba_map->vld);
+ return rc;
+ }
+ ftl_band_md_clear(band);
+ return 0;
+}
+
+static int
+ftl_check_conf(const struct spdk_ftl_dev *dev, const struct spdk_ftl_conf *conf)
+{
+ size_t i;
+
+ if (conf->invalid_thld >= 100) {
+ return -1;
+ }
+ if (conf->lba_rsvd >= 100) {
+ return -1;
+ }
+ if (conf->lba_rsvd == 0) {
+ return -1;
+ }
+ if (conf->write_buffer_size == 0) {
+ return -1;
+ }
+ if (conf->write_buffer_size % FTL_BLOCK_SIZE != 0) {
+ return -1;
+ }
+
+ for (i = 0; i < SPDK_FTL_LIMIT_MAX; ++i) {
+ if (conf->limits[i].limit > 100) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+ftl_dev_init_bands(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band, *pband;
+ unsigned int i;
+ int rc = 0;
+
+ LIST_INIT(&dev->free_bands);
+ LIST_INIT(&dev->shut_bands);
+
+ dev->num_free = 0;
+ dev->bands = calloc(ftl_get_num_bands(dev), sizeof(*dev->bands));
+ if (!dev->bands) {
+ return -1;
+ }
+
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ band = &dev->bands[i];
+ band->id = i;
+ band->dev = dev;
+ band->state = FTL_BAND_STATE_CLOSED;
+
+ if (LIST_EMPTY(&dev->shut_bands)) {
+ LIST_INSERT_HEAD(&dev->shut_bands, band, list_entry);
+ } else {
+ LIST_INSERT_AFTER(pband, band, list_entry);
+ }
+ pband = band;
+
+ CIRCLEQ_INIT(&band->zones);
+ band->zone_buf = calloc(ftl_get_num_punits(dev), sizeof(*band->zone_buf));
+ if (!band->zone_buf) {
+ SPDK_ERRLOG("Failed to allocate block state table for band: [%u]\n", i);
+ rc = -1;
+ break;
+ }
+
+ rc = ftl_band_init_md(band);
+ if (rc) {
+ SPDK_ERRLOG("Failed to initialize metadata structures for band [%u]\n", i);
+ break;
+ }
+
+ band->reloc_bitmap = spdk_bit_array_create(ftl_get_num_bands(dev));
+ if (!band->reloc_bitmap) {
+ SPDK_ERRLOG("Failed to allocate band relocation bitmap\n");
+ break;
+ }
+ }
+
+ return rc;
+}
+
+static void
+ftl_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
+{
+ struct spdk_ftl_dev *dev = event_ctx;
+
+ switch (type) {
+ case SPDK_BDEV_EVENT_REMOVE:
+ assert(0);
+ break;
+ case SPDK_BDEV_EVENT_MEDIA_MANAGEMENT:
+ assert(bdev == spdk_bdev_desc_get_bdev(dev->base_bdev_desc));
+ ftl_get_media_events(dev);
+ default:
+ break;
+ }
+}
+
+static int
+ftl_dev_init_nv_cache(struct spdk_ftl_dev *dev, const char *bdev_name)
+{
+ struct spdk_bdev *bdev;
+ struct spdk_ftl_conf *conf = &dev->conf;
+ struct ftl_nv_cache *nv_cache = &dev->nv_cache;
+ char pool_name[128];
+ int rc;
+
+ if (!bdev_name) {
+ return 0;
+ }
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (!bdev) {
+ SPDK_ERRLOG("Unable to find bdev: %s\n", bdev_name);
+ return -1;
+ }
+
+ if (spdk_bdev_open_ext(bdev_name, true, ftl_bdev_event_cb,
+ dev, &nv_cache->bdev_desc)) {
+ SPDK_ERRLOG("Unable to open bdev: %s\n", bdev_name);
+ return -1;
+ }
+
+ if (spdk_bdev_module_claim_bdev(bdev, nv_cache->bdev_desc, &g_ftl_bdev_module)) {
+ spdk_bdev_close(nv_cache->bdev_desc);
+ nv_cache->bdev_desc = NULL;
+ SPDK_ERRLOG("Unable to claim bdev %s\n", bdev_name);
+ return -1;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_FTL_INIT, "Using %s as write buffer cache\n",
+ spdk_bdev_get_name(bdev));
+
+ if (spdk_bdev_get_block_size(bdev) != FTL_BLOCK_SIZE) {
+ SPDK_ERRLOG("Unsupported block size (%d)\n", spdk_bdev_get_block_size(bdev));
+ return -1;
+ }
+
+ if (!spdk_bdev_is_md_separate(bdev)) {
+ SPDK_ERRLOG("Bdev %s doesn't support separate metadata buffer IO\n",
+ spdk_bdev_get_name(bdev));
+ return -1;
+ }
+
+ if (spdk_bdev_get_md_size(bdev) < sizeof(uint64_t)) {
+ SPDK_ERRLOG("Bdev's %s metadata is too small (%"PRIu32")\n",
+ spdk_bdev_get_name(bdev), spdk_bdev_get_md_size(bdev));
+ return -1;
+ }
+
+ if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) {
+ SPDK_ERRLOG("Unsupported DIF type used by bdev %s\n",
+ spdk_bdev_get_name(bdev));
+ return -1;
+ }
+
+ /* The cache needs to be capable of storing at least two full bands. This requirement comes
+ * from the fact that cache works as a protection against power loss, so before the data
+ * inside the cache can be overwritten, the band it's stored on has to be closed. Plus one
+ * extra block is needed to store the header.
+ */
+ if (spdk_bdev_get_num_blocks(bdev) < ftl_get_num_blocks_in_band(dev) * 2 + 1) {
+ SPDK_ERRLOG("Insufficient number of blocks for write buffer cache (available: %"
+ PRIu64", required: %"PRIu64")\n", spdk_bdev_get_num_blocks(bdev),
+ ftl_get_num_blocks_in_band(dev) * 2 + 1);
+ return -1;
+ }
+
+ rc = snprintf(pool_name, sizeof(pool_name), "ftl-nvpool-%p", dev);
+ if (rc < 0 || rc >= 128) {
+ return -1;
+ }
+
+ nv_cache->md_pool = spdk_mempool_create(pool_name, conf->nv_cache.max_request_cnt,
+ spdk_bdev_get_md_size(bdev) *
+ conf->nv_cache.max_request_size,
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (!nv_cache->md_pool) {
+ SPDK_ERRLOG("Failed to initialize non-volatile cache metadata pool\n");
+ return -1;
+ }
+
+ nv_cache->dma_buf = spdk_dma_zmalloc(FTL_BLOCK_SIZE, spdk_bdev_get_buf_align(bdev), NULL);
+ if (!nv_cache->dma_buf) {
+ SPDK_ERRLOG("Memory allocation failure\n");
+ return -1;
+ }
+
+ if (pthread_spin_init(&nv_cache->lock, PTHREAD_PROCESS_PRIVATE)) {
+ SPDK_ERRLOG("Failed to initialize cache lock\n");
+ return -1;
+ }
+
+ nv_cache->current_addr = FTL_NV_CACHE_DATA_OFFSET;
+ nv_cache->num_data_blocks = spdk_bdev_get_num_blocks(bdev) - 1;
+ nv_cache->num_available = nv_cache->num_data_blocks;
+ nv_cache->ready = false;
+
+ return 0;
+}
+
+void
+spdk_ftl_conf_init_defaults(struct spdk_ftl_conf *conf)
+{
+ *conf = g_default_conf;
+}
+
+static void
+ftl_lba_map_request_ctor(struct spdk_mempool *mp, void *opaque, void *obj, unsigned obj_idx)
+{
+ struct ftl_lba_map_request *request = obj;
+ struct spdk_ftl_dev *dev = opaque;
+
+ request->segments = spdk_bit_array_create(spdk_divide_round_up(
+ ftl_get_num_blocks_in_band(dev), FTL_NUM_LBA_IN_BLOCK));
+}
+
+static int
+ftl_init_media_events_pool(struct spdk_ftl_dev *dev)
+{
+ char pool_name[128];
+ int rc;
+
+ rc = snprintf(pool_name, sizeof(pool_name), "ftl-media-%p", dev);
+ if (rc < 0 || rc >= (int)sizeof(pool_name)) {
+ SPDK_ERRLOG("Failed to create media pool name\n");
+ return -1;
+ }
+
+ dev->media_events_pool = spdk_mempool_create(pool_name, 1024,
+ sizeof(struct ftl_media_event),
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (!dev->media_events_pool) {
+ SPDK_ERRLOG("Failed to create media events pool\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+ftl_init_lba_map_pools(struct spdk_ftl_dev *dev)
+{
+#define POOL_NAME_LEN 128
+ char pool_name[POOL_NAME_LEN];
+ int rc;
+
+ rc = snprintf(pool_name, sizeof(pool_name), "%s-%s", dev->name, "ftl-lba-pool");
+ if (rc < 0 || rc >= POOL_NAME_LEN) {
+ return -ENAMETOOLONG;
+ }
+
+ /* We need to reserve at least 2 buffers for band close / open sequence
+ * alone, plus additional (8) buffers for handling write errors.
+ * TODO: This memory pool is utilized only by the core thread - it introduces
+ * unnecessary overhead and should be replaced by a different data structure.
+ */
+ dev->lba_pool = spdk_mempool_create(pool_name, 2 + 8,
+ ftl_lba_map_pool_elem_size(dev),
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (!dev->lba_pool) {
+ return -ENOMEM;
+ }
+
+ rc = snprintf(pool_name, sizeof(pool_name), "%s-%s", dev->name, "ftl-lbareq-pool");
+ if (rc < 0 || rc >= POOL_NAME_LEN) {
+ return -ENAMETOOLONG;
+ }
+
+ dev->lba_request_pool = spdk_mempool_create_ctor(pool_name,
+ dev->conf.max_reloc_qdepth * dev->conf.max_active_relocs,
+ sizeof(struct ftl_lba_map_request),
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY,
+ ftl_lba_map_request_ctor,
+ dev);
+ if (!dev->lba_request_pool) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void
+ftl_init_wptr_list(struct spdk_ftl_dev *dev)
+{
+ LIST_INIT(&dev->wptr_list);
+ LIST_INIT(&dev->flush_list);
+ LIST_INIT(&dev->band_flush_list);
+}
+
+static size_t
+ftl_dev_band_max_seq(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band;
+ size_t seq = 0;
+
+ LIST_FOREACH(band, &dev->shut_bands, list_entry) {
+ if (band->seq > seq) {
+ seq = band->seq;
+ }
+ }
+
+ return seq;
+}
+
+static void
+_ftl_init_bands_state(void *ctx)
+{
+ struct ftl_band *band, *temp_band;
+ struct spdk_ftl_dev *dev = ctx;
+
+ dev->seq = ftl_dev_band_max_seq(dev);
+
+ LIST_FOREACH_SAFE(band, &dev->shut_bands, list_entry, temp_band) {
+ if (!band->lba_map.num_vld) {
+ ftl_band_set_state(band, FTL_BAND_STATE_FREE);
+ }
+ }
+
+ ftl_reloc_resume(dev->reloc);
+ /* Clear the limit applications as they're incremented incorrectly by */
+ /* the initialization code */
+ memset(dev->stats.limits, 0, sizeof(dev->stats.limits));
+}
+
+static int
+ftl_init_num_free_bands(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band;
+ int cnt = 0;
+
+ LIST_FOREACH(band, &dev->shut_bands, list_entry) {
+ if (band->num_zones && !band->lba_map.num_vld) {
+ cnt++;
+ }
+ }
+ return cnt;
+}
+
+static int
+ftl_init_bands_state(struct spdk_ftl_dev *dev)
+{
+ /* TODO: Should we abort initialization or expose a read-only device */
+ /* if there are no free bands? */
+ /* If we abort initialization, should the condition be having no free */
+ /* bands at all, or falling below some minimal number of free bands? */
+ if (!ftl_init_num_free_bands(dev)) {
+ return -1;
+ }
+
+ spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_init_bands_state, dev);
+ return 0;
+}
+
+static void
+_ftl_dev_init_core_thread(void *ctx)
+{
+ struct spdk_ftl_dev *dev = ctx;
+
+ dev->core_poller = SPDK_POLLER_REGISTER(ftl_task_core, dev, 0);
+ if (!dev->core_poller) {
+ SPDK_ERRLOG("Unable to register core poller\n");
+ assert(0);
+ }
+
+ dev->ioch = spdk_get_io_channel(dev);
+}
+
+static int
+ftl_dev_init_core_thread(struct spdk_ftl_dev *dev, const struct spdk_ftl_dev_init_opts *opts)
+{
+ if (!opts->core_thread) {
+ return -1;
+ }
+
+ dev->core_thread = opts->core_thread;
+
+ spdk_thread_send_msg(opts->core_thread, _ftl_dev_init_core_thread, dev);
+ return 0;
+}
+
+static int
+ftl_dev_l2p_alloc_pmem(struct spdk_ftl_dev *dev, size_t l2p_size, const char *l2p_path)
+{
+#ifdef SPDK_CONFIG_PMDK
+ int is_pmem;
+
+ if ((dev->l2p = pmem_map_file(l2p_path, 0,
+ 0, 0, &dev->l2p_pmem_len, &is_pmem)) == NULL) {
+ SPDK_ERRLOG("Failed to mmap l2p_path\n");
+ return -1;
+ }
+
+ if (!is_pmem) {
+ SPDK_NOTICELOG("l2p_path mapped on non-pmem device\n");
+ }
+
+ if (dev->l2p_pmem_len < l2p_size) {
+ SPDK_ERRLOG("l2p_path file is too small\n");
+ return -1;
+ }
+
+ pmem_memset_persist(dev->l2p, FTL_ADDR_INVALID, l2p_size);
+
+ return 0;
+#else /* SPDK_CONFIG_PMDK */
+ SPDK_ERRLOG("Libpmem not available, cannot use pmem l2p_path\n");
+ return -1;
+#endif /* SPDK_CONFIG_PMDK */
+}
+
+static int
+ftl_dev_l2p_alloc_dram(struct spdk_ftl_dev *dev, size_t l2p_size)
+{
+ dev->l2p = malloc(l2p_size);
+ if (!dev->l2p) {
+ SPDK_ERRLOG("Failed to allocate l2p table\n");
+ return -1;
+ }
+
+ memset(dev->l2p, FTL_ADDR_INVALID, l2p_size);
+
+ return 0;
+}
+
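+/*
+ * Allocate the logical-to-physical (L2P) table: one entry per LBA, 4 bytes
+ * wide when addresses fit in 32 bits and 8 bytes otherwise. The table is
+ * backed by a pmem file when l2p_path is configured, or by regular DRAM.
+ */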
+static int
+ftl_dev_l2p_alloc(struct spdk_ftl_dev *dev)
+{
+ size_t addr_size = dev->addr_len >= 32 ? 8 : 4;
+ size_t l2p_size = dev->num_lbas * addr_size;
+ const char *l2p_path = dev->conf.l2p_path;
+
+ if (dev->num_lbas == 0) {
+ SPDK_ERRLOG("Invalid l2p table size\n");
+ return -1;
+ }
+
+ if (dev->l2p) {
+ SPDK_ERRLOG("L2p table already allocated\n");
+ return -1;
+ }
+
+ dev->l2p_pmem_len = 0;
+ if (l2p_path) {
+ return ftl_dev_l2p_alloc_pmem(dev, l2p_size, l2p_path);
+ } else {
+ return ftl_dev_l2p_alloc_dram(dev, l2p_size);
+ }
+}
+
+static void
+ftl_dev_free_init_ctx(struct ftl_dev_init_ctx *init_ctx)
+{
+ if (!init_ctx) {
+ return;
+ }
+
+ if (init_ctx->ioch) {
+ spdk_put_io_channel(init_ctx->ioch);
+ }
+
+ free(init_ctx);
+}
+
+static void
+ftl_call_init_complete_cb(void *ctx)
+{
+ struct ftl_dev_init_ctx *init_ctx = ctx;
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+
+ if (init_ctx->cb_fn != NULL) {
+ init_ctx->cb_fn(dev, init_ctx->cb_arg, 0);
+ }
+
+ ftl_dev_free_init_ctx(init_ctx);
+}
+
+static void
+ftl_init_complete(struct ftl_dev_init_ctx *init_ctx)
+{
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+
+ pthread_mutex_lock(&g_ftl_queue_lock);
+ STAILQ_INSERT_HEAD(&g_ftl_queue, dev, stailq);
+ pthread_mutex_unlock(&g_ftl_queue_lock);
+
+ dev->initialized = 1;
+
+ spdk_thread_send_msg(init_ctx->thread, ftl_call_init_complete_cb, init_ctx);
+}
+
+static void
+ftl_init_fail_cb(struct spdk_ftl_dev *dev, void *ctx, int status)
+{
+ struct ftl_dev_init_ctx *init_ctx = ctx;
+
+ if (init_ctx->cb_fn != NULL) {
+ init_ctx->cb_fn(NULL, init_ctx->cb_arg, -ENODEV);
+ }
+
+ ftl_dev_free_init_ctx(init_ctx);
+}
+
+static int ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg,
+ struct spdk_thread *thread);
+
+static void
+ftl_init_fail(struct ftl_dev_init_ctx *init_ctx)
+{
+ if (ftl_dev_free(init_ctx->dev, ftl_init_fail_cb, init_ctx, init_ctx->thread)) {
+ SPDK_ERRLOG("Unable to free the device\n");
+ assert(0);
+ }
+}
+
+static void
+ftl_write_nv_cache_md_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_dev_init_ctx *init_ctx = cb_arg;
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+
+ spdk_bdev_free_io(bdev_io);
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Writing non-volatile cache's metadata header failed\n");
+ ftl_init_fail(init_ctx);
+ return;
+ }
+
+ dev->nv_cache.ready = true;
+ ftl_init_complete(init_ctx);
+}
+
+static void
+ftl_clear_nv_cache_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_dev_init_ctx *init_ctx = cb_arg;
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+ struct ftl_nv_cache *nv_cache = &dev->nv_cache;
+
+ spdk_bdev_free_io(bdev_io);
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Unable to clear the non-volatile cache bdev\n");
+ ftl_init_fail(init_ctx);
+ return;
+ }
+
+ nv_cache->phase = 1;
+ if (ftl_nv_cache_write_header(nv_cache, false, ftl_write_nv_cache_md_cb, init_ctx)) {
+ SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n");
+ ftl_init_fail(init_ctx);
+ }
+}
+
+static void
+_ftl_nv_cache_scrub(void *ctx)
+{
+ struct ftl_dev_init_ctx *init_ctx = ctx;
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+ int rc;
+
+ rc = ftl_nv_cache_scrub(&dev->nv_cache, ftl_clear_nv_cache_cb, init_ctx);
+
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to clear the non-volatile cache bdev: %s\n",
+ spdk_strerror(-rc));
+ ftl_init_fail(init_ctx);
+ }
+}
+
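+/*
+ * Fresh-device path (SPDK_FTL_MODE_CREATE): generate a new UUID, size the
+ * logical address space to the bands' usable capacity minus the lba_rsvd
+ * percentage held back from the user, allocate the L2P and, if a non-volatile
+ * cache is attached, scrub it before declaring the device ready.
+ */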
+static int
+ftl_setup_initial_state(struct ftl_dev_init_ctx *init_ctx)
+{
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+ struct spdk_ftl_conf *conf = &dev->conf;
+ size_t i;
+
+ spdk_uuid_generate(&dev->uuid);
+
+ dev->num_lbas = 0;
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ dev->num_lbas += ftl_band_num_usable_blocks(&dev->bands[i]);
+ }
+
+ dev->num_lbas = (dev->num_lbas * (100 - conf->lba_rsvd)) / 100;
+
+ if (ftl_dev_l2p_alloc(dev)) {
+ SPDK_ERRLOG("Unable to init l2p table\n");
+ return -1;
+ }
+
+ if (ftl_init_bands_state(dev)) {
+ SPDK_ERRLOG("Unable to finish the initialization\n");
+ return -1;
+ }
+
+ if (!ftl_dev_has_nv_cache(dev)) {
+ ftl_init_complete(init_ctx);
+ } else {
+ spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_nv_cache_scrub, init_ctx);
+ }
+
+ return 0;
+}
+
+static void
+ftl_restore_nv_cache_cb(struct ftl_restore *restore, int status, void *cb_arg)
+{
+ struct ftl_dev_init_ctx *init_ctx = cb_arg;
+
+ if (spdk_unlikely(status != 0)) {
+ SPDK_ERRLOG("Failed to restore the non-volatile cache state\n");
+ ftl_init_fail(init_ctx);
+ return;
+ }
+
+ ftl_init_complete(init_ctx);
+}
+
+static void
+ftl_restore_device_cb(struct ftl_restore *restore, int status, void *cb_arg)
+{
+ struct ftl_dev_init_ctx *init_ctx = cb_arg;
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+
+ if (status) {
+ SPDK_ERRLOG("Failed to restore the device from the SSD\n");
+ ftl_init_fail(init_ctx);
+ return;
+ }
+
+ if (ftl_init_bands_state(dev)) {
+ SPDK_ERRLOG("Unable to finish the initialization\n");
+ ftl_init_fail(init_ctx);
+ return;
+ }
+
+ if (!ftl_dev_has_nv_cache(dev)) {
+ ftl_init_complete(init_ctx);
+ return;
+ }
+
+ ftl_restore_nv_cache(restore, ftl_restore_nv_cache_cb, init_ctx);
+}
+
+static void
+ftl_restore_md_cb(struct ftl_restore *restore, int status, void *cb_arg)
+{
+ struct ftl_dev_init_ctx *init_ctx = cb_arg;
+
+ if (status) {
+ SPDK_ERRLOG("Failed to restore the metadata from the SSD\n");
+ goto error;
+ }
+
+ /* After the metadata is read it should be possible to allocate the L2P */
+ if (ftl_dev_l2p_alloc(init_ctx->dev)) {
+ SPDK_ERRLOG("Failed to allocate the L2P\n");
+ goto error;
+ }
+
+ if (ftl_restore_device(restore, ftl_restore_device_cb, init_ctx)) {
+ SPDK_ERRLOG("Failed to start device restoration from the SSD\n");
+ goto error;
+ }
+
+ return;
+error:
+ ftl_init_fail(init_ctx);
+}
+
+static int
+ftl_restore_state(struct ftl_dev_init_ctx *init_ctx)
+{
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+
+ dev->uuid = init_ctx->opts.uuid;
+
+ if (ftl_restore_md(dev, ftl_restore_md_cb, init_ctx)) {
+ SPDK_ERRLOG("Failed to start metadata restoration from the SSD\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+ftl_dev_update_bands(struct spdk_ftl_dev *dev)
+{
+ struct ftl_band *band, *temp_band;
+ size_t i;
+
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ band = &dev->bands[i];
+ band->tail_md_addr = ftl_band_tail_md_addr(band);
+ }
+
+ /* Remove band from shut_bands list to prevent further processing */
+ /* if all blocks on this band are bad */
+ LIST_FOREACH_SAFE(band, &dev->shut_bands, list_entry, temp_band) {
+ if (!band->num_zones) {
+ dev->num_bands--;
+ LIST_REMOVE(band, list_entry);
+ }
+ }
+}
+
+static void
+ftl_dev_init_state(struct ftl_dev_init_ctx *init_ctx)
+{
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+
+ ftl_dev_update_bands(dev);
+
+ if (ftl_dev_init_core_thread(dev, &init_ctx->opts)) {
+ SPDK_ERRLOG("Unable to initialize device thread\n");
+ ftl_init_fail(init_ctx);
+ return;
+ }
+
+ if (init_ctx->opts.mode & SPDK_FTL_MODE_CREATE) {
+ if (ftl_setup_initial_state(init_ctx)) {
+ SPDK_ERRLOG("Failed to setup initial state of the device\n");
+ ftl_init_fail(init_ctx);
+ return;
+ }
+ } else {
+ if (ftl_restore_state(init_ctx)) {
+ SPDK_ERRLOG("Unable to restore device's state from the SSD\n");
+ ftl_init_fail(init_ctx);
+ return;
+ }
+ }
+}
+
+static void ftl_dev_get_zone_info(struct ftl_dev_init_ctx *init_ctx);
+
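+/*
+ * Completion callback for a batch of zone reports (up to FTL_ZONE_INFO_COUNT
+ * at a time). Each reported zone is slotted into its band's zone buffer; zones
+ * whose capacity differs from the zone size are marked offline, and only
+ * non-offline zones are counted and linked into the band's zone list.
+ */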
+static void
+ftl_dev_get_zone_info_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_dev_init_ctx *init_ctx = cb_arg;
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+ struct ftl_band *band;
+ struct ftl_zone *zone;
+ struct ftl_addr addr;
+ size_t i, zones_left, num_zones;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Unable to read zone info for zone id: %"PRIu64"\n", init_ctx->zone_id);
+ ftl_init_fail(init_ctx);
+ return;
+ }
+
+ zones_left = ftl_get_num_zones(dev) - (init_ctx->zone_id / ftl_get_num_blocks_in_zone(dev));
+ num_zones = spdk_min(zones_left, FTL_ZONE_INFO_COUNT);
+
+ for (i = 0; i < num_zones; ++i) {
+ addr.offset = init_ctx->info[i].zone_id;
+ band = &dev->bands[ftl_addr_get_band(dev, addr)];
+ zone = &band->zone_buf[ftl_addr_get_punit(dev, addr)];
+ zone->info = init_ctx->info[i];
+
+ /* TODO: add support for zone capacity less than zone size */
+ if (zone->info.capacity != ftl_get_num_blocks_in_zone(dev)) {
+ zone->info.state = SPDK_BDEV_ZONE_STATE_OFFLINE;
+ SPDK_ERRLOG("Zone capacity is not equal to zone size for "
+ "zone id: %"PRIu64"\n", init_ctx->zone_id);
+ }
+
+ /* Set the write pointer to the last block plus one for zones in the full state */
+ if (zone->info.state == SPDK_BDEV_ZONE_STATE_FULL) {
+ zone->info.write_pointer = zone->info.zone_id + zone->info.capacity;
+ }
+
+ if (zone->info.state != SPDK_BDEV_ZONE_STATE_OFFLINE) {
+ band->num_zones++;
+ CIRCLEQ_INSERT_TAIL(&band->zones, zone, circleq);
+ }
+ }
+
+ init_ctx->zone_id = init_ctx->zone_id + num_zones * ftl_get_num_blocks_in_zone(dev);
+
+ ftl_dev_get_zone_info(init_ctx);
+}
+
+static void
+ftl_dev_get_zone_info(struct ftl_dev_init_ctx *init_ctx)
+{
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+ size_t zones_left, num_zones;
+ int rc;
+
+ zones_left = ftl_get_num_zones(dev) - (init_ctx->zone_id / ftl_get_num_blocks_in_zone(dev));
+ if (zones_left == 0) {
+ ftl_dev_init_state(init_ctx);
+ return;
+ }
+
+ num_zones = spdk_min(zones_left, FTL_ZONE_INFO_COUNT);
+
+ rc = spdk_bdev_get_zone_info(dev->base_bdev_desc, init_ctx->ioch,
+ init_ctx->zone_id, num_zones, init_ctx->info,
+ ftl_dev_get_zone_info_cb, init_ctx);
+
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to read zone info for zone id: %"PRIu64"\n", init_ctx->zone_id);
+ ftl_init_fail(init_ctx);
+ }
+}
+
+static int
+ftl_dev_init_zones(struct ftl_dev_init_ctx *init_ctx)
+{
+ struct spdk_ftl_dev *dev = init_ctx->dev;
+
+ init_ctx->zone_id = 0;
+ init_ctx->ioch = spdk_bdev_get_io_channel(dev->base_bdev_desc);
+ if (!init_ctx->ioch) {
+ SPDK_ERRLOG("Failed to get base bdev IO channel\n");
+ return -1;
+ }
+
+ ftl_dev_get_zone_info(init_ctx);
+
+ return 0;
+}
+
+struct _ftl_io_channel {
+ struct ftl_io_channel *ioch;
+};
+
+struct ftl_io_channel *
+ftl_io_channel_get_ctx(struct spdk_io_channel *ioch)
+{
+ struct _ftl_io_channel *_ioch = spdk_io_channel_get_ctx(ioch);
+
+ return _ioch->ioch;
+}
+
+static void
+ftl_io_channel_register(void *ctx)
+{
+ struct ftl_io_channel *ioch = ctx;
+ struct spdk_ftl_dev *dev = ioch->dev;
+ uint32_t ioch_index;
+
+ for (ioch_index = 0; ioch_index < dev->conf.max_io_channels; ++ioch_index) {
+ if (dev->ioch_array[ioch_index] == NULL) {
+ dev->ioch_array[ioch_index] = ioch;
+ ioch->index = ioch_index;
+ break;
+ }
+ }
+
+ assert(ioch_index < dev->conf.max_io_channels);
+ TAILQ_INSERT_TAIL(&dev->ioch_queue, ioch, tailq);
+}
+
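+/*
+ * Set up the per-channel write buffer: one entry and one block of DMA-able
+ * payload for every FTL_BLOCK_SIZE bytes of the configured write_buffer_size,
+ * plus single-producer/single-consumer rings tracking free and submitted
+ * entries.
+ */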
+static int
+ftl_io_channel_init_wbuf(struct ftl_io_channel *ioch)
+{
+ struct spdk_ftl_dev *dev = ioch->dev;
+ struct ftl_wbuf_entry *entry;
+ uint32_t i;
+ int rc;
+
+ ioch->num_entries = dev->conf.write_buffer_size / FTL_BLOCK_SIZE;
+ ioch->wbuf_entries = calloc(ioch->num_entries, sizeof(*ioch->wbuf_entries));
+ if (ioch->wbuf_entries == NULL) {
+ SPDK_ERRLOG("Failed to allocate write buffer entry array\n");
+ return -1;
+ }
+
+ ioch->qdepth_limit = ioch->num_entries;
+ ioch->wbuf_payload = spdk_zmalloc(dev->conf.write_buffer_size, FTL_BLOCK_SIZE, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (ioch->wbuf_payload == NULL) {
+ SPDK_ERRLOG("Failed to allocate write buffer payload\n");
+ goto error_entries;
+ }
+
+ ioch->free_queue = spdk_ring_create(SPDK_RING_TYPE_SP_SC,
+ spdk_align32pow2(ioch->num_entries + 1),
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (ioch->free_queue == NULL) {
+ SPDK_ERRLOG("Failed to allocate free queue\n");
+ goto error_payload;
+ }
+
+ ioch->submit_queue = spdk_ring_create(SPDK_RING_TYPE_SP_SC,
+ spdk_align32pow2(ioch->num_entries + 1),
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (ioch->submit_queue == NULL) {
+ SPDK_ERRLOG("Failed to allocate submit queue\n");
+ goto error_free_queue;
+ }
+
+ for (i = 0; i < ioch->num_entries; ++i) {
+ entry = &ioch->wbuf_entries[i];
+ entry->payload = (char *)ioch->wbuf_payload + i * FTL_BLOCK_SIZE;
+ entry->ioch = ioch;
+ entry->index = i;
+ entry->addr.offset = FTL_ADDR_INVALID;
+
+ rc = pthread_spin_init(&entry->lock, PTHREAD_PROCESS_PRIVATE);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to initialize spinlock\n");
+ goto error_spinlock;
+ }
+
+ spdk_ring_enqueue(ioch->free_queue, (void **)&entry, 1, NULL);
+ }
+
+ return 0;
+error_spinlock:
+ for (; i > 0; --i) {
+ pthread_spin_destroy(&ioch->wbuf_entries[i - 1].lock);
+ }
+
+ spdk_ring_free(ioch->submit_queue);
+error_free_queue:
+ spdk_ring_free(ioch->free_queue);
+error_payload:
+ spdk_free(ioch->wbuf_payload);
+error_entries:
+ free(ioch->wbuf_entries);
+
+ return -1;
+}
+
+static int
+ftl_io_channel_create_cb(void *io_device, void *ctx)
+{
+ struct spdk_ftl_dev *dev = io_device;
+ struct _ftl_io_channel *_ioch = ctx;
+ struct ftl_io_channel *ioch;
+ uint32_t num_io_channels;
+ char mempool_name[32];
+ int rc;
+
+ num_io_channels = __atomic_fetch_add(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST);
+ if (num_io_channels >= dev->conf.max_io_channels) {
+ SPDK_ERRLOG("Reached maximum number of IO channels\n");
+ __atomic_fetch_sub(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST);
+ return -1;
+ }
+
+ ioch = calloc(1, sizeof(*ioch));
+ if (ioch == NULL) {
+ SPDK_ERRLOG("Failed to allocate IO channel\n");
+ return -1;
+ }
+
+ rc = snprintf(mempool_name, sizeof(mempool_name), "ftl_io_%p", ioch);
+ if (rc < 0 || rc >= (int)sizeof(mempool_name)) {
+ SPDK_ERRLOG("Failed to create IO channel pool name\n");
+ free(ioch);
+ return -1;
+ }
+
+ ioch->cache_ioch = NULL;
+ ioch->index = FTL_IO_CHANNEL_INDEX_INVALID;
+ ioch->dev = dev;
+ ioch->elem_size = sizeof(struct ftl_md_io);
+ ioch->io_pool = spdk_mempool_create(mempool_name,
+ dev->conf.user_io_pool_size,
+ ioch->elem_size,
+ 0,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (!ioch->io_pool) {
+ SPDK_ERRLOG("Failed to create IO channel's IO pool\n");
+ free(ioch);
+ return -1;
+ }
+
+ ioch->base_ioch = spdk_bdev_get_io_channel(dev->base_bdev_desc);
+ if (!ioch->base_ioch) {
+ SPDK_ERRLOG("Failed to create base bdev IO channel\n");
+ goto fail_ioch;
+ }
+
+ if (ftl_dev_has_nv_cache(dev)) {
+ ioch->cache_ioch = spdk_bdev_get_io_channel(dev->nv_cache.bdev_desc);
+ if (!ioch->cache_ioch) {
+ SPDK_ERRLOG("Failed to create cache IO channel\n");
+ goto fail_cache;
+ }
+ }
+
+ TAILQ_INIT(&ioch->write_cmpl_queue);
+ TAILQ_INIT(&ioch->retry_queue);
+ ioch->poller = SPDK_POLLER_REGISTER(ftl_io_channel_poll, ioch, 0);
+ if (!ioch->poller) {
+ SPDK_ERRLOG("Failed to register IO channel poller\n");
+ goto fail_poller;
+ }
+
+ if (ftl_io_channel_init_wbuf(ioch)) {
+ SPDK_ERRLOG("Failed to initialize IO channel's write buffer\n");
+ goto fail_wbuf;
+ }
+
+ _ioch->ioch = ioch;
+
+ spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_io_channel_register, ioch);
+
+ return 0;
+fail_wbuf:
+ spdk_poller_unregister(&ioch->poller);
+fail_poller:
+ if (ioch->cache_ioch) {
+ spdk_put_io_channel(ioch->cache_ioch);
+ }
+fail_cache:
+ spdk_put_io_channel(ioch->base_ioch);
+fail_ioch:
+ spdk_mempool_free(ioch->io_pool);
+ free(ioch);
+
+ return -1;
+}
+
+static void
+ftl_io_channel_unregister(void *ctx)
+{
+ struct ftl_io_channel *ioch = ctx;
+ struct spdk_ftl_dev *dev = ioch->dev;
+ uint32_t i, num_io_channels __attribute__((unused));
+
+ assert(ioch->index < dev->conf.max_io_channels);
+ assert(dev->ioch_array[ioch->index] == ioch);
+
+ dev->ioch_array[ioch->index] = NULL;
+ TAILQ_REMOVE(&dev->ioch_queue, ioch, tailq);
+
+ num_io_channels = __atomic_fetch_sub(&dev->num_io_channels, 1, __ATOMIC_SEQ_CST);
+ assert(num_io_channels > 0);
+
+ for (i = 0; i < ioch->num_entries; ++i) {
+ pthread_spin_destroy(&ioch->wbuf_entries[i].lock);
+ }
+
+ spdk_mempool_free(ioch->io_pool);
+ spdk_ring_free(ioch->free_queue);
+ spdk_ring_free(ioch->submit_queue);
+ spdk_free(ioch->wbuf_payload);
+ free(ioch->wbuf_entries);
+ free(ioch);
+}
+
+static void
+_ftl_io_channel_destroy_cb(void *ctx)
+{
+ struct ftl_io_channel *ioch = ctx;
+ struct spdk_ftl_dev *dev = ioch->dev;
+ uint32_t i;
+
+ /* Do not destroy the channel if some of its entries are still in use */
+ if (spdk_ring_count(ioch->free_queue) != ioch->num_entries) {
+ spdk_thread_send_msg(spdk_get_thread(), _ftl_io_channel_destroy_cb, ctx);
+ return;
+ }
+
+ /* Evict all valid entries from cache */
+ for (i = 0; i < ioch->num_entries; ++i) {
+ ftl_evict_cache_entry(dev, &ioch->wbuf_entries[i]);
+ }
+
+ spdk_poller_unregister(&ioch->poller);
+
+ spdk_put_io_channel(ioch->base_ioch);
+ if (ioch->cache_ioch) {
+ spdk_put_io_channel(ioch->cache_ioch);
+ }
+
+ ioch->base_ioch = NULL;
+ ioch->cache_ioch = NULL;
+
+ spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_io_channel_unregister, ioch);
+}
+
+static void
+ftl_io_channel_destroy_cb(void *io_device, void *ctx)
+{
+ struct _ftl_io_channel *_ioch = ctx;
+ struct ftl_io_channel *ioch = _ioch->ioch;
+
+ /* Mark the IO channel as being flushed to force out any unwritten entries */
+ ioch->flush = true;
+
+ _ftl_io_channel_destroy_cb(ioch);
+}
+
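+/*
+ * Register the device as an io_device and preallocate the FTL_BATCH_COUNT
+ * transfer batches, each with xfer_size iovecs and, when the base bdev exposes
+ * per-block metadata, its own slice of the shared metadata buffer.
+ */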
+static int
+ftl_dev_init_io_channel(struct spdk_ftl_dev *dev)
+{
+ struct ftl_batch *batch;
+ uint32_t i;
+
+ /* Round the maximum number of IO channels up to the nearest power of 2 to allow for easy addr bit shifts */
+ dev->conf.max_io_channels = spdk_align32pow2(dev->conf.max_io_channels);
+ dev->ioch_shift = spdk_u32log2(dev->conf.max_io_channels);
+
+ dev->ioch_array = calloc(dev->conf.max_io_channels, sizeof(*dev->ioch_array));
+ if (!dev->ioch_array) {
+ SPDK_ERRLOG("Failed to allocate IO channel array\n");
+ return -1;
+ }
+
+ if (dev->md_size > 0) {
+ dev->md_buf = spdk_zmalloc(dev->md_size * dev->xfer_size * FTL_BATCH_COUNT,
+ dev->md_size, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (dev->md_buf == NULL) {
+ SPDK_ERRLOG("Failed to allocate metadata buffer\n");
+ return -1;
+ }
+ }
+
+ dev->iov_buf = calloc(FTL_BATCH_COUNT, dev->xfer_size * sizeof(struct iovec));
+ if (!dev->iov_buf) {
+ SPDK_ERRLOG("Failed to allocate iovec buffer\n");
+ return -1;
+ }
+
+ TAILQ_INIT(&dev->free_batches);
+ TAILQ_INIT(&dev->pending_batches);
+ TAILQ_INIT(&dev->ioch_queue);
+
+ for (i = 0; i < FTL_BATCH_COUNT; ++i) {
+ batch = &dev->batch_array[i];
+ batch->iov = &dev->iov_buf[i * dev->xfer_size];
+ batch->num_entries = 0;
+ batch->index = i;
+ TAILQ_INIT(&batch->entries);
+ if (dev->md_buf != NULL) {
+ batch->metadata = (char *)dev->md_buf + i * dev->xfer_size * dev->md_size;
+ }
+
+ TAILQ_INSERT_TAIL(&dev->free_batches, batch, tailq);
+ }
+
+ dev->num_io_channels = 0;
+
+ spdk_io_device_register(dev, ftl_io_channel_create_cb, ftl_io_channel_destroy_cb,
+ sizeof(struct _ftl_io_channel),
+ NULL);
+
+ return 0;
+}
+
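+/*
+ * Open and claim the base zoned bdev and derive the device geometry from it:
+ * num_bands = num_blocks / (num_punits * num_blocks_in_zone), with the address
+ * width picked so that every block of the bdev is addressable.
+ */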
+static int
+ftl_dev_init_base_bdev(struct spdk_ftl_dev *dev, const char *bdev_name)
+{
+ uint32_t block_size;
+ uint64_t num_blocks;
+ struct spdk_bdev *bdev;
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (!bdev) {
+ SPDK_ERRLOG("Unable to find bdev: %s\n", bdev_name);
+ return -1;
+ }
+
+ if (!spdk_bdev_is_zoned(bdev)) {
+ SPDK_ERRLOG("Bdev doesn't support zone capabilities: %s\n",
+ spdk_bdev_get_name(bdev));
+ return -1;
+ }
+
+ if (spdk_bdev_open_ext(bdev_name, true, ftl_bdev_event_cb,
+ dev, &dev->base_bdev_desc)) {
+ SPDK_ERRLOG("Unable to open bdev: %s\n", bdev_name);
+ return -1;
+ }
+
+ if (spdk_bdev_module_claim_bdev(bdev, dev->base_bdev_desc, &g_ftl_bdev_module)) {
+ spdk_bdev_close(dev->base_bdev_desc);
+ dev->base_bdev_desc = NULL;
+ SPDK_ERRLOG("Unable to claim bdev %s\n", bdev_name);
+ return -1;
+ }
+
+ dev->xfer_size = spdk_bdev_get_write_unit_size(bdev);
+ dev->md_size = spdk_bdev_get_md_size(bdev);
+
+ block_size = spdk_bdev_get_block_size(bdev);
+ if (block_size != FTL_BLOCK_SIZE) {
+ SPDK_ERRLOG("Unsupported block size (%"PRIu32")\n", block_size);
+ return -1;
+ }
+
+ num_blocks = spdk_bdev_get_num_blocks(bdev);
+ if (num_blocks % ftl_get_num_punits(dev)) {
+ SPDK_ERRLOG("Unsupported geometry. Base bdev block count must be a multiple "
+ "of the optimal number of zones.\n");
+ return -1;
+ }
+
+ if (ftl_is_append_supported(dev) &&
+ !spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZONE_APPEND)) {
+ SPDK_ERRLOG("Bdev doesn't support append: %s\n",
+ spdk_bdev_get_name(bdev));
+ return -1;
+ }
+
+ dev->num_bands = num_blocks / (ftl_get_num_punits(dev) * ftl_get_num_blocks_in_zone(dev));
+ dev->addr_len = spdk_u64log2(num_blocks) + 1;
+
+ return 0;
+}
+
+static void
+ftl_lba_map_request_dtor(struct spdk_mempool *mp, void *opaque, void *obj, unsigned obj_idx)
+{
+ struct ftl_lba_map_request *request = obj;
+
+ spdk_bit_array_free(&request->segments);
+}
+
+static void
+ftl_release_bdev(struct spdk_bdev_desc *bdev_desc)
+{
+ if (!bdev_desc) {
+ return;
+ }
+
+ spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_desc));
+ spdk_bdev_close(bdev_desc);
+}
+
+static void
+ftl_dev_free_sync(struct spdk_ftl_dev *dev)
+{
+ struct spdk_ftl_dev *iter;
+ size_t i;
+
+ if (!dev) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_ftl_queue_lock);
+ STAILQ_FOREACH(iter, &g_ftl_queue, stailq) {
+ if (iter == dev) {
+ STAILQ_REMOVE(&g_ftl_queue, dev, spdk_ftl_dev, stailq);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_ftl_queue_lock);
+
+ assert(LIST_EMPTY(&dev->wptr_list));
+ assert(dev->current_batch == NULL);
+
+ ftl_dev_dump_bands(dev);
+ ftl_dev_dump_stats(dev);
+
+ if (dev->bands) {
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ free(dev->bands[i].zone_buf);
+ spdk_bit_array_free(&dev->bands[i].lba_map.vld);
+ spdk_bit_array_free(&dev->bands[i].reloc_bitmap);
+ }
+ }
+
+ spdk_dma_free(dev->nv_cache.dma_buf);
+
+ spdk_mempool_free(dev->lba_pool);
+ spdk_mempool_free(dev->nv_cache.md_pool);
+ spdk_mempool_free(dev->media_events_pool);
+ if (dev->lba_request_pool) {
+ spdk_mempool_obj_iter(dev->lba_request_pool, ftl_lba_map_request_dtor, NULL);
+ }
+ spdk_mempool_free(dev->lba_request_pool);
+
+ ftl_reloc_free(dev->reloc);
+
+ ftl_release_bdev(dev->nv_cache.bdev_desc);
+ ftl_release_bdev(dev->base_bdev_desc);
+
+ spdk_free(dev->md_buf);
+
+ assert(dev->num_io_channels == 0);
+ free(dev->ioch_array);
+ free(dev->iov_buf);
+ free(dev->name);
+ free(dev->bands);
+ if (dev->l2p_pmem_len != 0) {
+#ifdef SPDK_CONFIG_PMDK
+ pmem_unmap(dev->l2p, dev->l2p_pmem_len);
+#endif /* SPDK_CONFIG_PMDK */
+ } else {
+ free(dev->l2p);
+ }
+ free((char *)dev->conf.l2p_path);
+ free(dev);
+}
+
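+/*
+ * Top-level device initialization. The synchronous part below sets up the
+ * pools, bands, persistent cache and IO channels; zone info retrieval and the
+ * create/restore paths continue asynchronously and report their result through
+ * cb_fn. A minimal caller (bdev and callback names are illustrative only)
+ * might look like:
+ *
+ *	struct spdk_ftl_dev_init_opts opts = {
+ *		.base_bdev = "nvme0n1",
+ *		.name = "ftl0",
+ *		.mode = SPDK_FTL_MODE_CREATE,
+ *		.core_thread = spdk_get_thread(),
+ *	};
+ *
+ *	spdk_ftl_dev_init(&opts, init_done_cb, init_ctx);
+ */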
+int
+spdk_ftl_dev_init(const struct spdk_ftl_dev_init_opts *_opts, spdk_ftl_init_fn cb_fn, void *cb_arg)
+{
+ struct spdk_ftl_dev *dev;
+ struct spdk_ftl_dev_init_opts opts = *_opts;
+ struct ftl_dev_init_ctx *init_ctx = NULL;
+ int rc = -ENOMEM;
+
+ dev = calloc(1, sizeof(*dev));
+ if (!dev) {
+ return -ENOMEM;
+ }
+
+ init_ctx = calloc(1, sizeof(*init_ctx));
+ if (!init_ctx) {
+ goto fail_sync;
+ }
+
+ init_ctx->dev = dev;
+ init_ctx->opts = *_opts;
+ init_ctx->cb_fn = cb_fn;
+ init_ctx->cb_arg = cb_arg;
+ init_ctx->thread = spdk_get_thread();
+
+ if (!opts.conf) {
+ opts.conf = &g_default_conf;
+ }
+
+ if (!opts.base_bdev) {
+ SPDK_ERRLOG("No underlying device specified in configuration\n");
+ rc = -EINVAL;
+ goto fail_sync;
+ }
+
+ dev->conf = *opts.conf;
+ dev->limit = SPDK_FTL_LIMIT_MAX;
+
+ dev->name = strdup(opts.name);
+ if (!dev->name) {
+ SPDK_ERRLOG("Unable to set device name\n");
+ goto fail_sync;
+ }
+
+ if (ftl_dev_init_base_bdev(dev, opts.base_bdev)) {
+ SPDK_ERRLOG("Unsupported underlying device\n");
+ goto fail_sync;
+ }
+
+ if (opts.conf->l2p_path) {
+ dev->conf.l2p_path = strdup(opts.conf->l2p_path);
+ if (!dev->conf.l2p_path) {
+ rc = -ENOMEM;
+ goto fail_sync;
+ }
+ }
+
+ /* In case of errors, we free all of the memory in ftl_dev_free_sync(), */
+ /* so we don't have to clean up in each of the init functions. */
+ if (ftl_check_conf(dev, opts.conf)) {
+ SPDK_ERRLOG("Invalid device configuration\n");
+ goto fail_sync;
+ }
+
+ if (ftl_init_lba_map_pools(dev)) {
+ SPDK_ERRLOG("Unable to init LBA map pools\n");
+ goto fail_sync;
+ }
+
+ if (ftl_init_media_events_pool(dev)) {
+ SPDK_ERRLOG("Unable to init media events pool\n");
+ goto fail_sync;
+ }
+
+ ftl_init_wptr_list(dev);
+
+ if (ftl_dev_init_bands(dev)) {
+ SPDK_ERRLOG("Unable to initialize band array\n");
+ goto fail_sync;
+ }
+
+ if (ftl_dev_init_nv_cache(dev, opts.cache_bdev)) {
+ SPDK_ERRLOG("Unable to initialize persistent cache\n");
+ goto fail_sync;
+ }
+
+ dev->reloc = ftl_reloc_init(dev);
+ if (!dev->reloc) {
+ SPDK_ERRLOG("Unable to initialize reloc structures\n");
+ goto fail_sync;
+ }
+
+ if (ftl_dev_init_io_channel(dev)) {
+ SPDK_ERRLOG("Unable to initialize IO channels\n");
+ goto fail_sync;
+ }
+
+ if (ftl_dev_init_zones(init_ctx)) {
+ SPDK_ERRLOG("Failed to initialize zones\n");
+ goto fail_async;
+ }
+
+ return 0;
+fail_sync:
+ ftl_dev_free_sync(dev);
+ ftl_dev_free_init_ctx(init_ctx);
+ return rc;
+fail_async:
+ ftl_init_fail(init_ctx);
+ return 0;
+}
+
+static void
+_ftl_halt_defrag(void *arg)
+{
+ ftl_reloc_halt(((struct spdk_ftl_dev *)arg)->reloc);
+}
+
+static void
+ftl_halt_complete_cb(void *ctx)
+{
+ struct ftl_dev_init_ctx *fini_ctx = ctx;
+ struct spdk_ftl_dev *dev = fini_ctx->dev;
+
+ /* Make sure core IO channel has already been released */
+ if (dev->num_io_channels > 0) {
+ spdk_thread_send_msg(spdk_get_thread(), ftl_halt_complete_cb, ctx);
+ return;
+ }
+
+ spdk_io_device_unregister(fini_ctx->dev, NULL);
+
+ ftl_dev_free_sync(fini_ctx->dev);
+ if (fini_ctx->cb_fn != NULL) {
+ fini_ctx->cb_fn(NULL, fini_ctx->cb_arg, fini_ctx->halt_complete_status);
+ }
+
+ ftl_dev_free_init_ctx(fini_ctx);
+}
+
+static void
+ftl_put_io_channel_cb(void *ctx)
+{
+ struct ftl_dev_init_ctx *fini_ctx = ctx;
+ struct spdk_ftl_dev *dev = fini_ctx->dev;
+
+ spdk_put_io_channel(dev->ioch);
+ spdk_thread_send_msg(spdk_get_thread(), ftl_halt_complete_cb, ctx);
+}
+
+static void
+ftl_nv_cache_header_fini_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_dev_init_ctx *fini_ctx = cb_arg;
+ int rc = 0;
+
+ spdk_bdev_free_io(bdev_io);
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Failed to write non-volatile cache metadata header\n");
+ rc = -EIO;
+ }
+
+ fini_ctx->halt_complete_status = rc;
+ spdk_thread_send_msg(fini_ctx->thread, ftl_put_io_channel_cb, fini_ctx);
+}
+
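+/*
+ * Runs on the core thread until the core poller has been unregistered; once it
+ * is gone, persist the non-volatile cache metadata header (if a cache is
+ * attached) and hand the rest of the shutdown back to the caller's thread.
+ */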
+static int
+ftl_halt_poller(void *ctx)
+{
+ struct ftl_dev_init_ctx *fini_ctx = ctx;
+ struct spdk_ftl_dev *dev = fini_ctx->dev;
+
+ if (!dev->core_poller) {
+ spdk_poller_unregister(&fini_ctx->poller);
+
+ if (ftl_dev_has_nv_cache(dev)) {
+ ftl_nv_cache_write_header(&dev->nv_cache, true,
+ ftl_nv_cache_header_fini_cb, fini_ctx);
+ } else {
+ fini_ctx->halt_complete_status = 0;
+ spdk_thread_send_msg(fini_ctx->thread, ftl_put_io_channel_cb, fini_ctx);
+ }
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+ftl_add_halt_poller(void *ctx)
+{
+ struct ftl_dev_init_ctx *fini_ctx = ctx;
+ struct spdk_ftl_dev *dev = fini_ctx->dev;
+
+ dev->halt = 1;
+
+ _ftl_halt_defrag(dev);
+
+ assert(!fini_ctx->poller);
+ fini_ctx->poller = SPDK_POLLER_REGISTER(ftl_halt_poller, fini_ctx, 100);
+}
+
+static int
+ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg,
+ struct spdk_thread *thread)
+{
+ struct ftl_dev_init_ctx *fini_ctx;
+
+ if (dev->halt_started) {
+ return -EBUSY;
+ }
+
+ dev->halt_started = true;
+
+ fini_ctx = calloc(1, sizeof(*fini_ctx));
+ if (!fini_ctx) {
+ return -ENOMEM;
+ }
+
+ fini_ctx->dev = dev;
+ fini_ctx->cb_fn = cb_fn;
+ fini_ctx->cb_arg = cb_arg;
+ fini_ctx->thread = thread;
+
+ spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_add_halt_poller, fini_ctx);
+ return 0;
+}
+
+int
+spdk_ftl_dev_free(struct spdk_ftl_dev *dev, spdk_ftl_init_fn cb_fn, void *cb_arg)
+{
+ return ftl_dev_free(dev, cb_fn, cb_arg, spdk_get_thread());
+}
+
+SPDK_LOG_REGISTER_COMPONENT("ftl_init", SPDK_LOG_FTL_INIT)
diff --git a/src/spdk/lib/ftl/ftl_io.c b/src/spdk/lib/ftl/ftl_io.c
new file mode 100644
index 000000000..39a845bae
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_io.c
@@ -0,0 +1,563 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/ftl.h"
+#include "spdk/likely.h"
+#include "spdk/util.h"
+
+#include "ftl_io.h"
+#include "ftl_core.h"
+#include "ftl_band.h"
+#include "ftl_debug.h"
+
+void
+ftl_io_inc_req(struct ftl_io *io)
+{
+ struct ftl_band *band = io->band;
+
+ if (!(io->flags & FTL_IO_CACHE) && io->type != FTL_IO_READ && io->type != FTL_IO_ERASE) {
+ ftl_band_acquire_lba_map(band);
+ }
+
+ __atomic_fetch_add(&io->dev->num_inflight, 1, __ATOMIC_SEQ_CST);
+
+ ++io->req_cnt;
+}
+
+void
+ftl_io_dec_req(struct ftl_io *io)
+{
+ struct ftl_band *band = io->band;
+ unsigned long num_inflight __attribute__((unused));
+
+ if (!(io->flags & FTL_IO_CACHE) && io->type != FTL_IO_READ && io->type != FTL_IO_ERASE) {
+ ftl_band_release_lba_map(band);
+ }
+
+ num_inflight = __atomic_fetch_sub(&io->dev->num_inflight, 1, __ATOMIC_SEQ_CST);
+
+ assert(num_inflight > 0);
+ assert(io->req_cnt > 0);
+
+ --io->req_cnt;
+}
+
+struct iovec *
+ftl_io_iovec(struct ftl_io *io)
+{
+ return &io->iov[0];
+}
+
+uint64_t
+ftl_io_get_lba(const struct ftl_io *io, size_t offset)
+{
+ assert(offset < io->num_blocks);
+
+ if (io->flags & FTL_IO_VECTOR_LBA) {
+ return io->lba.vector[offset];
+ } else {
+ return io->lba.single + offset;
+ }
+}
+
+uint64_t
+ftl_io_current_lba(const struct ftl_io *io)
+{
+ return ftl_io_get_lba(io, io->pos);
+}
+
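+/*
+ * Advance the IO by num_blocks processed blocks: bump the block position, move
+ * the iovec cursor (iov_pos/iov_off) across the vector and propagate the same
+ * progress to the parent IO, if any.
+ */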
+void
+ftl_io_advance(struct ftl_io *io, size_t num_blocks)
+{
+ struct iovec *iov = ftl_io_iovec(io);
+ size_t iov_blocks, block_left = num_blocks;
+
+ io->pos += num_blocks;
+
+ if (io->iov_cnt != 0) {
+ while (block_left > 0) {
+ assert(io->iov_pos < io->iov_cnt);
+ iov_blocks = iov[io->iov_pos].iov_len / FTL_BLOCK_SIZE;
+
+ if (io->iov_off + block_left < iov_blocks) {
+ io->iov_off += block_left;
+ break;
+ }
+
+ assert(iov_blocks > io->iov_off);
+ block_left -= (iov_blocks - io->iov_off);
+ io->iov_off = 0;
+ io->iov_pos++;
+ }
+ }
+
+ if (io->parent) {
+ ftl_io_advance(io->parent, num_blocks);
+ }
+}
+
+size_t
+ftl_iovec_num_blocks(struct iovec *iov, size_t iov_cnt)
+{
+ size_t num_blocks = 0, i = 0;
+
+ for (; i < iov_cnt; ++i) {
+ num_blocks += iov[i].iov_len / FTL_BLOCK_SIZE;
+ }
+
+ return num_blocks;
+}
+
+void *
+ftl_io_iovec_addr(struct ftl_io *io)
+{
+ assert(io->iov_pos < io->iov_cnt);
+ assert(io->iov_off * FTL_BLOCK_SIZE < ftl_io_iovec(io)[io->iov_pos].iov_len);
+
+ return (char *)ftl_io_iovec(io)[io->iov_pos].iov_base +
+ io->iov_off * FTL_BLOCK_SIZE;
+}
+
+size_t
+ftl_io_iovec_len_left(struct ftl_io *io)
+{
+ struct iovec *iov = ftl_io_iovec(io);
+ return iov[io->iov_pos].iov_len / FTL_BLOCK_SIZE - io->iov_off;
+}
+
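+/*
+ * Build the IO's internal iovec array from the caller's vector, skipping the
+ * first iov_off blocks of the initial entry and stopping once num_blocks
+ * blocks have been covered.
+ */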
+static void
+ftl_io_init_iovec(struct ftl_io *io, const struct iovec *iov, size_t iov_cnt, size_t iov_off,
+ size_t num_blocks)
+{
+ size_t offset = 0, num_left;
+
+ io->iov_pos = 0;
+ io->iov_cnt = 0;
+ io->num_blocks = num_blocks;
+
+ while (offset < num_blocks) {
+ assert(io->iov_cnt < FTL_IO_MAX_IOVEC && io->iov_cnt < iov_cnt);
+
+ num_left = spdk_min(iov[io->iov_cnt].iov_len / FTL_BLOCK_SIZE - iov_off,
+ num_blocks);
+ io->iov[io->iov_cnt].iov_base = (char *)iov[io->iov_cnt].iov_base +
+ iov_off * FTL_BLOCK_SIZE;
+ io->iov[io->iov_cnt].iov_len = num_left * FTL_BLOCK_SIZE;
+
+ offset += num_left;
+ io->iov_cnt++;
+ iov_off = 0;
+ }
+}
+
+void
+ftl_io_shrink_iovec(struct ftl_io *io, size_t num_blocks)
+{
+ size_t iov_off = 0, block_off = 0;
+
+ assert(io->num_blocks >= num_blocks);
+ assert(io->pos == 0 && io->iov_pos == 0 && io->iov_off == 0);
+
+ for (; iov_off < io->iov_cnt; ++iov_off) {
+ size_t num_iov = io->iov[iov_off].iov_len / FTL_BLOCK_SIZE;
+ size_t num_left = num_blocks - block_off;
+
+ if (num_iov >= num_left) {
+ io->iov[iov_off].iov_len = num_left * FTL_BLOCK_SIZE;
+ io->iov_cnt = iov_off + 1;
+ io->num_blocks = num_blocks;
+ break;
+ }
+
+ block_off += num_iov;
+ }
+}
+
+static void
+ftl_io_init(struct ftl_io *io, struct spdk_ftl_dev *dev,
+ ftl_io_fn fn, void *ctx, int flags, int type)
+{
+ io->flags |= flags | FTL_IO_INITIALIZED;
+ io->type = type;
+ io->dev = dev;
+ io->lba.single = FTL_LBA_INVALID;
+ io->addr.offset = FTL_ADDR_INVALID;
+ io->cb_fn = fn;
+ io->cb_ctx = ctx;
+ io->trace = ftl_trace_alloc_id(dev);
+}
+
+struct ftl_io *
+ftl_io_init_internal(const struct ftl_io_init_opts *opts)
+{
+ struct ftl_io *io = opts->io;
+ struct ftl_io *parent = opts->parent;
+ struct spdk_ftl_dev *dev = opts->dev;
+ const struct iovec *iov;
+ size_t iov_cnt, iov_off;
+
+ if (!io) {
+ if (parent) {
+ io = ftl_io_alloc_child(parent);
+ } else {
+ io = ftl_io_alloc(ftl_get_io_channel(dev));
+ }
+
+ if (!io) {
+ return NULL;
+ }
+ }
+
+ ftl_io_clear(io);
+ ftl_io_init(io, dev, opts->cb_fn, opts->cb_ctx, opts->flags | FTL_IO_INTERNAL, opts->type);
+
+ io->batch = opts->batch;
+ io->band = opts->band;
+ io->md = opts->md;
+ io->iov = &io->iov_buf[0];
+
+ if (parent) {
+ if (parent->flags & FTL_IO_VECTOR_LBA) {
+ io->lba.vector = parent->lba.vector + parent->pos;
+ } else {
+ io->lba.single = parent->lba.single + parent->pos;
+ }
+
+ iov = &parent->iov[parent->iov_pos];
+ iov_cnt = parent->iov_cnt - parent->iov_pos;
+ iov_off = parent->iov_off;
+ } else {
+ iov = &opts->iovs[0];
+ iov_cnt = opts->iovcnt;
+ iov_off = 0;
+ }
+
+ /* Some requests (zone resets) do not use iovecs */
+ if (iov_cnt > 0) {
+ ftl_io_init_iovec(io, iov, iov_cnt, iov_off, opts->num_blocks);
+ }
+
+ if (opts->flags & FTL_IO_VECTOR_LBA) {
+ io->lba.vector = calloc(io->num_blocks, sizeof(uint64_t));
+ if (!io->lba.vector) {
+ ftl_io_free(io);
+ return NULL;
+ }
+ }
+
+ return io;
+}
+
+struct ftl_io *
+ftl_io_wbuf_init(struct spdk_ftl_dev *dev, struct ftl_addr addr, struct ftl_band *band,
+ struct ftl_batch *batch, ftl_io_fn cb)
+{
+ struct ftl_io *io;
+ struct ftl_io_init_opts opts = {
+ .dev = dev,
+ .io = NULL,
+ .batch = batch,
+ .band = band,
+ .size = sizeof(struct ftl_io),
+ .flags = 0,
+ .type = FTL_IO_WRITE,
+ .num_blocks = dev->xfer_size,
+ .cb_fn = cb,
+ .iovcnt = dev->xfer_size,
+ .md = batch->metadata,
+ };
+
+ memcpy(opts.iovs, batch->iov, sizeof(struct iovec) * dev->xfer_size);
+
+ io = ftl_io_init_internal(&opts);
+ if (!io) {
+ return NULL;
+ }
+
+ io->addr = addr;
+
+ return io;
+}
+
+struct ftl_io *
+ftl_io_erase_init(struct ftl_band *band, size_t num_blocks, ftl_io_fn cb)
+{
+ struct ftl_io *io;
+ struct ftl_io_init_opts opts = {
+ .dev = band->dev,
+ .io = NULL,
+ .band = band,
+ .size = sizeof(struct ftl_io),
+ .flags = FTL_IO_PHYSICAL_MODE,
+ .type = FTL_IO_ERASE,
+ .num_blocks = 1,
+ .cb_fn = cb,
+ .iovcnt = 0,
+ .md = NULL,
+ };
+
+ io = ftl_io_init_internal(&opts);
+ if (!io) {
+ return NULL;
+ }
+
+ io->num_blocks = num_blocks;
+
+ return io;
+}
+
+static void
+_ftl_user_cb(struct ftl_io *io, void *arg, int status)
+{
+ io->user_fn(arg, status);
+}
+
+struct ftl_io *
+ftl_io_user_init(struct spdk_io_channel *_ioch, uint64_t lba, size_t num_blocks, struct iovec *iov,
+ size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_ctx, int type)
+{
+ struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(_ioch);
+ struct spdk_ftl_dev *dev = ioch->dev;
+ struct ftl_io *io;
+
+ io = ftl_io_alloc(_ioch);
+ if (spdk_unlikely(!io)) {
+ return NULL;
+ }
+
+ ftl_io_init(io, dev, _ftl_user_cb, cb_ctx, 0, type);
+ io->lba.single = lba;
+ io->user_fn = cb_fn;
+ io->iov = iov;
+ io->iov_cnt = iov_cnt;
+ io->num_blocks = num_blocks;
+
+ ftl_trace_lba_io_init(io->dev, io);
+ return io;
+}
+
+static void
+_ftl_io_free(struct ftl_io *io)
+{
+ struct ftl_io_channel *ioch;
+
+ assert(LIST_EMPTY(&io->children));
+
+ if (io->flags & FTL_IO_VECTOR_LBA) {
+ free(io->lba.vector);
+ }
+
+ if (pthread_spin_destroy(&io->lock)) {
+ SPDK_ERRLOG("pthread_spin_destroy failed\n");
+ }
+
+ ioch = ftl_io_channel_get_ctx(io->ioch);
+ spdk_mempool_put(ioch->io_pool, io);
+}
+
+static bool
+ftl_io_remove_child(struct ftl_io *io)
+{
+ struct ftl_io *parent = io->parent;
+ bool parent_done;
+
+ pthread_spin_lock(&parent->lock);
+ LIST_REMOVE(io, child_entry);
+ parent_done = parent->done && LIST_EMPTY(&parent->children);
+ parent->status = parent->status ? : io->status;
+ pthread_spin_unlock(&parent->lock);
+
+ return parent_done;
+}
+
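+/*
+ * Complete an IO: once it has no outstanding children, invoke its callback,
+ * detach it from the parent (completing the parent if this was its last
+ * pending child) and return the descriptor to the IO pool.
+ */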
+void
+ftl_io_complete(struct ftl_io *io)
+{
+ struct ftl_io *parent = io->parent;
+ bool complete;
+
+ io->flags &= ~FTL_IO_INITIALIZED;
+
+ pthread_spin_lock(&io->lock);
+ complete = LIST_EMPTY(&io->children);
+ io->done = true;
+ pthread_spin_unlock(&io->lock);
+
+ if (complete) {
+ if (io->cb_fn) {
+ io->cb_fn(io, io->cb_ctx, io->status);
+ }
+
+ if (parent && ftl_io_remove_child(io)) {
+ ftl_io_complete(parent);
+ }
+
+ _ftl_io_free(io);
+ }
+}
+
+struct ftl_io *
+ftl_io_alloc_child(struct ftl_io *parent)
+{
+ struct ftl_io *io;
+
+ io = ftl_io_alloc(parent->ioch);
+ if (spdk_unlikely(!io)) {
+ return NULL;
+ }
+
+ ftl_io_init(io, parent->dev, NULL, NULL, parent->flags, parent->type);
+ io->parent = parent;
+
+ pthread_spin_lock(&parent->lock);
+ LIST_INSERT_HEAD(&parent->children, io, child_entry);
+ pthread_spin_unlock(&parent->lock);
+
+ return io;
+}
+
+void ftl_io_fail(struct ftl_io *io, int status)
+{
+ io->status = status;
+ ftl_io_advance(io, io->num_blocks - io->pos);
+}
+
+void *
+ftl_io_get_md(const struct ftl_io *io)
+{
+ if (!io->md) {
+ return NULL;
+ }
+
+ return (char *)io->md + io->pos * io->dev->md_size;
+}
+
+struct ftl_io *
+ftl_io_alloc(struct spdk_io_channel *ch)
+{
+ struct ftl_io *io;
+ struct ftl_io_channel *ioch = ftl_io_channel_get_ctx(ch);
+
+ io = spdk_mempool_get(ioch->io_pool);
+ if (!io) {
+ return NULL;
+ }
+
+ memset(io, 0, ioch->elem_size);
+ io->ioch = ch;
+
+ if (pthread_spin_init(&io->lock, PTHREAD_PROCESS_PRIVATE)) {
+ SPDK_ERRLOG("pthread_spin_init failed\n");
+ spdk_mempool_put(ioch->io_pool, io);
+ return NULL;
+ }
+
+ return io;
+}
+
+void
+ftl_io_reinit(struct ftl_io *io, ftl_io_fn cb, void *ctx, int flags, int type)
+{
+ ftl_io_clear(io);
+ ftl_io_init(io, io->dev, cb, ctx, flags, type);
+}
+
+void
+ftl_io_clear(struct ftl_io *io)
+{
+ ftl_io_reset(io);
+
+ io->flags = 0;
+ io->batch = NULL;
+ io->band = NULL;
+}
+
+void
+ftl_io_reset(struct ftl_io *io)
+{
+ io->req_cnt = io->pos = io->iov_pos = io->iov_off = 0;
+ io->done = false;
+}
+
+void
+ftl_io_free(struct ftl_io *io)
+{
+ struct ftl_io *parent;
+
+ if (!io) {
+ return;
+ }
+
+ parent = io->parent;
+ if (parent && ftl_io_remove_child(io)) {
+ ftl_io_complete(parent);
+ }
+
+ _ftl_io_free(io);
+}
+
+void
+ftl_io_call_foreach_child(struct ftl_io *io, int (*callback)(struct ftl_io *))
+{
+ struct ftl_io *child, *tmp;
+
+ assert(!io->done);
+
+ /*
+ * If the IO doesn't have any children, it means that it directly describes a request (i.e.
+ * all of the buffers, LBAs, etc. are filled). Otherwise the IO only groups together several
+ * requests and may be partially filled, so the callback needs to be called on all of its
+ * children instead.
+ */
+ if (LIST_EMPTY(&io->children)) {
+ callback(io);
+ return;
+ }
+
+ LIST_FOREACH_SAFE(child, &io->children, child_entry, tmp) {
+ int rc = callback(child);
+ if (rc) {
+ assert(rc != -EAGAIN);
+ ftl_io_fail(io, rc);
+ break;
+ }
+ }
+
+ /*
+ * If all the callbacks were processed or an error occurred, treat this IO as completed.
+ * Multiple calls to ftl_io_call_foreach_child are not supported, resubmissions are supposed
+ * to be handled in the callback.
+ */
+ ftl_io_complete(io);
+}
diff --git a/src/spdk/lib/ftl/ftl_io.h b/src/spdk/lib/ftl/ftl_io.h
new file mode 100644
index 000000000..d49dc3de7
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_io.h
@@ -0,0 +1,351 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FTL_IO_H
+#define FTL_IO_H
+
+#include "spdk/stdinc.h"
+#include "spdk/nvme.h"
+#include "spdk/ftl.h"
+
+#include "ftl_addr.h"
+#include "ftl_trace.h"
+
+struct spdk_ftl_dev;
+struct ftl_band;
+struct ftl_batch;
+struct ftl_io;
+
+typedef int (*ftl_md_pack_fn)(struct ftl_band *);
+typedef void (*ftl_io_fn)(struct ftl_io *, void *, int);
+
+/* IO flags */
+enum ftl_io_flags {
+ /* Indicates whether IO is already initialized */
+ FTL_IO_INITIALIZED = (1 << 0),
+ /* Internal IO (defrag, metadata, etc.) */
+ FTL_IO_INTERNAL = (1 << 1),
+ /* Indicates that the IO should not go through if there's */
+ /* already another one scheduled to the same LBA */
+ FTL_IO_WEAK = (1 << 2),
+ /* Indicates that the IO is used for padding */
+ FTL_IO_PAD = (1 << 3),
+ /* The IO operates on metadata */
+ FTL_IO_MD = (1 << 4),
+ /* Using physical instead of logical address */
+ FTL_IO_PHYSICAL_MODE = (1 << 5),
+ /* Indicates that IO contains noncontiguous LBAs */
+ FTL_IO_VECTOR_LBA = (1 << 6),
+ /* The IO is directed to non-volatile cache */
+ FTL_IO_CACHE = (1 << 7),
+ /* Indicates that physical address should be taken from IO struct, */
+ /* not assigned by wptr, only works if wptr is also in direct mode */
+ FTL_IO_DIRECT_ACCESS = (1 << 8),
+ /* Bypass the non-volatile cache */
+ FTL_IO_BYPASS_CACHE = (1 << 9),
+};
+
+enum ftl_io_type {
+ FTL_IO_READ,
+ FTL_IO_WRITE,
+ FTL_IO_ERASE,
+};
+
+#define FTL_IO_MAX_IOVEC 64
+
+struct ftl_io_init_opts {
+ struct spdk_ftl_dev *dev;
+
+ /* IO descriptor */
+ struct ftl_io *io;
+
+ /* Parent request */
+ struct ftl_io *parent;
+
+ /* Size of IO descriptor */
+ size_t size;
+
+ /* IO flags */
+ int flags;
+
+ /* IO type */
+ enum ftl_io_type type;
+
+ /* Transfer batch, set for IO going through the write buffer */
+ struct ftl_batch *batch;
+
+ /* Band to which the IO is directed */
+ struct ftl_band *band;
+
+ /* Number of logical blocks */
+ size_t num_blocks;
+
+ /* Data */
+ struct iovec iovs[FTL_IO_MAX_IOVEC];
+ int iovcnt;
+
+ /* Metadata */
+ void *md;
+
+ /* Callback's function */
+ ftl_io_fn cb_fn;
+
+ /* Callback's context */
+ void *cb_ctx;
+};
+
+struct ftl_io_channel;
+
+struct ftl_wbuf_entry {
+ /* IO channel that owns the write buffer entry */
+ struct ftl_io_channel *ioch;
+ /* Data payload (single block) */
+ void *payload;
+ /* Index within the IO channel's wbuf_entries array */
+ uint32_t index;
+ uint32_t io_flags;
+ /* Points at the band the data is copied from. Only valid for internal
+ * requests coming from reloc.
+ */
+ struct ftl_band *band;
+ /* Physical address of that particular block. Valid once the data has
+ * been written out.
+ */
+ struct ftl_addr addr;
+ /* Logical block address */
+ uint64_t lba;
+
+ /* Trace ID of the requests the entry is part of */
+ uint64_t trace;
+
+ /* Indicates that the entry was written out and is still present in the
+ * L2P table.
+ */
+ bool valid;
+ /* Lock that protects the entry from being evicted from the L2P */
+ pthread_spinlock_t lock;
+ TAILQ_ENTRY(ftl_wbuf_entry) tailq;
+};
+
+#define FTL_IO_CHANNEL_INDEX_INVALID ((uint64_t)-1)
+
+struct ftl_io_channel {
+ /* Device */
+ struct spdk_ftl_dev *dev;
+ /* IO pool element size */
+ size_t elem_size;
+ /* Index within the IO channel array */
+ uint64_t index;
+ /* IO pool */
+ struct spdk_mempool *io_pool;
+ /* Underlying device IO channel */
+ struct spdk_io_channel *base_ioch;
+ /* Persistent cache IO channel */
+ struct spdk_io_channel *cache_ioch;
+ /* Poller used for completing write requests and retrying IO */
+ struct spdk_poller *poller;
+ /* Write completion queue */
+ TAILQ_HEAD(, ftl_io) write_cmpl_queue;
+ TAILQ_HEAD(, ftl_io) retry_queue;
+ TAILQ_ENTRY(ftl_io_channel) tailq;
+
+ /* Array of write buffer entries */
+ struct ftl_wbuf_entry *wbuf_entries;
+ /* Write buffer data payload */
+ void *wbuf_payload;
+ /* Number of write buffer entries */
+ uint32_t num_entries;
+ /* Write buffer queues */
+ struct spdk_ring *free_queue;
+ struct spdk_ring *submit_queue;
+ /* Maximum number of concurrent user writes */
+ uint32_t qdepth_limit;
+ /* Current number of concurrent user writes */
+ uint32_t qdepth_current;
+ /* Means that the IO channel is being flushed */
+ bool flush;
+};
+
+/* General IO descriptor */
+struct ftl_io {
+ /* Device */
+ struct spdk_ftl_dev *dev;
+
+ /* IO channel */
+ struct spdk_io_channel *ioch;
+
+ union {
+ /* LBA table */
+ uint64_t *vector;
+
+ /* First LBA */
+ uint64_t single;
+ } lba;
+
+ /* First block address */
+ struct ftl_addr addr;
+
+ /* Number of processed blocks */
+ size_t pos;
+
+ /* Number of blocks */
+ size_t num_blocks;
+
+ /* IO vector pointer */
+ struct iovec *iov;
+
+ /* IO vector buffer for internal requests */
+ struct iovec iov_buf[FTL_IO_MAX_IOVEC];
+
+ /* Metadata */
+ void *md;
+
+ /* Number of IO vectors */
+ size_t iov_cnt;
+
+ /* Position within the iovec */
+ size_t iov_pos;
+
+ /* Offset within the iovec (in blocks) */
+ size_t iov_off;
+
+ /* Transfer batch (valid only for writes going through the write buffer) */
+ struct ftl_batch *batch;
+
+ /* Band this IO is being written to */
+ struct ftl_band *band;
+
+ /* Request status */
+ int status;
+
+ /* Number of split requests */
+ size_t req_cnt;
+
+ /* Callback's function */
+ ftl_io_fn cb_fn;
+
+ /* Callback's context */
+ void *cb_ctx;
+
+ /* User callback function */
+ spdk_ftl_fn user_fn;
+
+ /* Flags */
+ int flags;
+
+ /* IO type */
+ enum ftl_io_type type;
+
+ /* Done flag */
+ bool done;
+
+ /* Parent request */
+ struct ftl_io *parent;
+ /* Child requests list */
+ LIST_HEAD(, ftl_io) children;
+ /* Child list link */
+ LIST_ENTRY(ftl_io) child_entry;
+ /* Children lock */
+ pthread_spinlock_t lock;
+
+ /* Trace group id */
+ uint64_t trace;
+
+ /* Used by retry and write completion queues */
+ TAILQ_ENTRY(ftl_io) ioch_entry;
+};
+
+/* Metadata IO */
+struct ftl_md_io {
+ /* Parent IO structure */
+ struct ftl_io io;
+
+ /* Serialization/deserialization callback */
+ ftl_md_pack_fn pack_fn;
+
+ /* Callback's function */
+ ftl_io_fn cb_fn;
+
+ /* Callback's context */
+ void *cb_ctx;
+};
+
+static inline bool
+ftl_io_mode_physical(const struct ftl_io *io)
+{
+ return io->flags & FTL_IO_PHYSICAL_MODE;
+}
+
+static inline bool
+ftl_io_mode_logical(const struct ftl_io *io)
+{
+ return !ftl_io_mode_physical(io);
+}
+
+static inline bool
+ftl_io_done(const struct ftl_io *io)
+{
+ return io->req_cnt == 0 && io->pos == io->num_blocks;
+}
+
+struct ftl_io *ftl_io_alloc(struct spdk_io_channel *ch);
+struct ftl_io *ftl_io_alloc_child(struct ftl_io *parent);
+void ftl_io_fail(struct ftl_io *io, int status);
+void ftl_io_free(struct ftl_io *io);
+struct ftl_io *ftl_io_init_internal(const struct ftl_io_init_opts *opts);
+void ftl_io_reinit(struct ftl_io *io, ftl_io_fn cb,
+ void *ctx, int flags, int type);
+void ftl_io_clear(struct ftl_io *io);
+void ftl_io_inc_req(struct ftl_io *io);
+void ftl_io_dec_req(struct ftl_io *io);
+struct iovec *ftl_io_iovec(struct ftl_io *io);
+uint64_t ftl_io_current_lba(const struct ftl_io *io);
+uint64_t ftl_io_get_lba(const struct ftl_io *io, size_t offset);
+void ftl_io_advance(struct ftl_io *io, size_t num_blocks);
+size_t ftl_iovec_num_blocks(struct iovec *iov, size_t iov_cnt);
+void *ftl_io_iovec_addr(struct ftl_io *io);
+size_t ftl_io_iovec_len_left(struct ftl_io *io);
+struct ftl_io *ftl_io_wbuf_init(struct spdk_ftl_dev *dev, struct ftl_addr addr,
+ struct ftl_band *band, struct ftl_batch *batch, ftl_io_fn cb);
+struct ftl_io *ftl_io_erase_init(struct ftl_band *band, size_t num_blocks, ftl_io_fn cb);
+struct ftl_io *ftl_io_user_init(struct spdk_io_channel *ioch, uint64_t lba, size_t num_blocks,
+ struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn,
+ void *cb_arg, int type);
+void *ftl_io_get_md(const struct ftl_io *io);
+void ftl_io_complete(struct ftl_io *io);
+void ftl_io_shrink_iovec(struct ftl_io *io, size_t num_blocks);
+void ftl_io_process_error(struct ftl_io *io, const struct spdk_nvme_cpl *status);
+void ftl_io_reset(struct ftl_io *io);
+void ftl_io_call_foreach_child(struct ftl_io *io, int (*callback)(struct ftl_io *));
+
+#endif /* FTL_IO_H */
diff --git a/src/spdk/lib/ftl/ftl_reloc.c b/src/spdk/lib/ftl/ftl_reloc.c
new file mode 100644
index 000000000..e59bf4d81
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_reloc.c
@@ -0,0 +1,860 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/likely.h"
+#include "spdk_internal/log.h"
+#include "spdk/ftl.h"
+
+#include "ftl_reloc.h"
+#include "ftl_core.h"
+#include "ftl_io.h"
+#include "ftl_band.h"
+#include "ftl_debug.h"
+
+/* Maximum active reloc moves */
+#define FTL_RELOC_MAX_MOVES 256
+
+struct ftl_reloc;
+struct ftl_band_reloc;
+
+enum ftl_reloc_move_state {
+ FTL_RELOC_STATE_READ_LBA_MAP,
+ FTL_RELOC_STATE_READ,
+ FTL_RELOC_STATE_WRITE,
+};
+
+enum ftl_band_reloc_state {
+ FTL_BAND_RELOC_STATE_INACTIVE,
+ FTL_BAND_RELOC_STATE_PENDING,
+ FTL_BAND_RELOC_STATE_ACTIVE,
+ FTL_BAND_RELOC_STATE_HIGH_PRIO
+};
+
+struct ftl_reloc_move {
+ struct ftl_band_reloc *breloc;
+
+ /* Start addr */
+ struct ftl_addr addr;
+
+ /* Number of logical blocks */
+ size_t num_blocks;
+
+ /* Data buffer */
+ void *data;
+
+ /* Move state (read lba_map, read, write) */
+ enum ftl_reloc_move_state state;
+
+ /* IO associated with move */
+ struct ftl_io *io;
+
+ STAILQ_ENTRY(ftl_reloc_move) entry;
+};
+
+struct ftl_band_reloc {
+ struct ftl_reloc *parent;
+
+ /* Band being relocated */
+ struct ftl_band *band;
+
+ /* Number of logical blocks to be relocated */
+ size_t num_blocks;
+
+ /* Bitmap of logical blocks to be relocated */
+ struct spdk_bit_array *reloc_map;
+
+ /* State of the band reloc */
+ enum ftl_band_reloc_state state;
+
+ /* The band is being defragged */
+ bool defrag;
+
+ /* Reloc map iterator */
+ struct {
+ /* Array of zone offsets */
+ size_t *zone_offset;
+
+ /* Current zone */
+ size_t zone_current;
+ } iter;
+
+ /* Number of outstanding moves */
+ size_t num_outstanding;
+
+ /* Pool of move objects */
+ struct ftl_reloc_move *moves;
+
+ /* Move queue */
+ STAILQ_HEAD(, ftl_reloc_move) move_queue;
+
+ TAILQ_ENTRY(ftl_band_reloc) entry;
+};
+
+struct ftl_reloc {
+ /* Device associated with relocate */
+ struct spdk_ftl_dev *dev;
+
+ /* Indicates relocate is about to halt */
+ bool halt;
+
+ /* Maximum number of IOs per band */
+ size_t max_qdepth;
+
+ /* Maximum number of active band relocates */
+ size_t max_active;
+
+ /* Maximum transfer size (in logical blocks) per single IO */
+ size_t xfer_size;
+ /* Number of bands being defragged */
+ size_t num_defrag_bands;
+
+ /* Array of band relocates */
+ struct ftl_band_reloc *brelocs;
+
+ /* Number of active/priority band relocates */
+ size_t num_active;
+
+ /* Priority band relocates queue */
+ TAILQ_HEAD(, ftl_band_reloc) prio_queue;
+
+ /* Active band relocates queue */
+ TAILQ_HEAD(, ftl_band_reloc) active_queue;
+
+ /* Pending band relocates queue */
+ TAILQ_HEAD(, ftl_band_reloc) pending_queue;
+};
+
+bool
+ftl_reloc_is_defrag_active(const struct ftl_reloc *reloc)
+{
+ return reloc->num_defrag_bands > 0;
+}
+
+static size_t
+ftl_reloc_iter_zone_offset(struct ftl_band_reloc *breloc)
+{
+ size_t zone = breloc->iter.zone_current;
+
+ return breloc->iter.zone_offset[zone];
+}
+
+static size_t
+ftl_reloc_iter_zone_done(struct ftl_band_reloc *breloc)
+{
+ size_t num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev);
+
+ return ftl_reloc_iter_zone_offset(breloc) == num_blocks;
+}
+
+static void
+ftl_reloc_clr_block(struct ftl_band_reloc *breloc, size_t block_off)
+{
+ if (!spdk_bit_array_get(breloc->reloc_map, block_off)) {
+ return;
+ }
+
+ spdk_bit_array_clear(breloc->reloc_map, block_off);
+ assert(breloc->num_blocks);
+ breloc->num_blocks--;
+}
+
+static void
+ftl_reloc_read_lba_map_cb(struct ftl_io *io, void *arg, int status)
+{
+ struct ftl_reloc_move *move = arg;
+ struct ftl_band_reloc *breloc = move->breloc;
+
+ breloc->num_outstanding--;
+ assert(status == 0);
+ move->state = FTL_RELOC_STATE_WRITE;
+ STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+}
+
+static int
+ftl_reloc_read_lba_map(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move)
+{
+ struct ftl_band *band = breloc->band;
+
+ breloc->num_outstanding++;
+ return ftl_band_read_lba_map(band, ftl_band_block_offset_from_addr(band, move->addr),
+ move->num_blocks, ftl_reloc_read_lba_map_cb, move);
+}
+
+static void
+ftl_reloc_prep(struct ftl_band_reloc *breloc)
+{
+ struct ftl_band *band = breloc->band;
+ struct ftl_reloc *reloc = breloc->parent;
+ struct ftl_reloc_move *move;
+ size_t i;
+
+ reloc->num_active++;
+
+ if (!band->high_prio) {
+ if (ftl_band_alloc_lba_map(band)) {
+ SPDK_ERRLOG("Failed to allocate lba map\n");
+ assert(false);
+ }
+ } else {
+ ftl_band_acquire_lba_map(band);
+ }
+
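+ /* Seed the move queue with max_qdepth moves, all starting in the READ state */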
+ for (i = 0; i < reloc->max_qdepth; ++i) {
+ move = &breloc->moves[i];
+ move->state = FTL_RELOC_STATE_READ;
+ STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+ }
+}
+
+static void
+ftl_reloc_free_move(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move)
+{
+ assert(move);
+ spdk_dma_free(move->data);
+ memset(move, 0, sizeof(*move));
+ move->state = FTL_RELOC_STATE_READ;
+}
+
+static void
+ftl_reloc_write_cb(struct ftl_io *io, void *arg, int status)
+{
+ struct ftl_reloc_move *move = arg;
+ struct ftl_addr addr = move->addr;
+ struct ftl_band_reloc *breloc = move->breloc;
+ size_t i;
+
+ breloc->num_outstanding--;
+
+ if (status) {
+ SPDK_ERRLOG("Reloc write failed with status: %d\n", status);
+ assert(false);
+ return;
+ }
+
+ for (i = 0; i < move->num_blocks; ++i) {
+ addr.offset = move->addr.offset + i;
+ size_t block_off = ftl_band_block_offset_from_addr(breloc->band, addr);
+ ftl_reloc_clr_block(breloc, block_off);
+ }
+
+ ftl_reloc_free_move(breloc, move);
+ STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+}
+
+static void
+ftl_reloc_read_cb(struct ftl_io *io, void *arg, int status)
+{
+ struct ftl_reloc_move *move = arg;
+ struct ftl_band_reloc *breloc = move->breloc;
+
+ breloc->num_outstanding--;
+
+ /* TODO: We should handle failures of the relocation read. We need to inform the */
+ /* user that this group of blocks is bad (update the l2p with the bad block address and */
+ /* put it in the lba_map/sector_lba). Maybe we could also retry the read with smaller granularity? */
+ if (status) {
+ SPDK_ERRLOG("Reloc read failed with status: %d\n", status);
+ assert(false);
+ return;
+ }
+
+ move->state = FTL_RELOC_STATE_READ_LBA_MAP;
+ move->io = NULL;
+ STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+}
+
+static void
+ftl_reloc_iter_reset(struct ftl_band_reloc *breloc)
+{
+ memset(breloc->iter.zone_offset, 0, ftl_get_num_punits(breloc->band->dev) *
+ sizeof(*breloc->iter.zone_offset));
+ breloc->iter.zone_current = 0;
+}
+
+static size_t
+ftl_reloc_iter_block_offset(struct ftl_band_reloc *breloc)
+{
+ size_t zone_offset = breloc->iter.zone_current * ftl_get_num_blocks_in_zone(breloc->parent->dev);
+
+ return breloc->iter.zone_offset[breloc->iter.zone_current] + zone_offset;
+}
+
+static void
+ftl_reloc_iter_next_zone(struct ftl_band_reloc *breloc)
+{
+ size_t num_zones = ftl_get_num_punits(breloc->band->dev);
+
+ breloc->iter.zone_current = (breloc->iter.zone_current + 1) % num_zones;
+}
+
+static int
+ftl_reloc_block_valid(struct ftl_band_reloc *breloc, size_t block_off)
+{
+ struct ftl_addr addr = ftl_band_addr_from_block_offset(breloc->band, block_off);
+
+ return ftl_addr_is_written(breloc->band, addr) &&
+ spdk_bit_array_get(breloc->reloc_map, block_off) &&
+ ftl_band_block_offset_valid(breloc->band, block_off);
+}
+
+static int
+ftl_reloc_iter_next(struct ftl_band_reloc *breloc, size_t *block_off)
+{
+ size_t zone = breloc->iter.zone_current;
+
+ *block_off = ftl_reloc_iter_block_offset(breloc);
+
+ if (ftl_reloc_iter_zone_done(breloc)) {
+ return 0;
+ }
+
+ breloc->iter.zone_offset[zone]++;
+
+ if (!ftl_reloc_block_valid(breloc, *block_off)) {
+ ftl_reloc_clr_block(breloc, *block_off);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int
+ftl_reloc_first_valid_block(struct ftl_band_reloc *breloc, size_t *block_off)
+{
+ size_t i, num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev);
+
+ for (i = ftl_reloc_iter_zone_offset(breloc); i < num_blocks; ++i) {
+ if (ftl_reloc_iter_next(breloc, block_off)) {
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+ftl_reloc_iter_done(struct ftl_band_reloc *breloc)
+{
+ size_t i;
+ size_t num_zones = ftl_get_num_punits(breloc->band->dev);
+ size_t num_blocks = ftl_get_num_blocks_in_zone(breloc->parent->dev);
+
+ for (i = 0; i < num_zones; ++i) {
+ if (breloc->iter.zone_offset[i] != num_blocks) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static size_t
+ftl_reloc_find_valid_blocks(struct ftl_band_reloc *breloc,
+ size_t _num_blocks, struct ftl_addr *addr)
+{
+ size_t block_off, num_blocks = 0;
+
+ if (!ftl_reloc_first_valid_block(breloc, &block_off)) {
+ return 0;
+ }
+
+ *addr = ftl_band_addr_from_block_offset(breloc->band, block_off);
+
+ for (num_blocks = 1; num_blocks < _num_blocks; num_blocks++) {
+ if (!ftl_reloc_iter_next(breloc, &block_off)) {
+ break;
+ }
+ }
+
+ return num_blocks;
+}
+
+static size_t
+ftl_reloc_next_blocks(struct ftl_band_reloc *breloc, struct ftl_addr *addr)
+{
+ size_t i, num_blocks = 0;
+ struct spdk_ftl_dev *dev = breloc->parent->dev;
+
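+ /* Check the zones in round-robin order until up to xfer_size valid blocks */
+ /* are found or the whole band has been iterated */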
+ for (i = 0; i < ftl_get_num_punits(dev); ++i) {
+ num_blocks = ftl_reloc_find_valid_blocks(breloc, breloc->parent->xfer_size, addr);
+ ftl_reloc_iter_next_zone(breloc);
+
+ if (num_blocks || ftl_reloc_iter_done(breloc)) {
+ break;
+ }
+ }
+
+ return num_blocks;
+}
+
+static struct ftl_io *
+ftl_reloc_io_init(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move,
+ ftl_io_fn fn, enum ftl_io_type io_type, int flags)
+{
+ size_t block_off, i;
+ struct ftl_addr addr = move->addr;
+ struct ftl_io *io = NULL;
+ struct ftl_io_init_opts opts = {
+ .dev = breloc->parent->dev,
+ .band = breloc->band,
+ .size = sizeof(*io),
+ .flags = flags | FTL_IO_INTERNAL | FTL_IO_PHYSICAL_MODE,
+ .type = io_type,
+ .num_blocks = move->num_blocks,
+ .iovs = {
+ {
+ .iov_base = move->data,
+ .iov_len = move->num_blocks * FTL_BLOCK_SIZE,
+ }
+ },
+ .iovcnt = 1,
+ .cb_fn = fn,
+ };
+
+ io = ftl_io_init_internal(&opts);
+ if (!io) {
+ return NULL;
+ }
+
+ io->cb_ctx = move;
+ io->addr = move->addr;
+
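+ /* When the IO carries per-block LBAs, fill them from the band's lba_map; */
+ /* blocks that are no longer valid are marked FTL_LBA_INVALID */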
+ if (flags & FTL_IO_VECTOR_LBA) {
+ for (i = 0; i < io->num_blocks; ++i, ++addr.offset) {
+ block_off = ftl_band_block_offset_from_addr(breloc->band, addr);
+
+ if (!ftl_band_block_offset_valid(breloc->band, block_off)) {
+ io->lba.vector[i] = FTL_LBA_INVALID;
+ continue;
+ }
+
+ io->lba.vector[i] = breloc->band->lba_map.map[block_off];
+ }
+ }
+
+ ftl_trace_lba_io_init(io->dev, io);
+
+ return io;
+}
+
+static int
+ftl_reloc_write(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move)
+{
+ int io_flags = FTL_IO_WEAK | FTL_IO_VECTOR_LBA | FTL_IO_BYPASS_CACHE;
+
+ if (spdk_likely(!move->io)) {
+ move->io = ftl_reloc_io_init(breloc, move, ftl_reloc_write_cb,
+ FTL_IO_WRITE, io_flags);
+ if (!move->io) {
+ ftl_reloc_free_move(breloc, move);
+ STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+ return -ENOMEM;
+ }
+ }
+
+ breloc->num_outstanding++;
+ ftl_io_write(move->io);
+ return 0;
+}
+
+static int
+ftl_reloc_read(struct ftl_band_reloc *breloc, struct ftl_reloc_move *move)
+{
+ struct ftl_addr addr = {};
+
+ move->num_blocks = ftl_reloc_next_blocks(breloc, &addr);
+ move->breloc = breloc;
+ move->addr = addr;
+
+ if (!move->num_blocks) {
+ return 0;
+ }
+
+ move->data = spdk_dma_malloc(FTL_BLOCK_SIZE * move->num_blocks, 4096, NULL);
+ if (!move->data) {
+ return -1;
+ }
+
+ move->io = ftl_reloc_io_init(breloc, move, ftl_reloc_read_cb, FTL_IO_READ, 0);
+ if (!move->io) {
+ ftl_reloc_free_move(breloc, move);
+ STAILQ_INSERT_TAIL(&breloc->move_queue, move, entry);
+ SPDK_ERRLOG("Failed to initialize io for relocation.");
+ return -1;
+ }
+
+ breloc->num_outstanding++;
+ ftl_io_read(move->io);
+ return 0;
+}
+
+static void
+ftl_reloc_process_moves(struct ftl_band_reloc *breloc)
+{
+ struct ftl_reloc_move *move;
+ STAILQ_HEAD(, ftl_reloc_move) move_queue;
+ int rc = 0;
+
+ /*
+ * When IO allocation fails, we do not want to retry immediately, so keep the moves on
+ * a temporary queue
+ */
+ STAILQ_INIT(&move_queue);
+ STAILQ_SWAP(&breloc->move_queue, &move_queue, ftl_reloc_move);
+
+ while (!STAILQ_EMPTY(&move_queue)) {
+ move = STAILQ_FIRST(&move_queue);
+ STAILQ_REMOVE_HEAD(&move_queue, entry);
+
+ switch (move->state) {
+ case FTL_RELOC_STATE_READ_LBA_MAP:
+ rc = ftl_reloc_read_lba_map(breloc, move);
+ break;
+ case FTL_RELOC_STATE_READ:
+ rc = ftl_reloc_read(breloc, move);
+ break;
+ case FTL_RELOC_STATE_WRITE:
+ rc = ftl_reloc_write(breloc, move);
+ break;
+ default:
+ assert(false);
+ break;
+ }
+
+ if (rc) {
+ SPDK_ERRLOG("Move queue processing failed\n");
+ assert(false);
+ }
+ }
+}
+
+static bool
+ftl_reloc_done(struct ftl_band_reloc *breloc)
+{
+ return !breloc->num_outstanding && STAILQ_EMPTY(&breloc->move_queue);
+}
+
+static void
+ftl_reloc_release(struct ftl_band_reloc *breloc)
+{
+ struct ftl_reloc *reloc = breloc->parent;
+ struct ftl_band *band = breloc->band;
+
+ ftl_reloc_iter_reset(breloc);
+ ftl_band_release_lba_map(band);
+ reloc->num_active--;
+
+ if (breloc->state == FTL_BAND_RELOC_STATE_HIGH_PRIO) {
+ /* A high prio band must be relocated as a whole and ANM events will be ignored */
+ assert(breloc->num_blocks == 0 && ftl_band_empty(band));
+ TAILQ_REMOVE(&reloc->prio_queue, breloc, entry);
+ band->high_prio = 0;
+ breloc->state = FTL_BAND_RELOC_STATE_INACTIVE;
+ } else {
+ assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE);
+ TAILQ_REMOVE(&reloc->active_queue, breloc, entry);
+ breloc->state = FTL_BAND_RELOC_STATE_INACTIVE;
+
+ /* If we got an ANM event during relocation, put the band back on the pending queue */
+ if (breloc->num_blocks != 0) {
+ breloc->state = FTL_BAND_RELOC_STATE_PENDING;
+ TAILQ_INSERT_TAIL(&reloc->pending_queue, breloc, entry);
+ return;
+ }
+ }
+
+ if (ftl_band_empty(band) && band->state == FTL_BAND_STATE_CLOSED) {
+ ftl_band_set_state(breloc->band, FTL_BAND_STATE_FREE);
+
+ if (breloc->defrag) {
+ breloc->defrag = false;
+ assert(reloc->num_defrag_bands > 0);
+ reloc->num_defrag_bands--;
+ }
+ }
+}
+
+static void
+ftl_process_reloc(struct ftl_band_reloc *breloc)
+{
+ ftl_reloc_process_moves(breloc);
+
+ if (ftl_reloc_done(breloc)) {
+ ftl_reloc_release(breloc);
+ }
+}
+
+static int
+ftl_band_reloc_init(struct ftl_reloc *reloc, struct ftl_band_reloc *breloc,
+ struct ftl_band *band)
+{
+ breloc->band = band;
+ breloc->parent = reloc;
+
+ breloc->reloc_map = spdk_bit_array_create(ftl_get_num_blocks_in_band(reloc->dev));
+ if (!breloc->reloc_map) {
+ SPDK_ERRLOG("Failed to initialize reloc map");
+ return -1;
+ }
+
+ breloc->iter.zone_offset = calloc(ftl_get_num_punits(band->dev),
+ sizeof(*breloc->iter.zone_offset));
+ if (!breloc->iter.zone_offset) {
+ SPDK_ERRLOG("Failed to initialize reloc iterator");
+ return -1;
+ }
+
+ STAILQ_INIT(&breloc->move_queue);
+
+ breloc->moves = calloc(reloc->max_qdepth, sizeof(*breloc->moves));
+ if (!breloc->moves) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+ftl_band_reloc_free(struct ftl_band_reloc *breloc)
+{
+ struct ftl_reloc_move *move;
+
+ if (!breloc) {
+ return;
+ }
+
+ assert(breloc->num_outstanding == 0);
+
+ /* Drain the move queue if there is an active band relocation during shutdown */
+ if (breloc->state == FTL_BAND_RELOC_STATE_ACTIVE ||
+ breloc->state == FTL_BAND_RELOC_STATE_HIGH_PRIO) {
+ assert(breloc->parent->halt);
+ STAILQ_FOREACH(move, &breloc->move_queue, entry) {
+ ftl_reloc_free_move(breloc, move);
+ }
+ }
+
+ spdk_bit_array_free(&breloc->reloc_map);
+ free(breloc->iter.zone_offset);
+ free(breloc->moves);
+}
+
+struct ftl_reloc *
+ftl_reloc_init(struct spdk_ftl_dev *dev)
+{
+ struct ftl_reloc *reloc;
+ size_t i;
+
+ reloc = calloc(1, sizeof(*reloc));
+ if (!reloc) {
+ return NULL;
+ }
+
+ reloc->dev = dev;
+ reloc->halt = true;
+ reloc->max_qdepth = dev->conf.max_reloc_qdepth;
+ reloc->max_active = dev->conf.max_active_relocs;
+ reloc->xfer_size = dev->xfer_size;
+ reloc->num_defrag_bands = 0;
+
+ if (reloc->max_qdepth > FTL_RELOC_MAX_MOVES) {
+ goto error;
+ }
+
+ reloc->brelocs = calloc(ftl_get_num_bands(dev), sizeof(*reloc->brelocs));
+ if (!reloc->brelocs) {
+ goto error;
+ }
+
+ for (i = 0; i < ftl_get_num_bands(reloc->dev); ++i) {
+ if (ftl_band_reloc_init(reloc, &reloc->brelocs[i], &dev->bands[i])) {
+ goto error;
+ }
+ }
+
+ TAILQ_INIT(&reloc->pending_queue);
+ TAILQ_INIT(&reloc->active_queue);
+ TAILQ_INIT(&reloc->prio_queue);
+
+ return reloc;
+error:
+ ftl_reloc_free(reloc);
+ return NULL;
+}
+
+void
+ftl_reloc_free(struct ftl_reloc *reloc)
+{
+ size_t i;
+
+ if (!reloc) {
+ return;
+ }
+
+ for (i = 0; i < ftl_get_num_bands(reloc->dev); ++i) {
+ ftl_band_reloc_free(&reloc->brelocs[i]);
+ }
+
+ free(reloc->brelocs);
+ free(reloc);
+}
+
+bool
+ftl_reloc_is_halted(const struct ftl_reloc *reloc)
+{
+ return reloc->halt;
+}
+
+void
+ftl_reloc_halt(struct ftl_reloc *reloc)
+{
+ reloc->halt = true;
+}
+
+void
+ftl_reloc_resume(struct ftl_reloc *reloc)
+{
+ reloc->halt = false;
+}
+
+void
+ftl_reloc(struct ftl_reloc *reloc)
+{
+ struct ftl_band_reloc *breloc, *tbreloc;
+
+ if (ftl_reloc_is_halted(reloc)) {
+ return;
+ }
+
+ /* Process first band from priority queue and return */
+ breloc = TAILQ_FIRST(&reloc->prio_queue);
+ if (breloc) {
+ ftl_process_reloc(breloc);
+ return;
+ }
+
+ TAILQ_FOREACH_SAFE(breloc, &reloc->pending_queue, entry, tbreloc) {
+ if (reloc->num_active == reloc->max_active) {
+ break;
+ }
+
+ /* Wait for band to close before relocating */
+ if (breloc->band->state != FTL_BAND_STATE_CLOSED) {
+ continue;
+ }
+
+ ftl_reloc_prep(breloc);
+ assert(breloc->state == FTL_BAND_RELOC_STATE_PENDING);
+ TAILQ_REMOVE(&reloc->pending_queue, breloc, entry);
+ breloc->state = FTL_BAND_RELOC_STATE_ACTIVE;
+ TAILQ_INSERT_HEAD(&reloc->active_queue, breloc, entry);
+ }
+
+ TAILQ_FOREACH_SAFE(breloc, &reloc->active_queue, entry, tbreloc) {
+ assert(breloc->state == FTL_BAND_RELOC_STATE_ACTIVE);
+ ftl_process_reloc(breloc);
+ }
+}
+
+void
+ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band, size_t offset,
+ size_t num_blocks, int prio, bool is_defrag)
+{
+ struct ftl_band_reloc *breloc = &reloc->brelocs[band->id];
+ size_t i;
+
+ /* No need to add anything if already at high prio - whole band should be relocated */
+ if (!prio && band->high_prio) {
+ return;
+ }
+
+ pthread_spin_lock(&band->lba_map.lock);
+ if (band->lba_map.num_vld == 0) {
+ pthread_spin_unlock(&band->lba_map.lock);
+
+ /* If the band is closed and has no valid blocks, free it */
+ if (band->state == FTL_BAND_STATE_CLOSED) {
+ ftl_band_set_state(band, FTL_BAND_STATE_FREE);
+ }
+
+ return;
+ }
+ pthread_spin_unlock(&band->lba_map.lock);
+
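+ /* Mark the requested blocks in the reloc map, skipping any that are already marked */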
+ for (i = offset; i < offset + num_blocks; ++i) {
+ if (spdk_bit_array_get(breloc->reloc_map, i)) {
+ continue;
+ }
+ spdk_bit_array_set(breloc->reloc_map, i);
+ breloc->num_blocks++;
+ }
+
+ /* If the band is coming from the defrag process, mark it appropriately */
+ if (is_defrag) {
+ assert(offset == 0 && num_blocks == ftl_get_num_blocks_in_band(band->dev));
+ reloc->num_defrag_bands++;
+ breloc->defrag = true;
+ }
+
+ if (!prio) {
+ if (breloc->state == FTL_BAND_RELOC_STATE_INACTIVE) {
+ breloc->state = FTL_BAND_RELOC_STATE_PENDING;
+ TAILQ_INSERT_HEAD(&reloc->pending_queue, breloc, entry);
+ }
+ } else {
+ bool active = false;
+ /* If the priority band is already on the pending or active queue, remove it from that queue */
+ switch (breloc->state) {
+ case FTL_BAND_RELOC_STATE_PENDING:
+ TAILQ_REMOVE(&reloc->pending_queue, breloc, entry);
+ break;
+ case FTL_BAND_RELOC_STATE_ACTIVE:
+ active = true;
+ TAILQ_REMOVE(&reloc->active_queue, breloc, entry);
+ break;
+ default:
+ break;
+ }
+
+ breloc->state = FTL_BAND_RELOC_STATE_HIGH_PRIO;
+ TAILQ_INSERT_TAIL(&reloc->prio_queue, breloc, entry);
+
+ /*
+ * If the band was already on the active queue, it doesn't need any additional
+ * resources
+ */
+ if (!active) {
+ ftl_reloc_prep(breloc);
+ }
+ }
+}
diff --git a/src/spdk/lib/ftl/ftl_reloc.h b/src/spdk/lib/ftl/ftl_reloc.h
new file mode 100644
index 000000000..21f49a47d
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_reloc.h
@@ -0,0 +1,53 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FTL_RELOC_H
+#define FTL_RELOC_H
+
+#include "spdk/stdinc.h"
+#include "spdk/ftl.h"
+
+struct ftl_reloc;
+struct ftl_band;
+
+struct ftl_reloc *ftl_reloc_init(struct spdk_ftl_dev *dev);
+void ftl_reloc_free(struct ftl_reloc *reloc);
+void ftl_reloc_add(struct ftl_reloc *reloc, struct ftl_band *band,
+ size_t offset, size_t num_blocks, int prio, bool is_defrag);
+void ftl_reloc(struct ftl_reloc *reloc);
+void ftl_reloc_halt(struct ftl_reloc *reloc);
+void ftl_reloc_resume(struct ftl_reloc *reloc);
+bool ftl_reloc_is_halted(const struct ftl_reloc *reloc);
+bool ftl_reloc_is_defrag_active(const struct ftl_reloc *reloc);
+
+#endif /* FTL_RELOC_H */
diff --git a/src/spdk/lib/ftl/ftl_restore.c b/src/spdk/lib/ftl/ftl_restore.c
new file mode 100644
index 000000000..6f626645d
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_restore.c
@@ -0,0 +1,1350 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/ftl.h"
+#include "spdk/util.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/crc32.h"
+
+#include "ftl_core.h"
+#include "ftl_band.h"
+#include "ftl_io.h"
+
+struct ftl_restore_band {
+ struct ftl_restore *parent;
+ /* Associated band */
+ struct ftl_band *band;
+ /* Status of retrieving this band's metadata */
+ enum ftl_md_status md_status;
+ /* Link on the queue of bands to be padded (pad_bands) */
+ STAILQ_ENTRY(ftl_restore_band) stailq;
+};
+
+struct ftl_nv_cache_restore;
+
+/* Describes a single phase to be restored from the non-volatile cache */
+struct ftl_nv_cache_range {
+ struct ftl_nv_cache_restore *parent;
+ /* Start offset */
+ uint64_t start_addr;
+ /* Last block's address */
+ uint64_t last_addr;
+ /*
+ * Number of blocks (can be smaller than the difference between the last
+ * and the starting block due to range overlap)
+ */
+ uint64_t num_blocks;
+ /* Number of blocks already recovered */
+ uint64_t num_recovered;
+ /* Current address during recovery */
+ uint64_t current_addr;
+ /* Phase of the range */
+ unsigned int phase;
+ /* Indicates whether the data from this range needs to be recovered */
+ bool recovery;
+};
+
+struct ftl_nv_cache_block {
+ struct ftl_nv_cache_restore *parent;
+ /* Data buffer */
+ void *buf;
+ /* Metadata buffer */
+ void *md_buf;
+ /* Block offset within the cache */
+ uint64_t offset;
+};
+
+struct ftl_nv_cache_restore {
+ struct ftl_nv_cache *nv_cache;
+ /* IO channel to use */
+ struct spdk_io_channel *ioch;
+ /*
+ * Non-volatile cache ranges. The ranges can overlap, as we have no
+ * control over the order of completions. The phase of the range is the
+ * index within the table. The range with index 0 marks blocks that were
+ * never written.
+ */
+ struct ftl_nv_cache_range range[FTL_NV_CACHE_PHASE_COUNT];
+#define FTL_NV_CACHE_RESTORE_DEPTH 128
+ /* Non-volatile cache buffers */
+ struct ftl_nv_cache_block block[FTL_NV_CACHE_RESTORE_DEPTH];
+ /* Current address */
+ uint64_t current_addr;
+ /* Number of outstanding requests */
+ size_t num_outstanding;
+ /* Recovery/scan status */
+ int status;
+ /* Current phase of the recovery */
+ unsigned int phase;
+};
+
+struct ftl_restore {
+ struct spdk_ftl_dev *dev;
+ /* Completion callback (called for each phase of the restoration) */
+ ftl_restore_fn cb;
+ /* Completion callback context */
+ void *cb_arg;
+ /* Number of inflight IOs */
+ unsigned int num_ios;
+ /* Current band number (index in the below bands array) */
+ unsigned int current;
+ /* Array of bands */
+ struct ftl_restore_band *bands;
+ /* Queue of bands to be padded (due to unsafe shutdown) */
+ STAILQ_HEAD(, ftl_restore_band) pad_bands;
+ /* Status of the padding */
+ int pad_status;
+ /* Metadata buffer */
+ void *md_buf;
+ /* LBA map buffer */
+ void *lba_map;
+ /* Indicates we're in the final phase of the restoration */
+ bool final_phase;
+ /* Non-volatile cache recovery */
+ struct ftl_nv_cache_restore nv_cache;
+};
+
+static int
+ftl_restore_tail_md(struct ftl_restore_band *rband);
+static void
+ftl_pad_zone_cb(struct ftl_io *io, void *arg, int status);
+static void
+ftl_restore_pad_band(struct ftl_restore_band *rband);
+
+static void
+ftl_restore_free(struct ftl_restore *restore)
+{
+ unsigned int i;
+
+ if (!restore) {
+ return;
+ }
+
+ for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) {
+ spdk_dma_free(restore->nv_cache.block[i].buf);
+ }
+
+ spdk_dma_free(restore->md_buf);
+ free(restore->bands);
+ free(restore);
+}
+
+static struct ftl_restore *
+ftl_restore_init(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg)
+{
+ struct ftl_restore *restore;
+ struct ftl_restore_band *rband;
+ size_t i;
+
+ restore = calloc(1, sizeof(*restore));
+ if (!restore) {
+ goto error;
+ }
+
+ restore->dev = dev;
+ restore->cb = cb;
+ restore->cb_arg = cb_arg;
+ restore->final_phase = false;
+
+ restore->bands = calloc(ftl_get_num_bands(dev), sizeof(*restore->bands));
+ if (!restore->bands) {
+ goto error;
+ }
+
+ STAILQ_INIT(&restore->pad_bands);
+
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ rband = &restore->bands[i];
+ rband->band = &dev->bands[i];
+ rband->parent = restore;
+ rband->md_status = FTL_MD_NO_MD;
+ }
+
+ /* Allocate a buffer capable of holding the head md of all bands */
+ restore->md_buf = spdk_dma_zmalloc(ftl_get_num_bands(dev) * ftl_head_md_num_blocks(dev) *
+ FTL_BLOCK_SIZE, 0, NULL);
+ if (!restore->md_buf) {
+ goto error;
+ }
+
+ return restore;
+error:
+ ftl_restore_free(restore);
+ return NULL;
+}
+
+static void
+ftl_restore_complete(struct ftl_restore *restore, int status)
+{
+ struct ftl_restore *ctx = status ? NULL : restore;
+ bool final_phase = restore->final_phase;
+
+ restore->cb(ctx, status, restore->cb_arg);
+ if (status || final_phase) {
+ ftl_restore_free(restore);
+ }
+}
+
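+/* qsort comparator ordering restore bands by sequence number; duplicate sequence */
+/* numbers are detected later by ftl_restore_check_seq() */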
+static int
+ftl_band_cmp(const void *lband, const void *rband)
+{
+ uint64_t lseq = ((struct ftl_restore_band *)lband)->band->seq;
+ uint64_t rseq = ((struct ftl_restore_band *)rband)->band->seq;
+
+ if (lseq < rseq) {
+ return -1;
+ } else {
+ return 1;
+ }
+}
+
+static int
+ftl_restore_check_seq(const struct ftl_restore *restore)
+{
+ const struct spdk_ftl_dev *dev = restore->dev;
+ const struct ftl_restore_band *rband;
+ const struct ftl_band *next_band;
+ size_t i;
+
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ rband = &restore->bands[i];
+ if (rband->md_status != FTL_MD_SUCCESS) {
+ continue;
+ }
+
+ next_band = LIST_NEXT(rband->band, list_entry);
+ if (next_band && rband->band->seq == next_band->seq) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static bool
+ftl_restore_head_valid(struct spdk_ftl_dev *dev, struct ftl_restore *restore, size_t *num_valid)
+{
+ struct ftl_restore_band *rband;
+ size_t i;
+
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ rband = &restore->bands[i];
+
+ if (rband->md_status != FTL_MD_SUCCESS &&
+ rband->md_status != FTL_MD_NO_MD &&
+ rband->md_status != FTL_MD_IO_FAILURE) {
+ SPDK_ERRLOG("Inconsistent head metadata found on band %u\n",
+ rband->band->id);
+ return false;
+ }
+
+ if (rband->md_status == FTL_MD_SUCCESS) {
+ (*num_valid)++;
+ }
+ }
+
+ return true;
+}
+
+static void
+ftl_restore_head_complete(struct ftl_restore *restore)
+{
+ struct spdk_ftl_dev *dev = restore->dev;
+ size_t num_valid = 0;
+ int status = -EIO;
+
+ if (!ftl_restore_head_valid(dev, restore, &num_valid)) {
+ goto out;
+ }
+
+ if (num_valid == 0) {
+ SPDK_ERRLOG("Couldn't find any valid bands\n");
+ goto out;
+ }
+
+ /* Sort bands in sequence number ascending order */
+ qsort(restore->bands, ftl_get_num_bands(dev), sizeof(struct ftl_restore_band),
+ ftl_band_cmp);
+
+ if (ftl_restore_check_seq(restore)) {
+ SPDK_ERRLOG("Band sequence consistency failed\n");
+ goto out;
+ }
+
+ dev->num_lbas = dev->global_md.num_lbas;
+ status = 0;
+out:
+ ftl_restore_complete(restore, status);
+}
+
+static void
+ftl_restore_head_cb(struct ftl_io *io, void *ctx, int status)
+{
+ struct ftl_restore_band *rband = ctx;
+ struct ftl_restore *restore = rband->parent;
+ unsigned int num_ios;
+
+ rband->md_status = status;
+ num_ios = __atomic_fetch_sub(&restore->num_ios, 1, __ATOMIC_SEQ_CST);
+ assert(num_ios > 0);
+
+ if (num_ios == 1) {
+ ftl_restore_head_complete(restore);
+ }
+}
+
+static void
+ftl_restore_head_md(void *ctx)
+{
+ struct ftl_restore *restore = ctx;
+ struct spdk_ftl_dev *dev = restore->dev;
+ struct ftl_restore_band *rband;
+ struct ftl_lba_map *lba_map;
+ unsigned int num_failed = 0, num_ios;
+ size_t i;
+
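+ /* One head metadata read is issued per band; ftl_restore_head_cb() */
+ /* decrements num_ios as each one completes */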
+ restore->num_ios = ftl_get_num_bands(dev);
+
+ for (i = 0; i < ftl_get_num_bands(dev); ++i) {
+ rband = &restore->bands[i];
+ lba_map = &rband->band->lba_map;
+
+ lba_map->dma_buf = restore->md_buf + i * ftl_head_md_num_blocks(dev) * FTL_BLOCK_SIZE;
+
+ if (ftl_band_read_head_md(rband->band, ftl_restore_head_cb, rband)) {
+ if (spdk_likely(rband->band->num_zones)) {
+ SPDK_ERRLOG("Failed to read metadata on band %zu\n", i);
+
+ rband->md_status = FTL_MD_INVALID_CRC;
+
+ /* If the first IO fails, don't bother sending anything else */
+ if (i == 0) {
+ ftl_restore_complete(restore, -EIO);
+ }
+ }
+
+ num_failed++;
+ }
+ }
+
+ if (spdk_unlikely(num_failed > 0)) {
+ num_ios = __atomic_fetch_sub(&restore->num_ios, num_failed, __ATOMIC_SEQ_CST);
+ if (num_ios == num_failed) {
+ ftl_restore_complete(restore, -EIO);
+ }
+ }
+}
+
+int
+ftl_restore_md(struct spdk_ftl_dev *dev, ftl_restore_fn cb, void *cb_arg)
+{
+ struct ftl_restore *restore;
+
+ restore = ftl_restore_init(dev, cb, cb_arg);
+ if (!restore) {
+ return -ENOMEM;
+ }
+
+ spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_head_md, restore);
+
+ return 0;
+}
+
+static int
+ftl_restore_l2p(struct ftl_band *band)
+{
+ struct spdk_ftl_dev *dev = band->dev;
+ struct ftl_addr addr;
+ uint64_t lba;
+ size_t i;
+
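+ /* Rebuild the L2P from the band's valid lba_map entries, invalidating any */
+ /* previous mapping before pointing each LBA at this band */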
+ for (i = 0; i < ftl_get_num_blocks_in_band(band->dev); ++i) {
+ if (!spdk_bit_array_get(band->lba_map.vld, i)) {
+ continue;
+ }
+
+ lba = band->lba_map.map[i];
+ if (lba >= dev->num_lbas) {
+ return -1;
+ }
+
+ addr = ftl_l2p_get(dev, lba);
+ if (!ftl_addr_invalid(addr)) {
+ ftl_invalidate_addr(dev, addr);
+ }
+
+ addr = ftl_band_addr_from_block_offset(band, i);
+
+ ftl_band_set_addr(band, lba, addr);
+ ftl_l2p_set(dev, lba, addr);
+ }
+
+ return 0;
+}
+
+static struct ftl_restore_band *
+ftl_restore_next_band(struct ftl_restore *restore)
+{
+ struct ftl_restore_band *rband;
+
+ for (; restore->current < ftl_get_num_bands(restore->dev); ++restore->current) {
+ rband = &restore->bands[restore->current];
+
+ if (spdk_likely(rband->band->num_zones) &&
+ rband->md_status == FTL_MD_SUCCESS) {
+ restore->current++;
+ return rband;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+ftl_nv_cache_restore_complete(struct ftl_nv_cache_restore *restore, int status)
+{
+ struct ftl_restore *ftl_restore = SPDK_CONTAINEROF(restore, struct ftl_restore, nv_cache);
+
+ restore->status = restore->status ? : status;
+ if (restore->num_outstanding == 0) {
+ ftl_restore_complete(ftl_restore, restore->status);
+ }
+}
+
+static void ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
+
+static void
+ftl_nv_cache_restore_done(struct ftl_nv_cache_restore *restore, uint64_t current_addr)
+{
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+
+ pthread_spin_lock(&nv_cache->lock);
+ nv_cache->current_addr = current_addr;
+ nv_cache->ready = true;
+ pthread_spin_unlock(&nv_cache->lock);
+
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Enabling non-volatile cache (phase: %u, addr: %"
+ PRIu64")\n", nv_cache->phase, current_addr);
+
+ ftl_nv_cache_restore_complete(restore, 0);
+}
+
+static void
+ftl_nv_cache_write_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_nv_cache_restore *restore = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n");
+ ftl_nv_cache_restore_complete(restore, -EIO);
+ return;
+ }
+
+ ftl_nv_cache_restore_done(restore, FTL_NV_CACHE_DATA_OFFSET);
+}
+
+static void
+ftl_nv_cache_scrub_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_nv_cache_restore *restore = cb_arg;
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+ int rc;
+
+ spdk_bdev_free_io(bdev_io);
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Scrubbing non-volatile cache failed\n");
+ ftl_nv_cache_restore_complete(restore, -EIO);
+ return;
+ }
+
+ nv_cache->phase = 1;
+ rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_write_header_cb, restore);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to write the non-volatile cache metadata header: %s\n",
+ spdk_strerror(-rc));
+ ftl_nv_cache_restore_complete(restore, -EIO);
+ }
+}
+
+static void
+ftl_nv_cache_scrub_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_nv_cache_restore *restore = cb_arg;
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+ int rc;
+
+ spdk_bdev_free_io(bdev_io);
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Unable to write non-volatile cache metadata header\n");
+ ftl_nv_cache_restore_complete(restore, -EIO);
+ return;
+ }
+
+ rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n", spdk_strerror(-rc));
+ ftl_nv_cache_restore_complete(restore, rc);
+ }
+}
+
+static void
+ftl_nv_cache_band_flush_cb(void *ctx, int status)
+{
+ struct ftl_nv_cache_restore *restore = ctx;
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+ int rc;
+
+ if (spdk_unlikely(status != 0)) {
+ SPDK_ERRLOG("Flushing active bands failed: %s\n", spdk_strerror(-status));
+ ftl_nv_cache_restore_complete(restore, status);
+ return;
+ }
+
+ /*
+ * Use phase 0 to indicate that the cache is being scrubbed. If the power is lost during
+ * this process, we'll know it needs to be resumed.
+ */
+ nv_cache->phase = 0;
+ rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_scrub_header_cb, restore);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to write non-volatile cache metadata header: %s\n",
+ spdk_strerror(-rc));
+ ftl_nv_cache_restore_complete(restore, rc);
+ }
+}
+
+static void
+ftl_nv_cache_wbuf_flush_cb(void *ctx, int status)
+{
+ struct ftl_nv_cache_restore *restore = ctx;
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+ struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache);
+ int rc;
+
+ if (spdk_unlikely(status != 0)) {
+ SPDK_ERRLOG("Flushing the write buffer failed: %s\n", spdk_strerror(-status));
+ ftl_nv_cache_restore_complete(restore, status);
+ return;
+ }
+
+ rc = ftl_flush_active_bands(dev, ftl_nv_cache_band_flush_cb, restore);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to flush active bands: %s\n", spdk_strerror(-rc));
+ ftl_nv_cache_restore_complete(restore, rc);
+ }
+}
+
+static void
+ftl_nv_cache_recovery_done(struct ftl_nv_cache_restore *restore)
+{
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+ struct ftl_nv_cache_range *range_prev, *range_current;
+ struct spdk_ftl_dev *dev = SPDK_CONTAINEROF(nv_cache, struct spdk_ftl_dev, nv_cache);
+ struct spdk_bdev *bdev;
+ uint64_t current_addr;
+ int rc;
+
+ range_prev = &restore->range[ftl_nv_cache_prev_phase(nv_cache->phase)];
+ range_current = &restore->range[nv_cache->phase];
+ bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
+
+ /*
+ * If there are more than two ranges or the ranges overlap, scrub the non-volatile cache to
+ * make sure that any subsequent power loss will find the cache in usable state
+ */
+ if ((range_prev->num_blocks + range_current->num_blocks < nv_cache->num_data_blocks) ||
+ (range_prev->start_addr < range_current->last_addr &&
+ range_current->start_addr < range_prev->last_addr)) {
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache inconsistency detected\n");
+
+ rc = ftl_flush_wbuf(dev, ftl_nv_cache_wbuf_flush_cb, restore);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to flush the write buffer: %s\n", spdk_strerror(-rc));
+ ftl_nv_cache_restore_complete(restore, rc);
+ }
+
+ return;
+ }
+
+ /* The latest phase is the one written in the header (set in nv_cache->phase) */
+ current_addr = range_current->last_addr + 1;
+
+ /*
+ * The first range might be empty (only the header was written) or the range might
+ * end at the last available address, in which case set current address to the
+ * beginning of the device.
+ */
+ if (range_current->num_blocks == 0 || current_addr >= spdk_bdev_get_num_blocks(bdev)) {
+ current_addr = FTL_NV_CACHE_DATA_OFFSET;
+ }
+
+ ftl_nv_cache_restore_done(restore, current_addr);
+}
+
+static void
+ftl_nv_cache_recover_block(struct ftl_nv_cache_block *block)
+{
+ struct ftl_nv_cache_restore *restore = block->parent;
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+ struct ftl_nv_cache_range *range = &restore->range[restore->phase];
+ int rc;
+
+ assert(range->current_addr <= range->last_addr);
+
+ restore->num_outstanding++;
+ block->offset = range->current_addr++;
+ rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch,
+ block->buf, block->md_buf,
+ block->offset, 1, ftl_nv_cache_block_read_cb,
+ block);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n",
+ block->offset, spdk_strerror(-rc));
+ restore->num_outstanding--;
+ ftl_nv_cache_restore_complete(restore, rc);
+ }
+}
+
+static void
+ftl_nv_cache_recover_range(struct ftl_nv_cache_restore *restore)
+{
+ struct ftl_nv_cache_range *range;
+ unsigned int phase = restore->phase;
+
+ do {
+ /* Find first range with non-zero number of blocks that is marked for recovery */
+ range = &restore->range[phase];
+ if (range->recovery && range->num_recovered < range->num_blocks) {
+ break;
+ }
+
+ phase = ftl_nv_cache_next_phase(phase);
+ } while (phase != restore->phase);
+
+ /* There are no ranges to be recovered, we're done */
+ if (range->num_recovered == range->num_blocks || !range->recovery) {
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Non-volatile cache recovery done\n");
+ ftl_nv_cache_recovery_done(restore);
+ return;
+ }
+
+ range->current_addr = range->start_addr;
+ restore->phase = phase;
+
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Recovering range %u %"PRIu64"-%"PRIu64" (%"PRIu64")\n",
+ phase, range->start_addr, range->last_addr, range->num_blocks);
+
+ ftl_nv_cache_recover_block(&restore->block[0]);
+}
+
+static void
+ftl_nv_cache_write_cb(struct ftl_io *io, void *cb_arg, int status)
+{
+ struct ftl_nv_cache_block *block = cb_arg;
+ struct ftl_nv_cache_restore *restore = block->parent;
+ struct ftl_nv_cache_range *range = &restore->range[restore->phase];
+
+ restore->num_outstanding--;
+ if (status != 0) {
+ SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64" (%s)\n",
+ block->offset, spdk_strerror(-status));
+ ftl_nv_cache_restore_complete(restore, -ENOMEM);
+ return;
+ }
+
+ range->num_recovered++;
+ if (range->current_addr <= range->last_addr) {
+ ftl_nv_cache_recover_block(block);
+ } else if (restore->num_outstanding == 0) {
+ assert(range->num_recovered == range->num_blocks);
+ ftl_nv_cache_recover_range(restore);
+ }
+}
+
+static struct ftl_io *
+ftl_nv_cache_alloc_io(struct ftl_nv_cache_block *block, uint64_t lba)
+{
+ struct ftl_restore *restore = SPDK_CONTAINEROF(block->parent, struct ftl_restore, nv_cache);
+ struct ftl_io_init_opts opts = {
+ .dev = restore->dev,
+ .io = NULL,
+ .flags = FTL_IO_BYPASS_CACHE,
+ .type = FTL_IO_WRITE,
+ .num_blocks = 1,
+ .cb_fn = ftl_nv_cache_write_cb,
+ .cb_ctx = block,
+ .iovs = {
+ {
+ .iov_base = block->buf,
+ .iov_len = FTL_BLOCK_SIZE,
+ }
+ },
+ .iovcnt = 1,
+ };
+ struct ftl_io *io;
+
+ io = ftl_io_init_internal(&opts);
+ if (spdk_unlikely(!io)) {
+ return NULL;
+ }
+
+ io->lba.single = lba;
+ return io;
+}
+
+static void
+ftl_nv_cache_block_read_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_nv_cache_block *block = cb_arg;
+ struct ftl_nv_cache_restore *restore = block->parent;
+ struct ftl_nv_cache_range *range = &restore->range[restore->phase];
+ struct ftl_io *io;
+ unsigned int phase;
+ uint64_t lba;
+
+ spdk_bdev_free_io(bdev_io);
+ restore->num_outstanding--;
+
+ if (!success) {
+ SPDK_ERRLOG("Non-volatile cache restoration failed on block %"PRIu64"\n",
+ block->offset);
+ ftl_nv_cache_restore_complete(restore, -EIO);
+ return;
+ }
+
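+ /* The block's metadata encodes its LBA and write phase; blocks from a phase */
+ /* other than the one being recovered are skipped */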
+ ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase);
+ if (spdk_unlikely(phase != restore->phase)) {
+ if (range->current_addr < range->last_addr) {
+ ftl_nv_cache_recover_block(block);
+ } else if (restore->num_outstanding == 0) {
+ ftl_nv_cache_recover_range(restore);
+ }
+
+ return;
+ }
+
+ io = ftl_nv_cache_alloc_io(block, lba);
+ if (spdk_unlikely(!io)) {
+ SPDK_ERRLOG("Failed to allocate ftl_io during non-volatile cache recovery\n");
+ ftl_nv_cache_restore_complete(restore, -ENOMEM);
+ return;
+ }
+
+ restore->num_outstanding++;
+ ftl_io_write(io);
+}
+
+/*
+ * Since we have no control over the order in which the requests complete with respect to their
+ * submission, the cache can be in any of the following states:
+ * - [1 1 1 1 1 1 1 1 1 1]: the simplest case, the whole cache contains a single phase (although it
+ * should be very rare),
+ * - [1 1 1 1 3 3 3 3 3 3]: two phases, changing somewhere in the middle with no overlap. This is
+ * the state left by a clean shutdown,
+ * - [1 1 1 1 3 1 3 3 3 3]: similar to the above, but this time the two ranges overlap. This
+ * happens when completions are reordered during an unsafe shutdown,
+ * - [2 1 2 1 1 1 1 3 1 3]: three different phases, each of which can overlap with the
+ * previous/next one. The data from the oldest phase doesn't need to be
+ * recovered, as it has already started to be overwritten, which means it's
+ * already on the main storage.
+ */
+static void
+ftl_nv_cache_scan_done(struct ftl_nv_cache_restore *restore)
+{
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+#if defined(DEBUG)
+ struct ftl_nv_cache_range *range;
+ uint64_t i, num_blocks = 0;
+
+ for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) {
+ range = &restore->range[i];
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Range %"PRIu64": %"PRIu64"-%"PRIu64" (%" PRIu64
+ ")\n", i, range->start_addr, range->last_addr, range->num_blocks);
+ num_blocks += range->num_blocks;
+ }
+ assert(num_blocks == nv_cache->num_data_blocks);
+#endif
+ restore->phase = ftl_nv_cache_prev_phase(nv_cache->phase);
+
+ /*
+ * Only the latest two phases need to be recovered. The third one, even if present,
+ * must already be stored on the main storage, as it has already started to be
+ * overwritten (it's only present here because of the reordering of requests' completions).
+ */
+ restore->range[nv_cache->phase].recovery = true;
+ restore->range[restore->phase].recovery = true;
+
+ ftl_nv_cache_recover_range(restore);
+}
+
+static int ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block);
+
+static void
+ftl_nv_cache_scan_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_nv_cache_block *block = cb_arg;
+ struct ftl_nv_cache_restore *restore = block->parent;
+ struct ftl_nv_cache_range *range;
+ struct spdk_bdev *bdev;
+ unsigned int phase;
+ uint64_t lba;
+
+ restore->num_outstanding--;
+ bdev = spdk_bdev_desc_get_bdev(restore->nv_cache->bdev_desc);
+ spdk_bdev_free_io(bdev_io);
+
+ if (!success) {
+ SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64"\n",
+ block->offset);
+ ftl_nv_cache_restore_complete(restore, -EIO);
+ return;
+ }
+
+ /* If we've already hit an error, don't bother with scanning anything else */
+ if (spdk_unlikely(restore->status != 0)) {
+ ftl_nv_cache_restore_complete(restore, restore->status);
+ return;
+ }
+
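+ /* Classify the block by the phase stored in its metadata and extend */
+ /* that phase's address range */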
+ ftl_nv_cache_unpack_lba(*(uint64_t *)block->md_buf, &lba, &phase);
+ range = &restore->range[phase];
+ range->num_blocks++;
+
+ if (range->start_addr == FTL_LBA_INVALID || range->start_addr > block->offset) {
+ range->start_addr = block->offset;
+ }
+
+ if (range->last_addr == FTL_LBA_INVALID || range->last_addr < block->offset) {
+ range->last_addr = block->offset;
+ }
+
+ /* All the blocks have been submitted for reading; once they all complete, we're finished */
+ if (restore->current_addr == spdk_bdev_get_num_blocks(bdev)) {
+ if (restore->num_outstanding == 0) {
+ ftl_nv_cache_scan_done(restore);
+ }
+
+ return;
+ }
+
+ ftl_nv_cache_scan_block(block);
+}
+
+static int
+ftl_nv_cache_scan_block(struct ftl_nv_cache_block *block)
+{
+ struct ftl_nv_cache_restore *restore = block->parent;
+ struct ftl_nv_cache *nv_cache = restore->nv_cache;
+ int rc;
+
+ restore->num_outstanding++;
+ block->offset = restore->current_addr++;
+ rc = spdk_bdev_read_blocks_with_md(nv_cache->bdev_desc, restore->ioch,
+ block->buf, block->md_buf,
+ block->offset, 1, ftl_nv_cache_scan_cb,
+ block);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Non-volatile cache scan failed on block %"PRIu64" (%s)\n",
+ block->offset, spdk_strerror(-rc));
+ restore->num_outstanding--;
+ ftl_nv_cache_restore_complete(restore, rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+ftl_nv_cache_clean_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_nv_cache_restore *restore = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("Unable to write the non-volatile cache metadata header\n");
+ ftl_nv_cache_restore_complete(restore, -EIO);
+ return;
+ }
+
+ ftl_nv_cache_restore_done(restore, restore->current_addr);
+}
+
+static bool
+ftl_nv_cache_header_valid(struct spdk_ftl_dev *dev, const struct ftl_nv_cache_header *hdr)
+{
+ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(dev->nv_cache.bdev_desc);
+ uint32_t checksum;
+
+ checksum = spdk_crc32c_update(hdr, offsetof(struct ftl_nv_cache_header, checksum), 0);
+ if (checksum != hdr->checksum) {
+ SPDK_ERRLOG("Invalid header checksum (found: %"PRIu32", expected: %"PRIu32")\n",
+ checksum, hdr->checksum);
+ return false;
+ }
+
+ if (hdr->version != FTL_NV_CACHE_HEADER_VERSION) {
+ SPDK_ERRLOG("Invalid header version (found: %"PRIu32", expected: %"PRIu32")\n",
+ hdr->version, FTL_NV_CACHE_HEADER_VERSION);
+ return false;
+ }
+
+ if (hdr->size != spdk_bdev_get_num_blocks(bdev)) {
+ SPDK_ERRLOG("Unexpected size of the non-volatile cache bdev (%"PRIu64", expected: %"
+ PRIu64")\n", hdr->size, spdk_bdev_get_num_blocks(bdev));
+ return false;
+ }
+
+ if (spdk_uuid_compare(&hdr->uuid, &dev->uuid)) {
+ SPDK_ERRLOG("Invalid device UUID\n");
+ return false;
+ }
+
+ if (!ftl_nv_cache_phase_is_valid(hdr->phase) && hdr->phase != 0) {
+ return false;
+ }
+
+ if ((hdr->current_addr >= spdk_bdev_get_num_blocks(bdev) ||
+ hdr->current_addr < FTL_NV_CACHE_DATA_OFFSET) &&
+ (hdr->current_addr != FTL_LBA_INVALID)) {
+ SPDK_ERRLOG("Unexpected value of non-volatile cache's current address: %"PRIu64"\n",
+ hdr->current_addr);
+ return false;
+ }
+
+ return true;
+}
+
+static void
+ftl_nv_cache_read_header_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct ftl_restore *restore = cb_arg;
+ struct spdk_ftl_dev *dev = restore->dev;
+ struct ftl_nv_cache *nv_cache = &dev->nv_cache;
+ struct ftl_nv_cache_header *hdr;
+ struct iovec *iov = NULL;
+ int iov_cnt = 0, i, rc;
+
+ if (!success) {
+ SPDK_ERRLOG("Unable to read non-volatile cache metadata header\n");
+ ftl_restore_complete(restore, -ENOTRECOVERABLE);
+ goto out;
+ }
+
+ spdk_bdev_io_get_iovec(bdev_io, &iov, &iov_cnt);
+ assert(iov != NULL);
+ hdr = iov[0].iov_base;
+
+ if (!ftl_nv_cache_header_valid(dev, hdr)) {
+ ftl_restore_complete(restore, -ENOTRECOVERABLE);
+ goto out;
+ }
+
+ /* Remember the latest phase */
+ nv_cache->phase = hdr->phase;
+
+ /* If the phase equals zero, we lost power during recovery. We need to finish it up
+ * by scrubbing the device once again.
+ */
+ if (hdr->phase == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_FTL_INIT, "Detected phase 0, restarting scrub\n");
+ rc = ftl_nv_cache_scrub(nv_cache, ftl_nv_cache_scrub_cb, restore);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Unable to scrub the non-volatile cache: %s\n",
+ spdk_strerror(-rc));
+ ftl_restore_complete(restore, -ENOTRECOVERABLE);
+ }
+
+ goto out;
+ }
+
+ /* Valid current_addr means that the shutdown was clean, so we just need to overwrite the
+ * header to make sure that any power loss occurring before the cache is wrapped won't be
+ * mistaken for a clean shutdown.
+ */
+ if (hdr->current_addr != FTL_LBA_INVALID) {
+ restore->nv_cache.current_addr = hdr->current_addr;
+
+ rc = ftl_nv_cache_write_header(nv_cache, false, ftl_nv_cache_clean_header_cb,
+ &restore->nv_cache);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Failed to overwrite the non-volatile cache header: %s\n",
+ spdk_strerror(-rc));
+ ftl_restore_complete(restore, -ENOTRECOVERABLE);
+ }
+
+ goto out;
+ }
+
+ /* Otherwise the shutdown was unexpected, so we need to recover the data from the cache */
+ restore->nv_cache.current_addr = FTL_NV_CACHE_DATA_OFFSET;
+
+ for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) {
+ if (ftl_nv_cache_scan_block(&restore->nv_cache.block[i])) {
+ break;
+ }
+ }
+out:
+ spdk_bdev_free_io(bdev_io);
+}
+
+void
+ftl_restore_nv_cache(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg)
+{
+ struct spdk_ftl_dev *dev = restore->dev;
+ struct spdk_bdev *bdev;
+ struct ftl_nv_cache *nv_cache = &dev->nv_cache;
+ struct ftl_io_channel *ioch;
+ struct ftl_nv_cache_restore *nvc_restore = &restore->nv_cache;
+ struct ftl_nv_cache_block *block;
+ size_t alignment;
+ int rc, i;
+
+ ioch = ftl_io_channel_get_ctx(ftl_get_io_channel(dev));
+ bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
+ alignment = spdk_max(spdk_bdev_get_buf_align(bdev), sizeof(uint64_t));
+
+ nvc_restore->nv_cache = nv_cache;
+ nvc_restore->ioch = ioch->cache_ioch;
+
+ restore->final_phase = true;
+ restore->cb = cb;
+ restore->cb_arg = cb_arg;
+
+ for (i = 0; i < FTL_NV_CACHE_RESTORE_DEPTH; ++i) {
+ block = &nvc_restore->block[i];
+ block->parent = nvc_restore;
+ block->buf = spdk_dma_zmalloc(spdk_bdev_get_block_size(bdev) +
+ spdk_bdev_get_md_size(bdev),
+ alignment, NULL);
+ if (!block->buf) {
+ /* The memory will be freed in ftl_restore_free */
+ SPDK_ERRLOG("Unable to allocate memory\n");
+ ftl_restore_complete(restore, -ENOMEM);
+ return;
+ }
+
+ block->md_buf = (char *)block->buf + spdk_bdev_get_block_size(bdev);
+ }
+
+ for (i = 0; i < FTL_NV_CACHE_PHASE_COUNT; ++i) {
+ nvc_restore->range[i].parent = nvc_restore;
+ nvc_restore->range[i].start_addr = FTL_LBA_INVALID;
+ nvc_restore->range[i].last_addr = FTL_LBA_INVALID;
+ nvc_restore->range[i].num_blocks = 0;
+ nvc_restore->range[i].recovery = false;
+ nvc_restore->range[i].phase = i;
+ }
+
+ rc = spdk_bdev_read_blocks(nv_cache->bdev_desc, ioch->cache_ioch, nv_cache->dma_buf,
+ 0, FTL_NV_CACHE_DATA_OFFSET, ftl_nv_cache_read_header_cb, restore);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Failed to read non-volatile cache metadata header: %s\n",
+ spdk_strerror(-rc));
+ ftl_restore_complete(restore, rc);
+ }
+}
+
+static bool
+ftl_pad_zone_pad_finish(struct ftl_restore_band *rband, bool direct_access)
+{
+ struct ftl_restore *restore = rband->parent;
+ struct ftl_restore_band *next_band;
+ size_t i, num_pad_zones = 0;
+
+ if (spdk_unlikely(restore->pad_status && !restore->num_ios)) {
+ if (direct_access) {
+ /* If any errors were found, we want to clear direct access. */
+ /* Direct access bands have their own allocated md, which would otherwise */
+ /* be lost when the restore completes. */
+ rband->band->state = FTL_BAND_STATE_CLOSED;
+ ftl_band_set_direct_access(rband->band, false);
+ }
+ ftl_restore_complete(restore, restore->pad_status);
+ return true;
+ }
+
+ for (i = 0; i < rband->band->num_zones; ++i) {
+ if (rband->band->zone_buf[i].info.state != SPDK_BDEV_ZONE_STATE_FULL) {
+ num_pad_zones++;
+ }
+ }
+
+ /* Finished all zones in a band, check if all bands are done */
+ if (num_pad_zones == 0) {
+ if (direct_access) {
+ rband->band->state = FTL_BAND_STATE_CLOSED;
+ ftl_band_set_direct_access(rband->band, false);
+ }
+
+ next_band = STAILQ_NEXT(rband, stailq);
+ if (!next_band) {
+ ftl_restore_complete(restore, restore->pad_status);
+ return true;
+ } else {
+ /* Start off padding in the next band */
+ ftl_restore_pad_band(next_band);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static struct ftl_io *
+ftl_restore_init_pad_io(struct ftl_restore_band *rband, void *buffer,
+ struct ftl_addr addr)
+{
+ struct ftl_band *band = rband->band;
+ struct spdk_ftl_dev *dev = band->dev;
+ int flags = FTL_IO_PAD | FTL_IO_INTERNAL | FTL_IO_PHYSICAL_MODE | FTL_IO_MD |
+ FTL_IO_DIRECT_ACCESS;
+ struct ftl_io_init_opts opts = {
+ .dev = dev,
+ .io = NULL,
+ .band = band,
+ .size = sizeof(struct ftl_io),
+ .flags = flags,
+ .type = FTL_IO_WRITE,
+ .num_blocks = dev->xfer_size,
+ .cb_fn = ftl_pad_zone_cb,
+ .cb_ctx = rband,
+ .iovs = {
+ {
+ .iov_base = buffer,
+ .iov_len = dev->xfer_size * FTL_BLOCK_SIZE,
+ }
+ },
+ .iovcnt = 1,
+ .parent = NULL,
+ };
+ struct ftl_io *io;
+
+ io = ftl_io_init_internal(&opts);
+ if (spdk_unlikely(!io)) {
+ return NULL;
+ }
+
+ io->addr = addr;
+ rband->parent->num_ios++;
+
+ return io;
+}
+
+static void
+ftl_pad_zone_cb(struct ftl_io *io, void *arg, int status)
+{
+ struct ftl_restore_band *rband = arg;
+ struct ftl_restore *restore = rband->parent;
+ struct ftl_band *band = io->band;
+ struct ftl_zone *zone;
+ struct ftl_io *new_io;
+ uint64_t offset;
+
+ restore->num_ios--;
+ /* TODO check for next unit error vs early close error */
+ if (status) {
+ restore->pad_status = status;
+ goto end;
+ }
+
+ offset = io->addr.offset % ftl_get_num_blocks_in_zone(restore->dev);
+ if (offset + io->num_blocks == ftl_get_num_blocks_in_zone(restore->dev)) {
+ zone = ftl_band_zone_from_addr(band, io->addr);
+ zone->info.state = SPDK_BDEV_ZONE_STATE_FULL;
+ } else {
+ struct ftl_addr addr = io->addr;
+ addr.offset += io->num_blocks;
+ new_io = ftl_restore_init_pad_io(rband, io->iov[0].iov_base, addr);
+ if (spdk_unlikely(!new_io)) {
+ restore->pad_status = -ENOMEM;
+ goto end;
+ }
+
+ ftl_io_write(new_io);
+ return;
+ }
+
+end:
+ spdk_dma_free(io->iov[0].iov_base);
+ ftl_pad_zone_pad_finish(rband, true);
+}
+
+static void
+ftl_restore_pad_band(struct ftl_restore_band *rband)
+{
+ struct ftl_restore *restore = rband->parent;
+ struct ftl_band *band = rband->band;
+ struct spdk_ftl_dev *dev = band->dev;
+ void *buffer = NULL;
+ struct ftl_io *io;
+ struct ftl_addr addr;
+ size_t i;
+ int rc = 0;
+
+ /* Check if some zones are not closed */
+ if (ftl_pad_zone_pad_finish(rband, false)) {
+ /*
+ * If we're here, the end metadata wasn't recognized, but the whole band is written.
+ * Assume the band was padded and ignore it.
+ */
+ return;
+ }
+
+ band->state = FTL_BAND_STATE_OPEN;
+ rc = ftl_band_set_direct_access(band, true);
+ if (rc) {
+ ftl_restore_complete(restore, rc);
+ return;
+ }
+
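+ /* Pad each non-full zone starting at its write pointer; ftl_pad_zone_cb() */
+ /* keeps issuing writes until the zone is full */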
+ for (i = 0; i < band->num_zones; ++i) {
+ if (band->zone_buf[i].info.state == SPDK_BDEV_ZONE_STATE_FULL) {
+ continue;
+ }
+
+ addr.offset = band->zone_buf[i].info.write_pointer;
+
+ buffer = spdk_dma_zmalloc(FTL_BLOCK_SIZE * dev->xfer_size, 0, NULL);
+ if (spdk_unlikely(!buffer)) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ io = ftl_restore_init_pad_io(rband, buffer, addr);
+ if (spdk_unlikely(!io)) {
+ rc = -ENOMEM;
+ spdk_dma_free(buffer);
+ goto error;
+ }
+
+ ftl_io_write(io);
+ }
+
+ return;
+
+error:
+ restore->pad_status = rc;
+ ftl_pad_zone_pad_finish(rband, true);
+}
+
+static void
+ftl_restore_pad_open_bands(void *ctx)
+{
+ struct ftl_restore *restore = ctx;
+
+ ftl_restore_pad_band(STAILQ_FIRST(&restore->pad_bands));
+}
+
+static void
+ftl_restore_tail_md_cb(struct ftl_io *io, void *ctx, int status)
+{
+ struct ftl_restore_band *rband = ctx;
+ struct ftl_restore *restore = rband->parent;
+ struct spdk_ftl_dev *dev = restore->dev;
+
+ if (status) {
+ if (!dev->conf.allow_open_bands) {
+ SPDK_ERRLOG("%s while restoring tail md in band %u.\n",
+ spdk_strerror(-status), rband->band->id);
+ ftl_band_release_lba_map(rband->band);
+ ftl_restore_complete(restore, status);
+ return;
+ } else {
+ SPDK_ERRLOG("%s while restoring tail md. Will attempt to pad band %u.\n",
+ spdk_strerror(-status), rband->band->id);
+ STAILQ_INSERT_TAIL(&restore->pad_bands, rband, stailq);
+ }
+ }
+
+ if (!status && ftl_restore_l2p(rband->band)) {
+ ftl_band_release_lba_map(rband->band);
+ ftl_restore_complete(restore, -ENOTRECOVERABLE);
+ return;
+ }
+ ftl_band_release_lba_map(rband->band);
+
+ rband = ftl_restore_next_band(restore);
+ if (!rband) {
+ if (!STAILQ_EMPTY(&restore->pad_bands)) {
+ spdk_thread_send_msg(ftl_get_core_thread(dev), ftl_restore_pad_open_bands,
+ restore);
+ } else {
+ ftl_restore_complete(restore, 0);
+ }
+
+ return;
+ }
+
+ ftl_restore_tail_md(rband);
+}
+
+static int
+ftl_restore_tail_md(struct ftl_restore_band *rband)
+{
+ struct ftl_restore *restore = rband->parent;
+ struct ftl_band *band = rband->band;
+
+ if (ftl_band_alloc_lba_map(band)) {
+ SPDK_ERRLOG("Failed to allocate lba map\n");
+ ftl_restore_complete(restore, -ENOMEM);
+ return -ENOMEM;
+ }
+
+ if (ftl_band_read_tail_md(band, band->tail_md_addr, ftl_restore_tail_md_cb, rband)) {
+ SPDK_ERRLOG("Failed to send tail metadata read\n");
+ ftl_restore_complete(restore, -EIO);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int
+ftl_restore_device(struct ftl_restore *restore, ftl_restore_fn cb, void *cb_arg)
+{
+ struct spdk_ftl_dev *dev = restore->dev;
+ struct ftl_restore_band *rband;
+
+ restore->current = 0;
+ restore->cb = cb;
+ restore->cb_arg = cb_arg;
+ restore->final_phase = dev->nv_cache.bdev_desc == NULL;
+
+ /* If restore_device is called, there must be at least one valid band */
+ rband = ftl_restore_next_band(restore);
+ assert(rband);
+ return ftl_restore_tail_md(rband);
+}
diff --git a/src/spdk/lib/ftl/ftl_trace.c b/src/spdk/lib/ftl/ftl_trace.c
new file mode 100644
index 000000000..ba66323ad
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_trace.c
@@ -0,0 +1,361 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/trace.h"
+
+#include "ftl_core.h"
+#include "ftl_trace.h"
+#include "ftl_io.h"
+#include "ftl_band.h"
+
+#if defined(DEBUG)
+
+#define OWNER_FTL 0x20
+#define TRACE_GROUP_FTL 0x6
+
+enum ftl_trace_source {
+ FTL_TRACE_SOURCE_INTERNAL,
+ FTL_TRACE_SOURCE_USER,
+ FTL_TRACE_SOURCE_MAX,
+};
+
+#define FTL_TPOINT_ID(id, src) SPDK_TPOINT_ID(TRACE_GROUP_FTL, (((id) << 1) | (!!(src))))
+
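+/*
+ * Each event id below is registered twice: FTL_TPOINT_ID() shifts the id left
+ * by one and uses the low bit to encode the source (0 = internal, 1 = user).
+ */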
+#define FTL_TRACE_BAND_DEFRAG(src) FTL_TPOINT_ID(0, src)
+#define FTL_TRACE_BAND_WRITE(src) FTL_TPOINT_ID(1, src)
+#define FTL_TRACE_LIMITS(src) FTL_TPOINT_ID(2, src)
+#define FTL_TRACE_WBUF_POP(src) FTL_TPOINT_ID(3, src)
+
+#define FTL_TRACE_READ_SCHEDULE(src) FTL_TPOINT_ID(4, src)
+#define FTL_TRACE_READ_SUBMISSION(src) FTL_TPOINT_ID(5, src)
+#define FTL_TRACE_READ_COMPLETION_INVALID(src) FTL_TPOINT_ID(6, src)
+#define FTL_TRACE_READ_COMPLETION_CACHE(src) FTL_TPOINT_ID(7, src)
+#define FTL_TRACE_READ_COMPLETION_DISK(src) FTL_TPOINT_ID(8, src)
+
+#define FTL_TRACE_MD_READ_SCHEDULE(src) FTL_TPOINT_ID(9, src)
+#define FTL_TRACE_MD_READ_SUBMISSION(src) FTL_TPOINT_ID(10, src)
+#define FTL_TRACE_MD_READ_COMPLETION(src) FTL_TPOINT_ID(11, src)
+
+#define FTL_TRACE_WRITE_SCHEDULE(src) FTL_TPOINT_ID(12, src)
+#define FTL_TRACE_WRITE_WBUF_FILL(src) FTL_TPOINT_ID(13, src)
+#define FTL_TRACE_WRITE_SUBMISSION(src) FTL_TPOINT_ID(14, src)
+#define FTL_TRACE_WRITE_COMPLETION(src) FTL_TPOINT_ID(15, src)
+
+#define FTL_TRACE_MD_WRITE_SCHEDULE(src) FTL_TPOINT_ID(16, src)
+#define FTL_TRACE_MD_WRITE_SUBMISSION(src) FTL_TPOINT_ID(17, src)
+#define FTL_TRACE_MD_WRITE_COMPLETION(src) FTL_TPOINT_ID(18, src)
+
+#define FTL_TRACE_ERASE_SUBMISSION(src) FTL_TPOINT_ID(19, src)
+#define FTL_TRACE_ERASE_COMPLETION(src) FTL_TPOINT_ID(20, src)
+
+SPDK_TRACE_REGISTER_FN(ftl_trace_func, "ftl", TRACE_GROUP_FTL)
+{
+ const char source[] = { 'i', 'u' };
+ char descbuf[128];
+ int i;
+
+ spdk_trace_register_owner(OWNER_FTL, 'f');
+
+ for (i = 0; i < FTL_TRACE_SOURCE_MAX; ++i) {
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "band_defrag");
+ spdk_trace_register_description(descbuf, FTL_TRACE_BAND_DEFRAG(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "band: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "band_write");
+ spdk_trace_register_description(descbuf, FTL_TRACE_BAND_WRITE(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "band: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "limits");
+ spdk_trace_register_description(descbuf, FTL_TRACE_LIMITS(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "limits: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "rwb_pop");
+ spdk_trace_register_description(descbuf, FTL_TRACE_WBUF_POP(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_sched");
+ spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_SCHEDULE(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_submit");
+ spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_SUBMISSION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_read_cmpl");
+ spdk_trace_register_description(descbuf, FTL_TRACE_MD_READ_COMPLETION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_sched");
+ spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_SCHEDULE(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_submit");
+ spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_SUBMISSION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "md_write_cmpl");
+ spdk_trace_register_description(descbuf, FTL_TRACE_MD_WRITE_COMPLETION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_sched");
+ spdk_trace_register_description(descbuf, FTL_TRACE_READ_SCHEDULE(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_submit");
+ spdk_trace_register_description(descbuf, FTL_TRACE_READ_SUBMISSION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_invld");
+ spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_INVALID(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_cache");
+ spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_CACHE(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "read_cmpl_ssd");
+ spdk_trace_register_description(descbuf, FTL_TRACE_READ_COMPLETION_DISK(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_sched");
+ spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_SCHEDULE(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "rwb_fill");
+ spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_WBUF_FILL(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_submit");
+ spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_SUBMISSION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "write_cmpl");
+ spdk_trace_register_description(descbuf, FTL_TRACE_WRITE_COMPLETION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "lba: ");
+
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "erase_submit");
+ spdk_trace_register_description(descbuf, FTL_TRACE_ERASE_SUBMISSION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+ snprintf(descbuf, sizeof(descbuf), "%c %s", source[i], "erase_cmpl");
+ spdk_trace_register_description(descbuf, FTL_TRACE_ERASE_COMPLETION(i),
+ OWNER_FTL, OBJECT_NONE, 0, 0, "addr: ");
+ }
+}
+
+static uint16_t
+ftl_trace_io_source(const struct ftl_io *io)
+{
+ if (io->flags & FTL_IO_INTERNAL) {
+ return FTL_TRACE_SOURCE_INTERNAL;
+ } else {
+ return FTL_TRACE_SOURCE_USER;
+ }
+}
+
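+/* Allocate the next trace id; atomic so concurrent submitters get unique ids. */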
+static uint64_t
+ftl_trace_next_id(struct ftl_trace *trace)
+{
+ assert(trace->id != FTL_TRACE_INVALID_ID);
+ return __atomic_fetch_add(&trace->id, 1, __ATOMIC_SEQ_CST);
+}
+
+void
+ftl_trace_defrag_band(struct spdk_ftl_dev *dev, const struct ftl_band *band)
+{
+ struct ftl_trace *trace = &dev->stats.trace;
+
+ spdk_trace_record(FTL_TRACE_BAND_DEFRAG(FTL_TRACE_SOURCE_INTERNAL),
+ ftl_trace_next_id(trace), 0, band->lba_map.num_vld, band->id);
+}
+
+void
+ftl_trace_write_band(struct spdk_ftl_dev *dev, const struct ftl_band *band)
+{
+ struct ftl_trace *trace = &dev->stats.trace;
+
+ spdk_trace_record(FTL_TRACE_BAND_WRITE(FTL_TRACE_SOURCE_INTERNAL),
+ ftl_trace_next_id(trace), 0, 0, band->id);
+}
+
+void
+ftl_trace_lba_io_init(struct spdk_ftl_dev *dev, const struct ftl_io *io)
+{
+ uint16_t tpoint_id = 0, source;
+
+ assert(io->trace != FTL_TRACE_INVALID_ID);
+ source = ftl_trace_io_source(io);
+
+ if (io->flags & FTL_IO_MD) {
+ switch (io->type) {
+ case FTL_IO_READ:
+ tpoint_id = FTL_TRACE_MD_READ_SCHEDULE(source);
+ break;
+ case FTL_IO_WRITE:
+ tpoint_id = FTL_TRACE_MD_WRITE_SCHEDULE(source);
+ break;
+ default:
+ assert(0);
+ }
+ } else {
+ switch (io->type) {
+ case FTL_IO_READ:
+ tpoint_id = FTL_TRACE_READ_SCHEDULE(source);
+ break;
+ case FTL_IO_WRITE:
+ tpoint_id = FTL_TRACE_WRITE_SCHEDULE(source);
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ spdk_trace_record(tpoint_id, io->trace, io->num_blocks, 0, ftl_io_get_lba(io, 0));
+}
+
+void
+ftl_trace_wbuf_fill(struct spdk_ftl_dev *dev, const struct ftl_io *io)
+{
+ assert(io->trace != FTL_TRACE_INVALID_ID);
+
+ spdk_trace_record(FTL_TRACE_WRITE_WBUF_FILL(ftl_trace_io_source(io)), io->trace,
+ 0, 0, ftl_io_current_lba(io));
+}
+
+void
+ftl_trace_wbuf_pop(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry)
+{
+ uint16_t tpoint_id;
+
+ assert(entry->trace != FTL_TRACE_INVALID_ID);
+
+ if (entry->io_flags & FTL_IO_INTERNAL) {
+ tpoint_id = FTL_TRACE_WBUF_POP(FTL_TRACE_SOURCE_INTERNAL);
+ } else {
+ tpoint_id = FTL_TRACE_WBUF_POP(FTL_TRACE_SOURCE_USER);
+ }
+
+ spdk_trace_record(tpoint_id, entry->trace, 0, entry->addr.offset, entry->lba);
+}
+
+void
+ftl_trace_completion(struct spdk_ftl_dev *dev, const struct ftl_io *io,
+ enum ftl_trace_completion completion)
+{
+ uint16_t tpoint_id = 0, source;
+
+ assert(io->trace != FTL_TRACE_INVALID_ID);
+ source = ftl_trace_io_source(io);
+
+ if (io->flags & FTL_IO_MD) {
+ switch (io->type) {
+ case FTL_IO_READ:
+ tpoint_id = FTL_TRACE_MD_READ_COMPLETION(source);
+ break;
+ case FTL_IO_WRITE:
+ tpoint_id = FTL_TRACE_MD_WRITE_COMPLETION(source);
+ break;
+ default:
+ assert(0);
+ }
+ } else {
+ switch (io->type) {
+ case FTL_IO_READ:
+ switch (completion) {
+ case FTL_TRACE_COMPLETION_INVALID:
+ tpoint_id = FTL_TRACE_READ_COMPLETION_INVALID(source);
+ break;
+ case FTL_TRACE_COMPLETION_CACHE:
+ tpoint_id = FTL_TRACE_READ_COMPLETION_CACHE(source);
+ break;
+ case FTL_TRACE_COMPLETION_DISK:
+ tpoint_id = FTL_TRACE_READ_COMPLETION_DISK(source);
+ break;
+ }
+ break;
+ case FTL_IO_WRITE:
+ tpoint_id = FTL_TRACE_WRITE_COMPLETION(source);
+ break;
+ case FTL_IO_ERASE:
+ tpoint_id = FTL_TRACE_ERASE_COMPLETION(source);
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ spdk_trace_record(tpoint_id, io->trace, 0, 0, ftl_io_get_lba(io, io->pos - 1));
+}
+
+void
+ftl_trace_submission(struct spdk_ftl_dev *dev, const struct ftl_io *io, struct ftl_addr addr,
+ size_t addr_cnt)
+{
+ uint16_t tpoint_id = 0, source;
+
+ assert(io->trace != FTL_TRACE_INVALID_ID);
+ source = ftl_trace_io_source(io);
+
+ if (io->flags & FTL_IO_MD) {
+ switch (io->type) {
+ case FTL_IO_READ:
+ tpoint_id = FTL_TRACE_MD_READ_SUBMISSION(source);
+ break;
+ case FTL_IO_WRITE:
+ tpoint_id = FTL_TRACE_MD_WRITE_SUBMISSION(source);
+ break;
+ default:
+ assert(0);
+ }
+ } else {
+ switch (io->type) {
+ case FTL_IO_READ:
+ tpoint_id = FTL_TRACE_READ_SUBMISSION(source);
+ break;
+ case FTL_IO_WRITE:
+ tpoint_id = FTL_TRACE_WRITE_SUBMISSION(source);
+ break;
+ case FTL_IO_ERASE:
+ tpoint_id = FTL_TRACE_ERASE_SUBMISSION(source);
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ spdk_trace_record(tpoint_id, io->trace, addr_cnt, 0, addr.offset);
+}
+
+void
+ftl_trace_limits(struct spdk_ftl_dev *dev, int limit, size_t num_free)
+{
+ struct ftl_trace *trace = &dev->stats.trace;
+
+ spdk_trace_record(FTL_TRACE_LIMITS(FTL_TRACE_SOURCE_INTERNAL), ftl_trace_next_id(trace),
+ num_free, limit, 0);
+}
+
+uint64_t
+ftl_trace_alloc_id(struct spdk_ftl_dev *dev)
+{
+ struct ftl_trace *trace = &dev->stats.trace;
+
+ return ftl_trace_next_id(trace);
+}
+
+#endif /* defined(DEBUG) */
diff --git a/src/spdk/lib/ftl/ftl_trace.h b/src/spdk/lib/ftl/ftl_trace.h
new file mode 100644
index 000000000..52988cff6
--- /dev/null
+++ b/src/spdk/lib/ftl/ftl_trace.h
@@ -0,0 +1,84 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FTL_TRACE_H
+#define FTL_TRACE_H
+
+#include "ftl_addr.h"
+
+#define FTL_TRACE_INVALID_ID ((uint64_t) -1)
+
+enum ftl_trace_completion {
+ FTL_TRACE_COMPLETION_INVALID,
+ FTL_TRACE_COMPLETION_CACHE,
+ FTL_TRACE_COMPLETION_DISK,
+};
+
+struct ftl_trace {
+ /* Monotonically incrementing event id */
+ uint64_t id;
+};
+
+struct spdk_ftl_dev;
+struct ftl_trace;
+struct ftl_io;
+struct ftl_wbuf_entry;
+struct ftl_band;
+
+#if defined(DEBUG)
+uint64_t ftl_trace_alloc_id(struct spdk_ftl_dev *dev);
+void ftl_trace_defrag_band(struct spdk_ftl_dev *dev, const struct ftl_band *band);
+void ftl_trace_write_band(struct spdk_ftl_dev *dev, const struct ftl_band *band);
+void ftl_trace_lba_io_init(struct spdk_ftl_dev *dev, const struct ftl_io *io);
+void ftl_trace_wbuf_fill(struct spdk_ftl_dev *dev, const struct ftl_io *io);
+void ftl_trace_wbuf_pop(struct spdk_ftl_dev *dev, const struct ftl_wbuf_entry *entry);
+void ftl_trace_submission(struct spdk_ftl_dev *dev,
+ const struct ftl_io *io,
+ struct ftl_addr addr, size_t addr_cnt);
+void ftl_trace_completion(struct spdk_ftl_dev *dev,
+ const struct ftl_io *io,
+ enum ftl_trace_completion type);
+void ftl_trace_limits(struct spdk_ftl_dev *dev, int limit, size_t num_free);
+#else /* defined(DEBUG) */
+#define ftl_trace_alloc_id(dev) FTL_TRACE_INVALID_ID
+#define ftl_trace_defrag_band(dev, band)
+#define ftl_trace_write_band(dev, band)
+#define ftl_trace_lba_io_init(dev, io)
+#define ftl_trace_wbuf_fill(dev, io)
+#define ftl_trace_wbuf_pop(dev, entry)
+#define ftl_trace_submission(dev, io, addr, addr_cnt)
+#define ftl_trace_completion(dev, io, type)
+#define ftl_trace_limits(dev, limits, num_free)
+#endif
+
+#endif /* FTL_TRACE_H */
diff --git a/src/spdk/lib/ftl/spdk_ftl.map b/src/spdk/lib/ftl/spdk_ftl.map
new file mode 100644
index 000000000..141fd01e0
--- /dev/null
+++ b/src/spdk/lib/ftl/spdk_ftl.map
@@ -0,0 +1,14 @@
+{
+ global:
+
+ # public functions
+ spdk_ftl_dev_init;
+ spdk_ftl_dev_free;
+ spdk_ftl_conf_init_defaults;
+ spdk_ftl_dev_get_attrs;
+ spdk_ftl_read;
+ spdk_ftl_write;
+ spdk_ftl_flush;
+
+ local: *;
+};
diff --git a/src/spdk/lib/idxd/Makefile b/src/spdk/lib/idxd/Makefile
new file mode 100644
index 000000000..ed66aeb15
--- /dev/null
+++ b/src/spdk/lib/idxd/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = idxd.c
+LIBNAME = idxd
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_idxd.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/idxd/idxd.c b/src/spdk/lib/idxd/idxd.c
new file mode 100644
index 000000000..992d96211
--- /dev/null
+++ b/src/spdk/lib/idxd/idxd.c
@@ -0,0 +1,1292 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+
+#include "spdk_internal/log.h"
+#include "spdk_internal/idxd.h"
+
+#include "idxd.h"
+
+#define ALIGN_4K 0x1000
+
+pthread_mutex_t g_driver_lock = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * g_dev_cfg gives us 2 pre-set configurations of DSA to choose from
+ * via RPC.
+ */
+struct device_config *g_dev_cfg = NULL;
+
+/*
+ * Pre-built configurations. Variations depend on various factors
+ * including how many different types of target latency profiles there
+ * are, how many different QOS requirements there might be, etc.
+ */
+struct device_config g_dev_cfg0 = {
+ .config_num = 0,
+ .num_groups = 4,
+ .num_wqs_per_group = 1,
+ .num_engines_per_group = 1,
+ .total_wqs = 4,
+ .total_engines = 4,
+};
+
+struct device_config g_dev_cfg1 = {
+ .config_num = 1,
+ .num_groups = 2,
+ .num_wqs_per_group = 2,
+ .num_engines_per_group = 2,
+ .total_wqs = 4,
+ .total_engines = 4,
+};
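+/*
+ * Config 0 spreads the 4 work queues and 4 engines across 4 groups (one of
+ * each per group), while config 1 packs them into 2 groups with 2 WQs and
+ * 2 engines each.
+ */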
+
+static uint32_t
+_idxd_read_4(struct spdk_idxd_device *idxd, uint32_t offset)
+{
+ return spdk_mmio_read_4((uint32_t *)(idxd->reg_base + offset));
+}
+
+static void
+_idxd_write_4(struct spdk_idxd_device *idxd, uint32_t offset, uint32_t value)
+{
+ spdk_mmio_write_4((uint32_t *)(idxd->reg_base + offset), value);
+}
+
+static uint64_t
+_idxd_read_8(struct spdk_idxd_device *idxd, uint32_t offset)
+{
+ return spdk_mmio_read_8((uint64_t *)(idxd->reg_base + offset));
+}
+
+static void
+_idxd_write_8(struct spdk_idxd_device *idxd, uint32_t offset, uint64_t value)
+{
+ spdk_mmio_write_8((uint64_t *)(idxd->reg_base + offset), value);
+}
+
+struct spdk_idxd_io_channel *
+spdk_idxd_get_channel(struct spdk_idxd_device *idxd)
+{
+ struct spdk_idxd_io_channel *chan;
+ struct idxd_batch *batch;
+ int i;
+
+ chan = calloc(1, sizeof(struct spdk_idxd_io_channel));
+ if (chan == NULL) {
+ SPDK_ERRLOG("Failed to allocate idxd chan\n");
+ return NULL;
+ }
+ chan->idxd = idxd;
+
+ TAILQ_INIT(&chan->batches);
+
+ TAILQ_INIT(&chan->batch_pool);
+ for (i = 0 ; i < NUM_BATCHES ; i++) {
+ batch = calloc(1, sizeof(struct idxd_batch));
+ if (batch == NULL) {
+ SPDK_ERRLOG("Failed to allocate batch\n");
+ while ((batch = TAILQ_FIRST(&chan->batch_pool))) {
+ TAILQ_REMOVE(&chan->batch_pool, batch, link);
+ free(batch);
+ }
+ return NULL;
+ }
+ TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link);
+ }
+
+ return chan;
+}
+
+void
+spdk_idxd_put_channel(struct spdk_idxd_io_channel *chan)
+{
+ free(chan);
+}
+
+int
+spdk_idxd_configure_chan(struct spdk_idxd_io_channel *chan)
+{
+ uint32_t num_ring_slots;
+ int rc;
+
+ /* Round robin the WQ selection for the chan on this IDXD device. */
+ chan->idxd->wq_id++;
+ if (chan->idxd->wq_id == g_dev_cfg->total_wqs) {
+ chan->idxd->wq_id = 0;
+ }
+
+ num_ring_slots = chan->idxd->queues[chan->idxd->wq_id].wqcfg.wq_size;
+
+ chan->ring_ctrl.ring_slots = spdk_bit_array_create(num_ring_slots);
+ if (chan->ring_ctrl.ring_slots == NULL) {
+ SPDK_ERRLOG("Failed to allocate bit array for ring\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * max ring slots can change as channels come and go but we
+ * start off getting all of the slots for this work queue.
+ */
+ chan->ring_ctrl.max_ring_slots = num_ring_slots;
+
+ /* Store the original size of the ring. */
+ chan->ring_ctrl.ring_size = num_ring_slots;
+
+ chan->ring_ctrl.desc = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_hw_desc),
+ 0x40, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (chan->ring_ctrl.desc == NULL) {
+ SPDK_ERRLOG("Failed to allocate descriptor memory\n");
+ rc = -ENOMEM;
+ goto err_desc;
+ }
+
+ chan->ring_ctrl.completions = spdk_zmalloc(num_ring_slots * sizeof(struct idxd_comp),
+ 0x40, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (chan->ring_ctrl.completions == NULL) {
+ SPDK_ERRLOG("Failed to allocate completion memory\n");
+ rc = -ENOMEM;
+ goto err_comp;
+ }
+
+ chan->ring_ctrl.user_desc = spdk_zmalloc(TOTAL_USER_DESC * sizeof(struct idxd_hw_desc),
+ 0x40, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (chan->ring_ctrl.user_desc == NULL) {
+ SPDK_ERRLOG("Failed to allocate batch descriptor memory\n");
+ rc = -ENOMEM;
+ goto err_user_desc;
+ }
+
+	/* Each slot on the ring reserves DESC_PER_BATCH elements in user_desc. */
+ chan->ring_ctrl.user_ring_slots = spdk_bit_array_create(NUM_BATCHES);
+ if (chan->ring_ctrl.user_ring_slots == NULL) {
+ SPDK_ERRLOG("Failed to allocate bit array for user ring\n");
+ rc = -ENOMEM;
+ goto err_user_ring;
+ }
+
+ chan->ring_ctrl.user_completions = spdk_zmalloc(TOTAL_USER_DESC * sizeof(struct idxd_comp),
+ 0x40, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (chan->ring_ctrl.user_completions == NULL) {
+ SPDK_ERRLOG("Failed to allocate user completion memory\n");
+ rc = -ENOMEM;
+ goto err_user_comp;
+ }
+
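+	/* Each work queue has its own submission portal in BAR2, indexed by wq_id. */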
+ chan->ring_ctrl.portal = (char *)chan->idxd->portals + chan->idxd->wq_id * PORTAL_SIZE;
+
+ return 0;
+
+err_user_comp:
+ spdk_bit_array_free(&chan->ring_ctrl.user_ring_slots);
+err_user_ring:
+ spdk_free(chan->ring_ctrl.user_desc);
+err_user_desc:
+ spdk_free(chan->ring_ctrl.completions);
+err_comp:
+ spdk_free(chan->ring_ctrl.desc);
+err_desc:
+ spdk_bit_array_free(&chan->ring_ctrl.ring_slots);
+
+ return rc;
+}
+
+/* Used for control commands, not for descriptor submission. */
+static int
+idxd_wait_cmd(struct spdk_idxd_device *idxd, int _timeout)
+{
+ uint32_t timeout = _timeout;
+ union idxd_cmdsts_reg cmd_status = {};
+
+ cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET);
+ while (cmd_status.active && --timeout) {
+ usleep(1);
+ cmd_status.raw = _idxd_read_4(idxd, IDXD_CMDSTS_OFFSET);
+ }
+
+ /* Check for timeout */
+ if (timeout == 0 && cmd_status.active) {
+ SPDK_ERRLOG("Command timeout, waited %u\n", _timeout);
+ return -EBUSY;
+ }
+
+ /* Check for error */
+ if (cmd_status.err) {
+ SPDK_ERRLOG("Command status reg reports error 0x%x\n", cmd_status.err);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void
+_idxd_drain(struct spdk_idxd_io_channel *chan)
+{
+ uint32_t index;
+ int set = 0;
+
+ do {
+ spdk_idxd_process_events(chan);
+ set = 0;
+ for (index = 0; index < chan->ring_ctrl.max_ring_slots; index++) {
+ set |= spdk_bit_array_get(chan->ring_ctrl.ring_slots, index);
+ }
+ } while (set);
+}
+
+int
+spdk_idxd_reconfigure_chan(struct spdk_idxd_io_channel *chan, uint32_t num_channels)
+{
+ uint32_t num_ring_slots;
+ int rc;
+ struct idxd_batch *batch;
+
+ _idxd_drain(chan);
+
+ assert(spdk_bit_array_count_set(chan->ring_ctrl.ring_slots) == 0);
+
+ if (num_channels == 0) {
+ spdk_free(chan->ring_ctrl.completions);
+ spdk_free(chan->ring_ctrl.desc);
+ spdk_bit_array_free(&chan->ring_ctrl.ring_slots);
+ spdk_free(chan->ring_ctrl.user_completions);
+ spdk_free(chan->ring_ctrl.user_desc);
+ spdk_bit_array_free(&chan->ring_ctrl.user_ring_slots);
+ while ((batch = TAILQ_FIRST(&chan->batch_pool))) {
+ TAILQ_REMOVE(&chan->batch_pool, batch, link);
+ free(batch);
+ }
+ return 0;
+ }
+
+ num_ring_slots = chan->ring_ctrl.ring_size / num_channels;
+
+ /* re-allocate our descriptor ring for hw flow control. */
+ rc = spdk_bit_array_resize(&chan->ring_ctrl.ring_slots, num_ring_slots);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to resize channel bit array\n");
+ return -ENOMEM;
+ }
+
+ chan->ring_ctrl.max_ring_slots = num_ring_slots;
+
+ /*
+ * Note: The batch descriptor ring does not change with the
+ * number of channels as descriptors on this ring do not
+ * "count" for flow control.
+ */
+
+ return rc;
+}
+
+/* Called via RPC to select a pre-defined configuration. */
+void
+spdk_idxd_set_config(uint32_t config_num)
+{
+ switch (config_num) {
+ case 0:
+ g_dev_cfg = &g_dev_cfg0;
+ break;
+ case 1:
+ g_dev_cfg = &g_dev_cfg1;
+ break;
+ default:
+ g_dev_cfg = &g_dev_cfg0;
+ SPDK_ERRLOG("Invalid config, using default\n");
+ break;
+ }
+}
+
+static int
+idxd_unmap_pci_bar(struct spdk_idxd_device *idxd, int bar)
+{
+ int rc = 0;
+ void *addr = NULL;
+
+ if (bar == IDXD_MMIO_BAR) {
+ addr = (void *)idxd->reg_base;
+ } else if (bar == IDXD_WQ_BAR) {
+ addr = (void *)idxd->portals;
+ }
+
+ if (addr) {
+ rc = spdk_pci_device_unmap_bar(idxd->device, 0, addr);
+ }
+ return rc;
+}
+
+static int
+idxd_map_pci_bars(struct spdk_idxd_device *idxd)
+{
+ int rc;
+ void *addr;
+ uint64_t phys_addr, size;
+
+ rc = spdk_pci_device_map_bar(idxd->device, IDXD_MMIO_BAR, &addr, &phys_addr, &size);
+ if (rc != 0 || addr == NULL) {
+ SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", rc);
+ return -1;
+ }
+ idxd->reg_base = addr;
+
+ rc = spdk_pci_device_map_bar(idxd->device, IDXD_WQ_BAR, &addr, &phys_addr, &size);
+ if (rc != 0 || addr == NULL) {
+ SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", rc);
+ rc = idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR);
+ if (rc) {
+ SPDK_ERRLOG("unable to unmap MMIO bar\n");
+ }
+ return -EINVAL;
+ }
+ idxd->portals = addr;
+
+ return 0;
+}
+
+static int
+idxd_reset_dev(struct spdk_idxd_device *idxd)
+{
+ int rc;
+
+ _idxd_write_4(idxd, IDXD_CMD_OFFSET, IDXD_RESET_DEVICE << IDXD_CMD_SHIFT);
+ rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US);
+ if (rc < 0) {
+		SPDK_ERRLOG("Error resetting device %d\n", rc);
+ }
+
+ return rc;
+}
+
+/*
+ * Build group config based on getting info from the device combined
+ * with the defined configuration. Once built, it is written to the
+ * device.
+ */
+static int
+idxd_group_config(struct spdk_idxd_device *idxd)
+{
+ int i;
+ uint64_t base_offset;
+
+ assert(g_dev_cfg->num_groups <= idxd->registers.groupcap.num_groups);
+ idxd->groups = calloc(idxd->registers.groupcap.num_groups, sizeof(struct idxd_group));
+ if (idxd->groups == NULL) {
+ SPDK_ERRLOG("Failed to allocate group memory\n");
+ return -ENOMEM;
+ }
+
+ assert(g_dev_cfg->total_engines <= idxd->registers.enginecap.num_engines);
+ for (i = 0; i < g_dev_cfg->total_engines; i++) {
+ idxd->groups[i % g_dev_cfg->num_groups].grpcfg.engines |= (1 << i);
+ }
+
+ assert(g_dev_cfg->total_wqs <= idxd->registers.wqcap.num_wqs);
+ for (i = 0; i < g_dev_cfg->total_wqs; i++) {
+ idxd->groups[i % g_dev_cfg->num_groups].grpcfg.wqs[0] |= (1 << i);
+ }
+
+ for (i = 0; i < g_dev_cfg->num_groups; i++) {
+ idxd->groups[i].idxd = idxd;
+ idxd->groups[i].id = i;
+
+ /* Divide BW tokens evenly */
+ idxd->groups[i].grpcfg.flags.tokens_allowed =
+ idxd->registers.groupcap.total_tokens / g_dev_cfg->num_groups;
+ }
+
+ /*
+ * Now write the group config to the device for all groups. We write
+ * to the max number of groups in order to 0 out the ones we didn't
+ * configure.
+ */
+ for (i = 0 ; i < idxd->registers.groupcap.num_groups; i++) {
+
+ base_offset = idxd->grpcfg_offset + i * 64;
+
+ /* GRPWQCFG, work queues config */
+ _idxd_write_8(idxd, base_offset, idxd->groups[i].grpcfg.wqs[0]);
+
+ /* GRPENGCFG, engine config */
+ _idxd_write_8(idxd, base_offset + CFG_ENGINE_OFFSET, idxd->groups[i].grpcfg.engines);
+
+ /* GRPFLAGS, flags config */
+ _idxd_write_8(idxd, base_offset + CFG_FLAG_OFFSET, idxd->groups[i].grpcfg.flags.raw);
+ }
+
+ return 0;
+}
+
+/*
+ * Build work queue (WQ) config based on getting info from the device combined
+ * with the defined configuration. Once built, it is written to the device.
+ */
+static int
+idxd_wq_config(struct spdk_idxd_device *idxd)
+{
+ int i, j;
+ struct idxd_wq *queue;
+	uint32_t wq_size = idxd->registers.wqcap.total_wq_size / g_dev_cfg->total_wqs;
+
+	SPDK_NOTICELOG("Total available WQ size 0x%x, per work queue 0x%x\n",
+		       idxd->registers.wqcap.total_wq_size, wq_size);
+ assert(g_dev_cfg->total_wqs <= IDXD_MAX_QUEUES);
+ assert(g_dev_cfg->total_wqs <= idxd->registers.wqcap.num_wqs);
+ assert(LOG2_WQ_MAX_BATCH <= idxd->registers.gencap.max_batch_shift);
+ assert(LOG2_WQ_MAX_XFER <= idxd->registers.gencap.max_xfer_shift);
+
+ idxd->queues = calloc(1, idxd->registers.wqcap.num_wqs * sizeof(struct idxd_wq));
+ if (idxd->queues == NULL) {
+ SPDK_ERRLOG("Failed to allocate queue memory\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < g_dev_cfg->total_wqs; i++) {
+ queue = &idxd->queues[i];
+ queue->wqcfg.wq_size = wq_size;
+ queue->wqcfg.mode = WQ_MODE_DEDICATED;
+ queue->wqcfg.max_batch_shift = LOG2_WQ_MAX_BATCH;
+ queue->wqcfg.max_xfer_shift = LOG2_WQ_MAX_XFER;
+ queue->wqcfg.wq_state = WQ_ENABLED;
+ queue->wqcfg.priority = WQ_PRIORITY_1;
+
+ /* Not part of the config struct */
+ queue->idxd = idxd;
+ queue->group = &idxd->groups[i % g_dev_cfg->num_groups];
+ }
+
+ /*
+ * Now write the work queue config to the device for all wq space
+ */
+ for (i = 0 ; i < idxd->registers.wqcap.num_wqs; i++) {
+ queue = &idxd->queues[i];
+ for (j = 0 ; j < WQCFG_NUM_DWORDS; j++) {
+ _idxd_write_4(idxd, idxd->wqcfg_offset + i * 32 + j * 4,
+ queue->wqcfg.raw[j]);
+ }
+ }
+
+ return 0;
+}
+
+static int
+idxd_device_configure(struct spdk_idxd_device *idxd)
+{
+ int i, rc = 0;
+ union idxd_offsets_register offsets_reg;
+ union idxd_genstatus_register genstatus_reg;
+
+ /*
+ * Map BAR0 and BAR2
+ */
+ rc = idxd_map_pci_bars(idxd);
+ if (rc) {
+ return rc;
+ }
+
+ /*
+ * Reset the device
+ */
+ rc = idxd_reset_dev(idxd);
+ if (rc) {
+ goto err_reset;
+ }
+
+ /*
+ * Read in config registers
+ */
+ idxd->registers.version = _idxd_read_4(idxd, IDXD_VERSION_OFFSET);
+ idxd->registers.gencap.raw = _idxd_read_8(idxd, IDXD_GENCAP_OFFSET);
+ idxd->registers.wqcap.raw = _idxd_read_8(idxd, IDXD_WQCAP_OFFSET);
+ idxd->registers.groupcap.raw = _idxd_read_8(idxd, IDXD_GRPCAP_OFFSET);
+ idxd->registers.enginecap.raw = _idxd_read_8(idxd, IDXD_ENGCAP_OFFSET);
+ for (i = 0; i < IDXD_OPCAP_WORDS; i++) {
+ idxd->registers.opcap.raw[i] =
+ _idxd_read_8(idxd, i * sizeof(uint64_t) + IDXD_OPCAP_OFFSET);
+ }
+ offsets_reg.raw[0] = _idxd_read_8(idxd, IDXD_TABLE_OFFSET);
+ offsets_reg.raw[1] = _idxd_read_8(idxd, IDXD_TABLE_OFFSET + sizeof(uint64_t));
+ idxd->grpcfg_offset = offsets_reg.grpcfg * IDXD_TABLE_OFFSET_MULT;
+ idxd->wqcfg_offset = offsets_reg.wqcfg * IDXD_TABLE_OFFSET_MULT;
+ idxd->ims_offset = offsets_reg.ims * IDXD_TABLE_OFFSET_MULT;
+ idxd->msix_perm_offset = offsets_reg.msix_perm * IDXD_TABLE_OFFSET_MULT;
+ idxd->perfmon_offset = offsets_reg.perfmon * IDXD_TABLE_OFFSET_MULT;
+
+ /*
+ * Configure groups and work queues.
+ */
+ rc = idxd_group_config(idxd);
+ if (rc) {
+ goto err_group_cfg;
+ }
+
+ rc = idxd_wq_config(idxd);
+ if (rc) {
+ goto err_wq_cfg;
+ }
+
+ /*
+ * Enable the device
+ */
+ genstatus_reg.raw = _idxd_read_4(idxd, IDXD_GENSTATUS_OFFSET);
+ assert(genstatus_reg.state == IDXD_DEVICE_STATE_DISABLED);
+
+ _idxd_write_4(idxd, IDXD_CMD_OFFSET, IDXD_ENABLE_DEV << IDXD_CMD_SHIFT);
+ rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US);
+ genstatus_reg.raw = _idxd_read_4(idxd, IDXD_GENSTATUS_OFFSET);
+ if ((rc < 0) || (genstatus_reg.state != IDXD_DEVICE_STATE_ENABLED)) {
+ rc = -EINVAL;
+		SPDK_ERRLOG("Error enabling device %d\n", rc);
+ goto err_device_enable;
+ }
+
+ genstatus_reg.raw = spdk_mmio_read_4((uint32_t *)(idxd->reg_base + IDXD_GENSTATUS_OFFSET));
+ assert(genstatus_reg.state == IDXD_DEVICE_STATE_ENABLED);
+
+ /*
+ * Enable the work queues that we've configured
+ */
+ for (i = 0; i < g_dev_cfg->total_wqs; i++) {
+ _idxd_write_4(idxd, IDXD_CMD_OFFSET,
+ (IDXD_ENABLE_WQ << IDXD_CMD_SHIFT) | i);
+ rc = idxd_wait_cmd(idxd, IDXD_REGISTER_TIMEOUT_US);
+ if (rc < 0) {
+			SPDK_ERRLOG("Error enabling work queues %d\n", rc);
+ goto err_wq_enable;
+ }
+ }
+
+ if ((rc == 0) && (genstatus_reg.state == IDXD_DEVICE_STATE_ENABLED)) {
+ SPDK_NOTICELOG("Device enabled, version 0x%x gencap: 0x%lx\n",
+ idxd->registers.version,
+ idxd->registers.gencap.raw);
+
+ }
+
+ return rc;
+err_wq_enable:
+err_device_enable:
+ free(idxd->queues);
+err_wq_cfg:
+ free(idxd->groups);
+err_group_cfg:
+err_reset:
+ idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR);
+	idxd_unmap_pci_bar(idxd, IDXD_WQ_BAR);
+
+ return rc;
+}
+
+static void
+idxd_device_destruct(struct spdk_idxd_device *idxd)
+{
+ idxd_unmap_pci_bar(idxd, IDXD_MMIO_BAR);
+ idxd_unmap_pci_bar(idxd, IDXD_WQ_BAR);
+ free(idxd->groups);
+ free(idxd->queues);
+ free(idxd);
+}
+
+/* Caller must hold g_driver_lock */
+static struct spdk_idxd_device *
+idxd_attach(struct spdk_pci_device *device)
+{
+ struct spdk_idxd_device *idxd;
+ uint32_t cmd_reg;
+ int rc;
+
+ idxd = calloc(1, sizeof(struct spdk_idxd_device));
+ if (idxd == NULL) {
+ SPDK_ERRLOG("Failed to allocate memory for idxd device.\n");
+ return NULL;
+ }
+
+ idxd->device = device;
+
+ /* Enable PCI busmaster. */
+ spdk_pci_device_cfg_read32(device, &cmd_reg, 4);
+ cmd_reg |= 0x4;
+ spdk_pci_device_cfg_write32(device, cmd_reg, 4);
+
+ rc = idxd_device_configure(idxd);
+ if (rc) {
+ goto err;
+ }
+
+ return idxd;
+err:
+ idxd_device_destruct(idxd);
+ return NULL;
+}
+
+struct idxd_enum_ctx {
+ spdk_idxd_probe_cb probe_cb;
+ spdk_idxd_attach_cb attach_cb;
+ void *cb_ctx;
+};
+
+/* This function must only be called while holding g_driver_lock */
+static int
+idxd_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
+{
+ struct idxd_enum_ctx *enum_ctx = ctx;
+ struct spdk_idxd_device *idxd;
+
+ if (enum_ctx->probe_cb(enum_ctx->cb_ctx, pci_dev)) {
+ idxd = idxd_attach(pci_dev);
+ if (idxd == NULL) {
+ SPDK_ERRLOG("idxd_attach() failed\n");
+ return -EINVAL;
+ }
+
+ enum_ctx->attach_cb(enum_ctx->cb_ctx, pci_dev, idxd);
+ }
+
+ return 0;
+}
+
+int
+spdk_idxd_probe(void *cb_ctx, spdk_idxd_probe_cb probe_cb, spdk_idxd_attach_cb attach_cb)
+{
+ int rc;
+ struct idxd_enum_ctx enum_ctx;
+
+ enum_ctx.probe_cb = probe_cb;
+ enum_ctx.attach_cb = attach_cb;
+ enum_ctx.cb_ctx = cb_ctx;
+
+ pthread_mutex_lock(&g_driver_lock);
+ rc = spdk_pci_enumerate(spdk_pci_idxd_get_driver(), idxd_enum_cb, &enum_ctx);
+ pthread_mutex_unlock(&g_driver_lock);
+
+ return rc;
+}
+
+void
+spdk_idxd_detach(struct spdk_idxd_device *idxd)
+{
+ idxd_device_destruct(idxd);
+}
+
+static struct idxd_hw_desc *
+_idxd_prep_command(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn,
+ void *cb_arg, struct idxd_batch *batch)
+{
+ uint32_t index;
+ struct idxd_hw_desc *desc;
+ struct idxd_comp *comp;
+
+ index = spdk_bit_array_find_first_clear(chan->ring_ctrl.ring_slots, 0);
+ if (index == UINT32_MAX) {
+ /* ran out of ring slots */
+ return NULL;
+ }
+
+ spdk_bit_array_set(chan->ring_ctrl.ring_slots, index);
+
+ desc = &chan->ring_ctrl.desc[index];
+ comp = &chan->ring_ctrl.completions[index];
+
+ desc->flags = IDXD_FLAG_COMPLETION_ADDR_VALID | IDXD_FLAG_REQUEST_COMPLETION;
+ desc->completion_addr = (uintptr_t)&comp->hw;
+ comp->cb_arg = cb_arg;
+ comp->cb_fn = cb_fn;
+ if (batch) {
+ comp->batch = batch;
+ batch->batch_desc_index = index;
+ }
+
+ return desc;
+}
+
+int
+spdk_idxd_submit_copy(struct spdk_idxd_io_channel *chan, void *dst, const void *src,
+ uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ /* Common prep. */
+ desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL);
+ if (desc == NULL) {
+ return -EBUSY;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_MEMMOVE;
+ desc->src_addr = (uintptr_t)src;
+ desc->dst_addr = (uintptr_t)dst;
+ desc->xfer_size = nbytes;
+
+ /* Submit operation. */
+ movdir64b(chan->ring_ctrl.portal, desc);
+
+ return 0;
+}
+
+/* Dual-cast copies the same source to two separate destination buffers. */
+int
+spdk_idxd_submit_dualcast(struct spdk_idxd_io_channel *chan, void *dst1, void *dst2,
+ const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) {
+ SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n");
+ return -EINVAL;
+ }
+
+ /* Common prep. */
+ desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL);
+ if (desc == NULL) {
+ return -EBUSY;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_DUALCAST;
+ desc->src_addr = (uintptr_t)src;
+ desc->dst_addr = (uintptr_t)dst1;
+ desc->dest2 = (uintptr_t)dst2;
+ desc->xfer_size = nbytes;
+
+ /* Submit operation. */
+ movdir64b(chan->ring_ctrl.portal, desc);
+
+ return 0;
+}
+
+int
+spdk_idxd_submit_compare(struct spdk_idxd_io_channel *chan, void *src1, const void *src2,
+ uint64_t nbytes,
+ spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ /* Common prep. */
+ desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL);
+ if (desc == NULL) {
+ return -EBUSY;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_COMPARE;
+ desc->src_addr = (uintptr_t)src1;
+ desc->src2_addr = (uintptr_t)src2;
+ desc->xfer_size = nbytes;
+
+ /* Submit operation. */
+ movdir64b(chan->ring_ctrl.portal, desc);
+
+ return 0;
+}
+
+int
+spdk_idxd_submit_fill(struct spdk_idxd_io_channel *chan, void *dst, uint64_t fill_pattern,
+ uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ /* Common prep. */
+ desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL);
+ if (desc == NULL) {
+ return -EBUSY;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_MEMFILL;
+ desc->pattern = fill_pattern;
+ desc->dst_addr = (uintptr_t)dst;
+ desc->xfer_size = nbytes;
+
+ /* Submit operation. */
+ movdir64b(chan->ring_ctrl.portal, desc);
+
+ return 0;
+}
+
+int
+spdk_idxd_submit_crc32c(struct spdk_idxd_io_channel *chan, uint32_t *dst, void *src,
+ uint32_t seed, uint64_t nbytes,
+ spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ /* Common prep. */
+ desc = _idxd_prep_command(chan, cb_fn, cb_arg, NULL);
+ if (desc == NULL) {
+ return -EBUSY;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_CRC32C_GEN;
+ desc->dst_addr = (uintptr_t)dst;
+ desc->src_addr = (uintptr_t)src;
+ desc->flags &= IDXD_CLEAR_CRC_FLAGS;
+ desc->crc32c.seed = seed;
+ desc->xfer_size = nbytes;
+
+ /* Submit operation. */
+ movdir64b(chan->ring_ctrl.portal, desc);
+
+ return 0;
+}
+
+uint32_t
+spdk_idxd_batch_get_max(void)
+{
+ return DESC_PER_BATCH; /* TODO maybe add startup RPC to set this */
+}
+
+struct idxd_batch *
+spdk_idxd_batch_create(struct spdk_idxd_io_channel *chan)
+{
+ struct idxd_batch *batch = NULL;
+
+ if (!TAILQ_EMPTY(&chan->batch_pool)) {
+ batch = TAILQ_FIRST(&chan->batch_pool);
+ TAILQ_REMOVE(&chan->batch_pool, batch, link);
+ } else {
+ /* The application needs to handle this. */
+ return NULL;
+ }
+
+ batch->batch_num = spdk_bit_array_find_first_clear(chan->ring_ctrl.user_ring_slots, 0);
+ if (batch->batch_num == UINT32_MAX) {
+ /* ran out of ring slots, the application needs to handle this. */
+ TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link);
+ return NULL;
+ }
+
+ spdk_bit_array_set(chan->ring_ctrl.user_ring_slots, batch->batch_num);
+
+ /*
+ * Find the first descriptor address for the given batch. The
+	 * descriptor ring used for user descriptors is allocated in
+ * units of DESC_PER_BATCH. The actual index is in units of
+ * one descriptor.
+ */
+ batch->start_index = batch->cur_index = batch->batch_num * DESC_PER_BATCH;
+
+ TAILQ_INSERT_TAIL(&chan->batches, batch, link);
+ SPDK_DEBUGLOG(SPDK_LOG_IDXD, "New batch %p num %u\n", batch, batch->batch_num);
+
+ return batch;
+}
+
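+/*
+ * Illustrative batch flow (not taken from this file): a caller typically does
+ * spdk_idxd_batch_create(), one or more spdk_idxd_batch_prep_*() calls,
+ * spdk_idxd_batch_submit(), and then polls spdk_idxd_process_events() until
+ * all callbacks have fired.
+ */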
+static bool
+_does_batch_exist(struct idxd_batch *batch, struct spdk_idxd_io_channel *chan)
+{
+ bool found = false;
+ struct idxd_batch *cur_batch;
+
+ TAILQ_FOREACH(cur_batch, &chan->batches, link) {
+ if (cur_batch == batch) {
+ found = true;
+ break;
+ }
+ }
+
+ return found;
+}
+
+int
+spdk_idxd_batch_cancel(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch)
+{
+ if (_does_batch_exist(batch, chan) == false) {
+		SPDK_ERRLOG("Attempt to cancel a batch that doesn't exist.\n");
+ return -EINVAL;
+ }
+
+ if (batch->remaining > 0) {
+		SPDK_ERRLOG("Cannot cancel batch, already submitted to HW.\n");
+ return -EINVAL;
+ }
+
+ TAILQ_REMOVE(&chan->batches, batch, link);
+ spdk_bit_array_clear(chan->ring_ctrl.user_ring_slots, batch->batch_num);
+ TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link);
+
+ return 0;
+}
+
+int
+spdk_idxd_batch_submit(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch,
+ spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ if (_does_batch_exist(batch, chan) == false) {
+		SPDK_ERRLOG("Attempt to submit a batch that doesn't exist.\n");
+ return -EINVAL;
+ }
+
+ /* Common prep. */
+ desc = _idxd_prep_command(chan, cb_fn, cb_arg, batch);
+ if (desc == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_IDXD, "Can't submit batch %p busy batch num %u\n", batch, batch->batch_num);
+ return -EBUSY;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_BATCH;
+ desc->desc_list_addr = (uintptr_t)&chan->ring_ctrl.user_desc[batch->start_index];
+ desc->desc_count = batch->cur_index - batch->start_index;
+ assert(desc->desc_count <= DESC_PER_BATCH);
+
+ if (desc->desc_count < MIN_USER_DESC_COUNT) {
+ SPDK_ERRLOG("Attempt to submit a batch without at least %u operations.\n",
+ MIN_USER_DESC_COUNT);
+ return -EINVAL;
+ }
+
+ /* Total completions for the batch = num desc plus 1 for the batch desc itself. */
+ batch->remaining = desc->desc_count + 1;
+
+ /* Submit operation. */
+ movdir64b(chan->ring_ctrl.portal, desc);
+
+ return 0;
+}
+
+static struct idxd_hw_desc *
+_idxd_prep_batch_cmd(struct spdk_idxd_io_channel *chan, spdk_idxd_req_cb cb_fn,
+ void *cb_arg, struct idxd_batch *batch)
+{
+ struct idxd_hw_desc *desc;
+ struct idxd_comp *comp;
+
+ if (_does_batch_exist(batch, chan) == false) {
+		SPDK_ERRLOG("Attempt to add to a batch that doesn't exist.\n");
+ return NULL;
+ }
+
+ if ((batch->cur_index - batch->start_index) == DESC_PER_BATCH) {
+		SPDK_ERRLOG("Attempt to add to a batch that is already full.\n");
+ return NULL;
+ }
+
+ desc = &chan->ring_ctrl.user_desc[batch->cur_index];
+ comp = &chan->ring_ctrl.user_completions[batch->cur_index];
+ SPDK_DEBUGLOG(SPDK_LOG_IDXD, "Prep batch %p index %u\n", batch, batch->cur_index);
+
+ batch->cur_index++;
+ assert(batch->cur_index > batch->start_index);
+
+ desc->flags = IDXD_FLAG_COMPLETION_ADDR_VALID | IDXD_FLAG_REQUEST_COMPLETION;
+ desc->completion_addr = (uintptr_t)&comp->hw;
+ comp->cb_arg = cb_arg;
+ comp->cb_fn = cb_fn;
+ comp->batch = batch;
+
+ return desc;
+}
+
+int
+spdk_idxd_batch_prep_copy(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch,
+ void *dst, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ /* Common prep. */
+ desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch);
+ if (desc == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_MEMMOVE;
+ desc->src_addr = (uintptr_t)src;
+ desc->dst_addr = (uintptr_t)dst;
+ desc->xfer_size = nbytes;
+
+ return 0;
+}
+
+int
+spdk_idxd_batch_prep_fill(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch,
+ void *dst, uint64_t fill_pattern, uint64_t nbytes,
+ spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ /* Common prep. */
+ desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch);
+ if (desc == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_MEMFILL;
+ desc->pattern = fill_pattern;
+ desc->dst_addr = (uintptr_t)dst;
+ desc->xfer_size = nbytes;
+
+ return 0;
+}
+
+int
+spdk_idxd_batch_prep_dualcast(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch,
+ void *dst1, void *dst2, const void *src, uint64_t nbytes, spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ if ((uintptr_t)dst1 & (ALIGN_4K - 1) || (uintptr_t)dst2 & (ALIGN_4K - 1)) {
+ SPDK_ERRLOG("Dualcast requires 4K alignment on dst addresses\n");
+ return -EINVAL;
+ }
+
+ /* Common prep. */
+ desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch);
+ if (desc == NULL) {
+ return -EINVAL;
+ }
+ desc->opcode = IDXD_OPCODE_DUALCAST;
+ desc->src_addr = (uintptr_t)src;
+ desc->dst_addr = (uintptr_t)dst1;
+ desc->dest2 = (uintptr_t)dst2;
+ desc->xfer_size = nbytes;
+
+ return 0;
+}
+
+int
+spdk_idxd_batch_prep_crc32c(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch,
+ uint32_t *dst, void *src, uint32_t seed, uint64_t nbytes,
+ spdk_idxd_req_cb cb_fn, void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ /* Common prep. */
+ desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch);
+ if (desc == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_CRC32C_GEN;
+ desc->dst_addr = (uintptr_t)dst;
+ desc->src_addr = (uintptr_t)src;
+ desc->flags &= IDXD_CLEAR_CRC_FLAGS;
+ desc->crc32c.seed = seed;
+ desc->xfer_size = nbytes;
+
+ return 0;
+}
+
+int
+spdk_idxd_batch_prep_compare(struct spdk_idxd_io_channel *chan, struct idxd_batch *batch,
+ void *src1, void *src2, uint64_t nbytes, spdk_idxd_req_cb cb_fn,
+ void *cb_arg)
+{
+ struct idxd_hw_desc *desc;
+
+ /* Common prep. */
+ desc = _idxd_prep_batch_cmd(chan, cb_fn, cb_arg, batch);
+ if (desc == NULL) {
+ return -EINVAL;
+ }
+
+ /* Command specific. */
+ desc->opcode = IDXD_OPCODE_COMPARE;
+ desc->src_addr = (uintptr_t)src1;
+ desc->src2_addr = (uintptr_t)src2;
+ desc->xfer_size = nbytes;
+
+ return 0;
+}
+
+static void
+_dump_error_reg(struct spdk_idxd_io_channel *chan)
+{
+ uint64_t sw_error_0;
+ uint16_t i;
+
+ sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET);
+
+	SPDK_NOTICELOG("SW Error bits set:\n");
+ for (i = 0; i < CHAR_BIT; i++) {
+ if ((1ULL << i) & sw_error_0) {
+ SPDK_NOTICELOG(" %d\n", i);
+ }
+ }
+	SPDK_NOTICELOG("SW Error code: %#x\n", (uint8_t)(sw_error_0 >> 8));
+ SPDK_NOTICELOG("SW Error WQ index: %u\n", (uint8_t)(sw_error_0 >> 16));
+ SPDK_NOTICELOG("SW Error Operation: %u\n", (uint8_t)(sw_error_0 >> 32));
+}
+
+static void
+_free_batch(struct idxd_batch *batch, struct spdk_idxd_io_channel *chan,
+ struct idxd_comp *comp)
+{
+ TAILQ_REMOVE(&chan->batches, batch, link);
+ TAILQ_INSERT_TAIL(&chan->batch_pool, batch, link);
+ comp->batch = NULL;
+ spdk_bit_array_clear(chan->ring_ctrl.user_ring_slots, batch->batch_num);
+ spdk_bit_array_clear(chan->ring_ctrl.ring_slots, batch->batch_desc_index);
+}
+
+static void
+_spdk_idxd_process_batch_events(struct spdk_idxd_io_channel *chan)
+{
+ uint16_t index;
+ struct idxd_comp *comp;
+ uint64_t sw_error_0;
+ int status = 0;
+ struct idxd_batch *batch;
+
+ /*
+ * We don't check the bit array for user completions as there's only
+	 * one bit per batch.
+ */
+ for (index = 0; index < TOTAL_USER_DESC; index++) {
+ comp = &chan->ring_ctrl.user_completions[index];
+ if (comp->hw.status == 1) {
+ struct idxd_hw_desc *desc;
+
+ sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET);
+ if (sw_error_0 & 0x1) {
+ _dump_error_reg(chan);
+ status = -EINVAL;
+ }
+
+ desc = &chan->ring_ctrl.user_desc[index];
+ switch (desc->opcode) {
+ case IDXD_OPCODE_CRC32C_GEN:
+ *(uint32_t *)desc->dst_addr = comp->hw.crc32c_val;
+ *(uint32_t *)desc->dst_addr ^= ~0;
+ break;
+ case IDXD_OPCODE_COMPARE:
+ if (status == 0) {
+ status = comp->hw.result;
+ }
+ break;
+ case IDXD_OPCODE_MEMFILL:
+ case IDXD_OPCODE_DUALCAST:
+ case IDXD_OPCODE_MEMMOVE:
+ break;
+ default:
+ assert(false);
+ break;
+ }
+
+			/* The hw completes all user descriptors before the batch
+			 * descriptor itself (see the spec for configuration exceptions);
+			 * however, because of the order in which the poller checks for
+			 * completions, we may "see" them in a different order than they
+			 * actually completed in.
+ */
+ batch = comp->batch;
+ assert(batch->remaining > 0);
+ if (--batch->remaining == 0) {
+ _free_batch(batch, chan, comp);
+ }
+
+ comp->cb_fn((void *)comp->cb_arg, status);
+ comp->hw.status = status = 0;
+ }
+ }
+}
+
+/*
+ * TODO: Experiment with different methods of reaping completions for performance
+ * once we have real silicon.
+ */
+void
+spdk_idxd_process_events(struct spdk_idxd_io_channel *chan)
+{
+ uint16_t index;
+ struct idxd_comp *comp;
+ uint64_t sw_error_0;
+ int status = 0;
+ struct idxd_batch *batch;
+
+ if (!TAILQ_EMPTY(&chan->batches)) {
+ _spdk_idxd_process_batch_events(chan);
+ }
+
+ for (index = 0; index < chan->ring_ctrl.max_ring_slots; index++) {
+ if (spdk_bit_array_get(chan->ring_ctrl.ring_slots, index)) {
+ comp = &chan->ring_ctrl.completions[index];
+ if (comp->hw.status == 1) {
+ struct idxd_hw_desc *desc;
+
+ sw_error_0 = _idxd_read_8(chan->idxd, IDXD_SWERR_OFFSET);
+ if (sw_error_0 & 0x1) {
+ _dump_error_reg(chan);
+ status = -EINVAL;
+ }
+
+ desc = &chan->ring_ctrl.desc[index];
+ switch (desc->opcode) {
+ case IDXD_OPCODE_BATCH:
+				/* The hw completes all user descriptors before the batch
+				 * descriptor itself (see the spec for configuration exceptions);
+				 * however, because of the order in which the poller checks for
+				 * completions, we may "see" them in a different order than they
+				 * actually completed in.
+ */
+ batch = comp->batch;
+ assert(batch->remaining > 0);
+ if (--batch->remaining == 0) {
+ _free_batch(batch, chan, comp);
+ }
+ break;
+ case IDXD_OPCODE_CRC32C_GEN:
+ *(uint32_t *)desc->dst_addr = comp->hw.crc32c_val;
+ *(uint32_t *)desc->dst_addr ^= ~0;
+ break;
+ case IDXD_OPCODE_COMPARE:
+ if (status == 0) {
+ status = comp->hw.result;
+ }
+ break;
+ }
+
+ comp->cb_fn(comp->cb_arg, status);
+ comp->hw.status = status = 0;
+ if (desc->opcode != IDXD_OPCODE_BATCH) {
+ spdk_bit_array_clear(chan->ring_ctrl.ring_slots, index);
+ }
+ }
+ }
+ }
+}
+
+SPDK_LOG_REGISTER_COMPONENT("idxd", SPDK_LOG_IDXD)
diff --git a/src/spdk/lib/idxd/idxd.h b/src/spdk/lib/idxd/idxd.h
new file mode 100644
index 000000000..09d021152
--- /dev/null
+++ b/src/spdk/lib/idxd/idxd.h
@@ -0,0 +1,188 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __IDXD_H__
+#define __IDXD_H__
+
+#include "spdk/stdinc.h"
+
+#include "spdk/idxd.h"
+#include "spdk/queue.h"
+#include "spdk/mmio.h"
+#include "spdk/bit_array.h"
+
+#include "idxd_spec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* TODO: get the gcc intrinsic to work. */
+#define nop() asm volatile ("nop")
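+/*
+ * movdir64b: 64-byte direct store used to push a whole descriptor to the
+ * device portal in a single operation. Emitted as raw opcode bytes since the
+ * assembler may not know the mnemonic yet.
+ */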
+static inline void movdir64b(void *dst, const void *src)
+{
+ asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+ : "=m"(*(char *)dst)
+ : "d"(src), "a"(dst));
+}
+
+#define IDXD_REGISTER_TIMEOUT_US 50
+#define IDXD_DRAIN_TIMEOUT_US 500000
+
+/* TODO: make some of these RPC selectable */
+#define WQ_MODE_DEDICATED 1
+#define LOG2_WQ_MAX_BATCH 8 /* 2^8 = 256 */
+#define LOG2_WQ_MAX_XFER 30 /* 2^30 = 1073741824 */
+#define WQCFG_NUM_DWORDS 8
+#define WQ_PRIORITY_1 1
+#define IDXD_MAX_QUEUES 64
+
+#define TOTAL_USER_DESC (1 << LOG2_WQ_MAX_BATCH)
+#define DESC_PER_BATCH 16 /* TODO maybe make this a startup RPC */
+#define NUM_BATCHES (TOTAL_USER_DESC / DESC_PER_BATCH)
+#define MIN_USER_DESC_COUNT 2
+
+struct idxd_batch {
+ uint32_t batch_desc_index;
+ uint32_t batch_num;
+ uint32_t cur_index;
+ uint32_t start_index;
+ uint32_t remaining;
+ TAILQ_ENTRY(idxd_batch) link;
+};
+
+struct device_config {
+ uint8_t config_num;
+ uint8_t num_wqs_per_group;
+ uint8_t num_engines_per_group;
+ uint8_t num_groups;
+ uint16_t total_wqs;
+ uint16_t total_engines;
+};
+
+struct idxd_ring_control {
+ void *portal;
+
+ uint16_t ring_size;
+
+ /*
+ * The descriptor ring and the completion ring for this
+ * channel share the same index. Batch descriptors are
+ * managed independently of data descriptors.
+ */
+ struct idxd_hw_desc *desc;
+ struct idxd_comp *completions;
+ struct idxd_hw_desc *user_desc;
+ struct idxd_comp *user_completions;
+
+ /*
+ * We use one bit array to track ring slots for both
+ * desc and completions.
+ */
+ struct spdk_bit_array *ring_slots;
+ uint32_t max_ring_slots;
+
+ /*
+ * We use a separate bit array to track ring slots for
+ * descriptors submitted by the user as part of a batch.
+ */
+ struct spdk_bit_array *user_ring_slots;
+};
+
+struct spdk_idxd_io_channel {
+ struct spdk_idxd_device *idxd;
+ struct idxd_ring_control ring_ctrl;
+ TAILQ_HEAD(, idxd_batch) batch_pool; /* free batches */
+ TAILQ_HEAD(, idxd_batch) batches; /* in use batches */
+};
+
+struct pci_dev_id {
+ int vendor_id;
+ int device_id;
+};
+
+struct idxd_group {
+ struct spdk_idxd_device *idxd;
+ struct idxd_grpcfg grpcfg;
+ struct pci_dev_id pcidev;
+ int num_engines;
+ int num_wqs;
+ int id;
+ uint8_t tokens_allowed;
+ bool use_token_limit;
+ uint8_t tokens_reserved;
+ int tc_a;
+ int tc_b;
+};
+
+/*
+ * This struct wraps the hardware completion record, which is 32 bytes in
+ * size and must be 32-byte aligned. It is padded out to 64 bytes so that
+ * the hw record stays aligned when these are allocated as an array.
+ */
+struct idxd_comp {
+ struct idxd_hw_comp_record hw;
+ void *cb_arg;
+ spdk_idxd_req_cb cb_fn;
+ struct idxd_batch *batch;
+ uint64_t pad2;
+} __attribute__((packed));
+SPDK_STATIC_ASSERT(sizeof(struct idxd_comp) == 64, "size mismatch");
+
+struct idxd_wq {
+ struct spdk_idxd_device *idxd;
+ struct idxd_group *group;
+ union idxd_wqcfg wqcfg;
+};
+
+struct spdk_idxd_device {
+ struct spdk_pci_device *device;
+ void *reg_base;
+ void *portals;
+ int socket_id;
+ int wq_id;
+
+ struct idxd_registers registers;
+ uint32_t ims_offset;
+ uint32_t msix_perm_offset;
+ uint32_t wqcfg_offset;
+ uint32_t grpcfg_offset;
+ uint32_t perfmon_offset;
+ struct idxd_group *groups;
+ struct idxd_wq *queues;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __IDXD_H__ */
diff --git a/src/spdk/lib/idxd/idxd_spec.h b/src/spdk/lib/idxd/idxd_spec.h
new file mode 100644
index 000000000..51d52cdcc
--- /dev/null
+++ b/src/spdk/lib/idxd/idxd_spec.h
@@ -0,0 +1,503 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * \file
+ * IDXD specification definitions
+ */
+
+#ifndef SPDK_IDXD_SPEC_H
+#define SPDK_IDXD_SPEC_H
+
+#include "spdk/stdinc.h"
+#include "spdk/assert.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IDXD_MMIO_BAR 0
+#define IDXD_WQ_BAR 2
+#define PORTAL_SIZE (4096 * 4)
+
+#define CFG_ENGINE_OFFSET 0x20
+#define CFG_FLAG_OFFSET 0x28
+
+#define IDXD_CMD_SHIFT 20
+
+#define IDXD_VERSION_OFFSET 0x00
+#define IDXD_GENCAP_OFFSET 0x10
+#define IDXD_WQCAP_OFFSET 0x20
+#define IDXD_GRPCAP_OFFSET 0x30
+#define IDXD_OPCAP_OFFSET 0x40
+#define IDXD_ENGCAP_OFFSET 0x38
+#define IDXD_TABLE_OFFSET 0x60
+#define IDXD_GENCFG_OFFSET 0x80
+#define IDXD_GENCTRL_OFFSET 0x88
+#define IDXD_GENSTATUS_OFFSET 0x90
+#define IDXD_INTCAUSE_OFFSET 0x98
+#define IDXD_CMD_OFFSET 0xa0
+#define IDXD_CMDSTS_OFFSET 0xa8
+#define IDXD_SWERR_OFFSET 0xc0
+#define IDXD_TABLE_OFFSET_MULT 0x100
+
+#define IDXD_OPCAP_WORDS 0x4
+
+#define IDXD_CLEAR_CRC_FLAGS 0xFFFFu
+
+#define IDXD_FLAG_FENCE (1 << 0)
+#define IDXD_FLAG_COMPLETION_ADDR_VALID (1 << 2)
+#define IDXD_FLAG_REQUEST_COMPLETION (1 << 3)
+#define IDXD_FLAG_CACHE_CONTROL (1 << 8)
+
+/*
+ * IDXD is a family of devices; DSA is the only one
+ * currently supported.
+ */
+enum dsa_completion_status {
+ IDXD_COMP_NONE = 0,
+ IDXD_COMP_SUCCESS = 1,
+ IDXD_COMP_SUCCESS_PRED = 2,
+ IDXD_COMP_PAGE_FAULT_NOBOF = 3,
+ IDXD_COMP_PAGE_FAULT_IR = 4,
+ IDXD_COMP_BATCH_FAIL = 5,
+ IDXD_COMP_BATCH_PAGE_FAULT = 6,
+ IDXD_COMP_DR_OFFSET_NOINC = 7,
+ IDXD_COMP_DR_OFFSET_ERANGE = 8,
+ IDXD_COMP_DIF_ERR = 9,
+ IDXD_COMP_BAD_OPCODE = 16,
+ IDXD_COMP_INVALID_FLAGS = 17,
+ IDXD_COMP_NOZERO_RESERVE = 18,
+ IDXD_COMP_XFER_ERANGE = 19,
+ IDXD_COMP_DESC_CNT_ERANGE = 20,
+ IDXD_COMP_DR_ERANGE = 21,
+ IDXD_COMP_OVERLAP_BUFFERS = 22,
+ IDXD_COMP_DCAST_ERR = 23,
+ IDXD_COMP_DESCLIST_ALIGN = 24,
+ IDXD_COMP_INT_HANDLE_INVAL = 25,
+ IDXD_COMP_CRA_XLAT = 26,
+ IDXD_COMP_CRA_ALIGN = 27,
+ IDXD_COMP_ADDR_ALIGN = 28,
+ IDXD_COMP_PRIV_BAD = 29,
+ IDXD_COMP_TRAFFIC_CLASS_CONF = 30,
+ IDXD_COMP_PFAULT_RDBA = 31,
+ IDXD_COMP_HW_ERR1 = 32,
+ IDXD_COMP_HW_ERR_DRB = 33,
+ IDXD_COMP_TRANSLATION_FAIL = 34,
+};
+
+enum idxd_wq_state {
+ WQ_DISABLED = 0,
+ WQ_ENABLED = 1,
+};
+
+enum idxd_wq_flag {
+ WQ_FLAG_DEDICATED = 0,
+ WQ_FLAG_BOF = 1,
+};
+
+enum idxd_wq_type {
+ WQT_NONE = 0,
+ WQT_KERNEL = 1,
+ WQT_USER = 2,
+ WQT_MDEV = 3,
+};
+
+enum idxd_dev_state {
+ IDXD_DEVICE_STATE_DISABLED = 0,
+ IDXD_DEVICE_STATE_ENABLED = 1,
+ IDXD_DEVICE_STATE_DRAIN = 2,
+ IDXD_DEVICE_STATE_HALT = 3,
+};
+
+enum idxd_device_reset_type {
+ IDXD_DEVICE_RESET_SOFTWARE = 0,
+ IDXD_DEVICE_RESET_FLR = 1,
+ IDXD_DEVICE_RESET_WARM = 2,
+ IDXD_DEVICE_RESET_COLD = 3,
+};
+
+enum idxd_cmds {
+ IDXD_ENABLE_DEV = 1,
+ IDXD_DISABLE_DEV = 2,
+ IDXD_DRAIN_ALL = 3,
+ IDXD_ABORT_ALL = 4,
+ IDXD_RESET_DEVICE = 5,
+ IDXD_ENABLE_WQ = 6,
+ IDXD_DISABLE_WQ = 7,
+ IDXD_DRAIN_WQ = 8,
+ IDXD_ABORT_WQ = 9,
+ IDXD_RESET_WQ = 10,
+};
+
+enum idxd_cmdsts_err {
+ IDXD_CMDSTS_SUCCESS = 0,
+ IDXD_CMDSTS_INVAL_CMD = 1,
+ IDXD_CMDSTS_INVAL_WQIDX = 2,
+ IDXD_CMDSTS_HW_ERR = 3,
+ IDXD_CMDSTS_ERR_DEV_ENABLED = 16,
+ IDXD_CMDSTS_ERR_CONFIG = 17,
+ IDXD_CMDSTS_ERR_BUSMASTER_EN = 18,
+ IDXD_CMDSTS_ERR_PASID_INVAL = 19,
+ IDXD_CMDSTS_ERR_WQ_SIZE_ERANGE = 20,
+ IDXD_CMDSTS_ERR_GRP_CONFIG = 21,
+ IDXD_CMDSTS_ERR_GRP_CONFIG2 = 22,
+ IDXD_CMDSTS_ERR_GRP_CONFIG3 = 23,
+ IDXD_CMDSTS_ERR_GRP_CONFIG4 = 24,
+ IDXD_CMDSTS_ERR_DEV_NOTEN = 32,
+ IDXD_CMDSTS_ERR_WQ_ENABLED = 33,
+ IDXD_CMDSTS_ERR_WQ_SIZE = 34,
+ IDXD_CMDSTS_ERR_WQ_PRIOR = 35,
+ IDXD_CMDSTS_ERR_WQ_MODE = 36,
+ IDXD_CMDSTS_ERR_BOF_EN = 37,
+ IDXD_CMDSTS_ERR_PASID_EN = 38,
+ IDXD_CMDSTS_ERR_MAX_BATCH_SIZE = 39,
+ IDXD_CMDSTS_ERR_MAX_XFER_SIZE = 40,
+ IDXD_CMDSTS_ERR_DIS_DEV_EN = 49,
+ IDXD_CMDSTS_ERR_DEV_NOT_EN = 50,
+ IDXD_CMDSTS_ERR_INVAL_INT_IDX = 65,
+ IDXD_CMDSTS_ERR_NO_HANDLE = 66,
+};
+
+enum idxd_wq_hw_state {
+ IDXD_WQ_DEV_DISABLED = 0,
+ IDXD_WQ_DEV_ENABLED = 1,
+ IDXD_WQ_DEV_BUSY = 2,
+};
+
+struct idxd_hw_desc {
+ uint32_t pasid: 20;
+ uint32_t rsvd: 11;
+ uint32_t priv: 1;
+ uint32_t flags: 24;
+ uint32_t opcode: 8;
+ uint64_t completion_addr;
+ union {
+ uint64_t src_addr;
+ uint64_t readback_addr;
+ uint64_t pattern;
+ uint64_t desc_list_addr;
+ };
+ union {
+ uint64_t dst_addr;
+ uint64_t readback_addr2;
+ uint64_t src2_addr;
+ uint64_t comp_pattern;
+ };
+ union {
+ uint32_t xfer_size;
+ uint32_t desc_count;
+ };
+ uint16_t int_handle;
+ uint16_t rsvd1;
+ union {
+ uint8_t expected_res;
+ struct delta {
+ uint64_t addr;
+ uint32_t max_size;
+ } delta;
+ uint32_t delta_rec_size;
+ uint64_t dest2;
+ struct crc32c {
+ uint32_t seed;
+ uint32_t rsvd;
+ uint64_t addr;
+ } crc32c;
+ struct dif_chk {
+ uint8_t src_flags;
+ uint8_t rsvd1;
+ uint8_t flags;
+ uint8_t rsvd2[5];
+ uint32_t ref_tag_seed;
+ uint16_t app_tag_mask;
+ uint16_t app_tag_seed;
+ } dif_chk;
+ struct dif_ins {
+ uint8_t rsvd1;
+ uint8_t dest_flag;
+ uint8_t flags;
+ uint8_t rsvd2[13];
+ uint32_t ref_tag_seed;
+ uint16_t app_tag_mask;
+ uint16_t app_tag_seed;
+ } dif_ins;
+ struct dif_upd {
+ uint8_t src_flags;
+ uint8_t dest_flags;
+ uint8_t flags;
+ uint8_t rsvd[5];
+ uint32_t src_ref_tag_seed;
+ uint16_t src_app_tag_mask;
+ uint16_t src_app_tag_seed;
+ uint32_t dest_ref_tag_seed;
+ uint16_t dest_app_tag_mask;
+ uint16_t dest_app_tag_seed;
+ } dif_upd;
+ uint8_t op_specific[24];
+ };
+} __attribute__((packed));
+SPDK_STATIC_ASSERT(sizeof(struct idxd_hw_desc) == 64, "size mismatch");
+
+struct idxd_hw_comp_record {
+ volatile uint8_t status;
+ union {
+ uint8_t result;
+ uint8_t dif_status;
+ };
+ uint16_t rsvd;
+ uint32_t bytes_completed;
+ uint64_t fault_addr;
+ union {
+ uint32_t delta_rec_size;
+ uint32_t crc32c_val;
+ struct {
+ uint32_t dif_chk_ref_tag;
+ uint16_t dif_chk_app_tag_mask;
+ uint16_t dif_chk_app_tag;
+ };
+ struct dif_ins_comp {
+ uint64_t rsvd;
+ uint32_t ref_tag;
+ uint16_t app_tag_mask;
+ uint16_t app_tag;
+ } dif_ins_comp;
+ struct dif_upd_comp {
+ uint32_t src_ref_tag;
+ uint16_t src_app_tag_mask;
+ uint16_t src_app_tag;
+ uint32_t dest_ref_tag;
+ uint16_t dest_app_tag_mask;
+ uint16_t dest_app_tag;
+ } dif_upd_comp;
+ uint8_t op_specific[16];
+ };
+} __attribute__((packed));
+SPDK_STATIC_ASSERT(sizeof(struct idxd_hw_comp_record) == 32, "size mismatch");
+
+union idxd_gencap_register {
+ struct {
+ uint64_t block_on_fault: 1;
+ uint64_t overlap_copy: 1;
+ uint64_t cache_control_mem: 1;
+ uint64_t cache_control_cache: 1;
+ uint64_t rsvd: 3;
+ uint64_t int_handle_req: 1;
+ uint64_t dest_readback: 1;
+ uint64_t drain_readback: 1;
+ uint64_t rsvd2: 6;
+ uint64_t max_xfer_shift: 5;
+ uint64_t max_batch_shift: 4;
+ uint64_t max_ims_mult: 6;
+ uint64_t config_en: 1;
+ uint64_t max_descs_per_engine: 8;
+ uint64_t rsvd3: 24;
+ } __attribute__((packed));
+ uint64_t raw;
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_gencap_register) == 8, "size mismatch");
+
+union idxd_wqcap_register {
+ struct {
+ uint64_t total_wq_size: 16;
+ uint64_t num_wqs: 8;
+ uint64_t rsvd: 24;
+ uint64_t shared_mode: 1;
+ uint64_t dedicated_mode: 1;
+ uint64_t rsvd2: 1;
+ uint64_t priority: 1;
+ uint64_t occupancy: 1;
+ uint64_t occupancy_int: 1;
+ uint64_t rsvd3: 10;
+ } __attribute__((packed));
+ uint64_t raw;
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_wqcap_register) == 8, "size mismatch");
+
+union idxd_groupcap_register {
+ struct {
+ uint64_t num_groups: 8;
+ uint64_t total_tokens: 8;
+ uint64_t token_en: 1;
+ uint64_t token_limit: 1;
+ uint64_t rsvd: 46;
+ } __attribute__((packed));
+ uint64_t raw;
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_groupcap_register) == 8, "size mismatch");
+
+union idxd_enginecap_register {
+ struct {
+ uint64_t num_engines: 8;
+ uint64_t rsvd: 56;
+ } __attribute__((packed));
+ uint64_t raw;
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_enginecap_register) == 8, "size mismatch");
+
+struct idxd_opcap_register {
+ uint64_t raw[4];
+};
+SPDK_STATIC_ASSERT(sizeof(struct idxd_opcap_register) == 32, "size mismatch");
+
+struct idxd_registers {
+ uint32_t version;
+ union idxd_gencap_register gencap;
+ union idxd_wqcap_register wqcap;
+ union idxd_groupcap_register groupcap;
+ union idxd_enginecap_register enginecap;
+ struct idxd_opcap_register opcap;
+};
+SPDK_STATIC_ASSERT(sizeof(struct idxd_registers) == 72, "size mismatch");
+
+union idxd_offsets_register {
+ struct {
+ uint64_t grpcfg: 16;
+ uint64_t wqcfg: 16;
+ uint64_t msix_perm: 16;
+ uint64_t ims: 16;
+ uint64_t perfmon: 16;
+ uint64_t rsvd: 48;
+ } __attribute__((packed));
+ uint64_t raw[2];
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_offsets_register) == 16, "size mismatch");
+
+union idxd_genstatus_register {
+ struct {
+ uint32_t state: 2;
+ uint32_t reset_type: 2;
+ uint32_t rsvd: 28;
+ } __attribute__((packed));
+ uint32_t raw;
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_genstatus_register) == 4, "size mismatch");
+
+union idxd_cmdsts_reg {
+ struct {
+ uint8_t err;
+ uint16_t result;
+ uint8_t rsvd: 7;
+ uint8_t active: 1;
+ } __attribute__((packed));
+ uint32_t raw;
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_cmdsts_reg) == 4, "size mismatch");
+
+union idxd_swerr_register {
+ struct {
+ uint64_t valid: 1;
+ uint64_t overflow: 1;
+ uint64_t desc_valid: 1;
+ uint64_t wq_idx_valid: 1;
+ uint64_t batch: 1;
+ uint64_t fault_rw: 1;
+ uint64_t priv: 1;
+ uint64_t rsvd: 1;
+ uint64_t error: 8;
+ uint64_t wq_idx: 8;
+ uint64_t rsvd2: 8;
+ uint64_t operation: 8;
+ uint64_t pasid: 20;
+ uint64_t rsvd3: 4;
+ uint64_t batch_idx: 16;
+ uint64_t rsvd4: 16;
+ uint64_t invalid_flags: 32;
+ uint64_t fault_addr;
+ uint64_t rsvd5;
+ } __attribute__((packed));
+ uint64_t raw[4];
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_swerr_register) == 32, "size mismatch");
+
+union idxd_group_flags {
+ struct {
+ uint32_t tc_a: 3;
+ uint32_t tc_b: 3;
+ uint32_t rsvd: 1;
+ uint32_t use_token_limit: 1;
+ uint32_t tokens_reserved: 8;
+ uint32_t rsvd2: 4;
+ uint32_t tokens_allowed: 8;
+ uint32_t rsvd3: 4;
+ } __attribute__((packed));
+ uint32_t raw;
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_group_flags) == 4, "size mismatch");
+
+struct idxd_grpcfg {
+ uint64_t wqs[4];
+ uint64_t engines;
+ union idxd_group_flags flags;
+};
+SPDK_STATIC_ASSERT(sizeof(struct idxd_grpcfg) == 48, "size mismatch");
+
+union idxd_wqcfg {
+ struct {
+ uint16_t wq_size;
+ uint16_t rsvd;
+ uint16_t wq_thresh;
+ uint16_t rsvd1;
+ uint32_t mode: 1;
+ uint32_t bof: 1;
+ uint32_t rsvd2: 2;
+ uint32_t priority: 4;
+ uint32_t pasid: 20;
+ uint32_t pasid_en: 1;
+ uint32_t priv: 1;
+ uint32_t rsvd3: 2;
+ uint32_t max_xfer_shift: 5;
+ uint32_t max_batch_shift: 4;
+ uint32_t rsvd4: 23;
+ uint16_t occupancy_inth;
+ uint16_t occupancy_table_sel: 1;
+ uint16_t rsvd5: 15;
+ uint16_t occupancy_limit;
+ uint16_t occupancy_int_en: 1;
+ uint16_t rsvd6: 15;
+ uint16_t occupancy;
+ uint16_t occupancy_int: 1;
+ uint16_t rsvd7: 12;
+ uint16_t mode_support: 1;
+ uint16_t wq_state: 2;
+ uint32_t rsvd8;
+ } __attribute__((packed));
+ uint32_t raw[8];
+};
+SPDK_STATIC_ASSERT(sizeof(union idxd_wqcfg) == 32, "size mismatch");
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SPDK_IDXD_SPEC_H */
diff --git a/src/spdk/lib/idxd/spdk_idxd.map b/src/spdk/lib/idxd/spdk_idxd.map
new file mode 100644
index 000000000..4bffdf209
--- /dev/null
+++ b/src/spdk/lib/idxd/spdk_idxd.map
@@ -0,0 +1,29 @@
+{
+ global:
+
+ # public functions
+ spdk_idxd_configure_chan;
+ spdk_idxd_reconfigure_chan;
+ spdk_idxd_probe;
+ spdk_idxd_detach;
+ spdk_idxd_batch_prep_copy;
+ spdk_idxd_batch_prep_dualcast;
+ spdk_idxd_batch_prep_fill;
+ spdk_idxd_batch_prep_crc32c;
+ spdk_idxd_batch_prep_compare;
+ spdk_idxd_batch_submit;
+ spdk_idxd_batch_create;
+ spdk_idxd_batch_cancel;
+ spdk_idxd_batch_get_max;
+ spdk_idxd_set_config;
+ spdk_idxd_submit_compare;
+ spdk_idxd_submit_crc32c;
+ spdk_idxd_submit_copy;
+ spdk_idxd_submit_dualcast;
+ spdk_idxd_submit_fill;
+ spdk_idxd_process_events;
+ spdk_idxd_get_channel;
+ spdk_idxd_put_channel;
+
+ local: *;
+};
diff --git a/src/spdk/lib/ioat/Makefile b/src/spdk/lib/ioat/Makefile
new file mode 100644
index 000000000..4cada5685
--- /dev/null
+++ b/src/spdk/lib/ioat/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+C_SRCS = ioat.c
+LIBNAME = ioat
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ioat.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/ioat/ioat.c b/src/spdk/lib/ioat/ioat.c
new file mode 100644
index 000000000..516fa545c
--- /dev/null
+++ b/src/spdk/lib/ioat/ioat.c
@@ -0,0 +1,775 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "ioat_internal.h"
+
+#include "spdk/env.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+
+#include "spdk_internal/log.h"
+
+struct ioat_driver {
+ pthread_mutex_t lock;
+ TAILQ_HEAD(, spdk_ioat_chan) attached_chans;
+};
+
+static struct ioat_driver g_ioat_driver = {
+ .lock = PTHREAD_MUTEX_INITIALIZER,
+ .attached_chans = TAILQ_HEAD_INITIALIZER(g_ioat_driver.attached_chans),
+};
+
+static uint64_t
+ioat_get_chansts(struct spdk_ioat_chan *ioat)
+{
+ return spdk_mmio_read_8(&ioat->regs->chansts);
+}
+
+static void
+ioat_write_chancmp(struct spdk_ioat_chan *ioat, uint64_t addr)
+{
+ spdk_mmio_write_8(&ioat->regs->chancmp, addr);
+}
+
+static void
+ioat_write_chainaddr(struct spdk_ioat_chan *ioat, uint64_t addr)
+{
+ spdk_mmio_write_8(&ioat->regs->chainaddr, addr);
+}
+
+static inline void
+ioat_suspend(struct spdk_ioat_chan *ioat)
+{
+ ioat->regs->chancmd = SPDK_IOAT_CHANCMD_SUSPEND;
+}
+
+static inline void
+ioat_reset(struct spdk_ioat_chan *ioat)
+{
+ ioat->regs->chancmd = SPDK_IOAT_CHANCMD_RESET;
+}
+
+static inline uint32_t
+ioat_reset_pending(struct spdk_ioat_chan *ioat)
+{
+ uint8_t cmd;
+
+ cmd = ioat->regs->chancmd;
+ return (cmd & SPDK_IOAT_CHANCMD_RESET) == SPDK_IOAT_CHANCMD_RESET;
+}
+
+static int
+ioat_map_pci_bar(struct spdk_ioat_chan *ioat)
+{
+ int regs_bar, rc;
+ void *addr;
+ uint64_t phys_addr, size;
+
+ regs_bar = 0;
+ rc = spdk_pci_device_map_bar(ioat->device, regs_bar, &addr, &phys_addr, &size);
+ if (rc != 0 || addr == NULL) {
+ SPDK_ERRLOG("pci_device_map_range failed with error code %d\n",
+ rc);
+ return -1;
+ }
+
+ ioat->regs = (volatile struct spdk_ioat_registers *)addr;
+
+ return 0;
+}
+
+static int
+ioat_unmap_pci_bar(struct spdk_ioat_chan *ioat)
+{
+ int rc = 0;
+ void *addr = (void *)ioat->regs;
+
+ if (addr) {
+ rc = spdk_pci_device_unmap_bar(ioat->device, 0, addr);
+ }
+ return rc;
+}
+
+
+static inline uint32_t
+ioat_get_active(struct spdk_ioat_chan *ioat)
+{
+ return (ioat->head - ioat->tail) & ((1 << ioat->ring_size_order) - 1);
+}
+
+static inline uint32_t
+ioat_get_ring_space(struct spdk_ioat_chan *ioat)
+{
+ return (1 << ioat->ring_size_order) - ioat_get_active(ioat) - 1;
+}
+
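+/*
+ * The ring size is a power of two (1 << ring_size_order), so masking
+ * with (size - 1) wraps a monotonically increasing index onto a ring slot.
+ */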
+static uint32_t
+ioat_get_ring_index(struct spdk_ioat_chan *ioat, uint32_t index)
+{
+ return index & ((1 << ioat->ring_size_order) - 1);
+}
+
+static void
+ioat_get_ring_entry(struct spdk_ioat_chan *ioat, uint32_t index,
+ struct ioat_descriptor **desc,
+ union spdk_ioat_hw_desc **hw_desc)
+{
+ uint32_t i = ioat_get_ring_index(ioat, index);
+
+ *desc = &ioat->ring[i];
+ *hw_desc = &ioat->hw_ring[i];
+}
+
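+/*
+ * Advancing the head only records the submission in software; the
+ * hardware is not notified until spdk_ioat_flush() is called.
+ */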
+static void
+ioat_submit_single(struct spdk_ioat_chan *ioat)
+{
+ ioat->head++;
+}
+
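+/*
+ * Request a completion update on the most recently prepared descriptor
+ * and write the new head to the DMA count register to notify the hardware.
+ */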
+void
+spdk_ioat_flush(struct spdk_ioat_chan *ioat)
+{
+ uint32_t index = ioat_get_ring_index(ioat, ioat->head - 1);
+ union spdk_ioat_hw_desc *hw_desc;
+
+ hw_desc = &ioat->hw_ring[index];
+ hw_desc->dma.u.control.completion_update = 1;
+ ioat->regs->dmacount = (uint16_t)ioat->head;
+}
+
+static struct ioat_descriptor *
+ioat_prep_null(struct spdk_ioat_chan *ioat)
+{
+ struct ioat_descriptor *desc;
+ union spdk_ioat_hw_desc *hw_desc;
+
+ if (ioat_get_ring_space(ioat) < 1) {
+ return NULL;
+ }
+
+ ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc);
+
+ hw_desc->dma.u.control_raw = 0;
+ hw_desc->dma.u.control.op = SPDK_IOAT_OP_COPY;
+ hw_desc->dma.u.control.null = 1;
+
+ hw_desc->dma.size = 8;
+ hw_desc->dma.src_addr = 0;
+ hw_desc->dma.dest_addr = 0;
+
+ desc->callback_fn = NULL;
+ desc->callback_arg = NULL;
+
+ ioat_submit_single(ioat);
+
+ return desc;
+}
+
+static struct ioat_descriptor *
+ioat_prep_copy(struct spdk_ioat_chan *ioat, uint64_t dst,
+ uint64_t src, uint32_t len)
+{
+ struct ioat_descriptor *desc;
+ union spdk_ioat_hw_desc *hw_desc;
+
+ assert(len <= ioat->max_xfer_size);
+
+ if (ioat_get_ring_space(ioat) < 1) {
+ return NULL;
+ }
+
+ ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc);
+
+ hw_desc->dma.u.control_raw = 0;
+ hw_desc->dma.u.control.op = SPDK_IOAT_OP_COPY;
+
+ hw_desc->dma.size = len;
+ hw_desc->dma.src_addr = src;
+ hw_desc->dma.dest_addr = dst;
+
+ desc->callback_fn = NULL;
+ desc->callback_arg = NULL;
+
+ ioat_submit_single(ioat);
+
+ return desc;
+}
+
+static struct ioat_descriptor *
+ioat_prep_fill(struct spdk_ioat_chan *ioat, uint64_t dst,
+ uint64_t fill_pattern, uint32_t len)
+{
+ struct ioat_descriptor *desc;
+ union spdk_ioat_hw_desc *hw_desc;
+
+ assert(len <= ioat->max_xfer_size);
+
+ if (ioat_get_ring_space(ioat) < 1) {
+ return NULL;
+ }
+
+ ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc);
+
+ hw_desc->fill.u.control_raw = 0;
+ hw_desc->fill.u.control.op = SPDK_IOAT_OP_FILL;
+
+ hw_desc->fill.size = len;
+ hw_desc->fill.src_data = fill_pattern;
+ hw_desc->fill.dest_addr = dst;
+
+ desc->callback_fn = NULL;
+ desc->callback_arg = NULL;
+
+ ioat_submit_single(ioat);
+
+ return desc;
+}
+
+static int ioat_reset_hw(struct spdk_ioat_chan *ioat)
+{
+ int timeout;
+ uint64_t status;
+ uint32_t chanerr;
+ int rc;
+
+ status = ioat_get_chansts(ioat);
+ if (is_ioat_active(status) || is_ioat_idle(status)) {
+ ioat_suspend(ioat);
+ }
+
+ timeout = 20; /* in milliseconds */
+ while (is_ioat_active(status) || is_ioat_idle(status)) {
+ spdk_delay_us(1000);
+ timeout--;
+ if (timeout == 0) {
+ SPDK_ERRLOG("timed out waiting for suspend\n");
+ return -1;
+ }
+ status = ioat_get_chansts(ioat);
+ }
+
+ /*
+ * Clear any outstanding errors.
+ * CHANERR is write-1-to-clear, so write the current CHANERR bits back to reset everything.
+ */
+ chanerr = ioat->regs->chanerr;
+ ioat->regs->chanerr = chanerr;
+
+ if (ioat->regs->cbver < SPDK_IOAT_VER_3_3) {
+ rc = spdk_pci_device_cfg_read32(ioat->device, &chanerr,
+ SPDK_IOAT_PCI_CHANERR_INT_OFFSET);
+ if (rc) {
+ SPDK_ERRLOG("failed to read the internal channel error register\n");
+ return -1;
+ }
+
+ spdk_pci_device_cfg_write32(ioat->device, chanerr,
+ SPDK_IOAT_PCI_CHANERR_INT_OFFSET);
+ }
+
+ ioat_reset(ioat);
+
+ timeout = 20;
+ while (ioat_reset_pending(ioat)) {
+ spdk_delay_us(1000);
+ timeout--;
+ if (timeout == 0) {
+ SPDK_ERRLOG("timed out waiting for reset\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+ioat_process_channel_events(struct spdk_ioat_chan *ioat)
+{
+ struct ioat_descriptor *desc;
+ uint64_t status, completed_descriptor, hw_desc_phys_addr, events_count = 0;
+ uint32_t tail;
+
+ if (ioat->head == ioat->tail) {
+ return 0;
+ }
+
+ status = *ioat->comp_update;
+ completed_descriptor = status & SPDK_IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK;
+
+ if (is_ioat_halted(status)) {
+ SPDK_ERRLOG("Channel halted (%x)\n", ioat->regs->chanerr);
+ return -1;
+ }
+
+ if (completed_descriptor == ioat->last_seen) {
+ return 0;
+ }
+
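+ /* The completion writeback holds the physical address of the last
+ * completed descriptor; walk the ring from the tail, invoking
+ * callbacks, until that descriptor is reached.
+ */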
+ do {
+ tail = ioat_get_ring_index(ioat, ioat->tail);
+ desc = &ioat->ring[tail];
+
+ if (desc->callback_fn) {
+ desc->callback_fn(desc->callback_arg);
+ }
+
+ hw_desc_phys_addr = desc->phys_addr;
+ ioat->tail++;
+ events_count++;
+ } while (hw_desc_phys_addr != completed_descriptor);
+
+ ioat->last_seen = hw_desc_phys_addr;
+
+ return events_count;
+}
+
+static void
+ioat_channel_destruct(struct spdk_ioat_chan *ioat)
+{
+ ioat_unmap_pci_bar(ioat);
+
+ if (ioat->ring) {
+ free(ioat->ring);
+ }
+
+ if (ioat->hw_ring) {
+ spdk_free(ioat->hw_ring);
+ }
+
+ if (ioat->comp_update) {
+ spdk_free((void *)ioat->comp_update);
+ ioat->comp_update = NULL;
+ }
+}
+
+uint32_t
+spdk_ioat_get_max_descriptors(struct spdk_ioat_chan *ioat)
+{
+ return 1 << ioat->ring_size_order;
+}
+
+static int
+ioat_channel_start(struct spdk_ioat_chan *ioat)
+{
+ uint8_t xfercap, version;
+ uint64_t status;
+ int i, num_descriptors;
+ uint64_t comp_update_bus_addr = 0;
+ uint64_t phys_addr;
+
+ if (ioat_map_pci_bar(ioat) != 0) {
+ SPDK_ERRLOG("ioat_map_pci_bar() failed\n");
+ return -1;
+ }
+
+ version = ioat->regs->cbver;
+ if (version < SPDK_IOAT_VER_3_0) {
+ SPDK_ERRLOG(" unsupported IOAT version %u.%u\n",
+ version >> 4, version & 0xF);
+ return -1;
+ }
+
+ /* Always support DMA copy */
+ ioat->dma_capabilities = SPDK_IOAT_ENGINE_COPY_SUPPORTED;
+ if (ioat->regs->dmacapability & SPDK_IOAT_DMACAP_BFILL) {
+ ioat->dma_capabilities |= SPDK_IOAT_ENGINE_FILL_SUPPORTED;
+ }
+ xfercap = ioat->regs->xfercap;
+
+ /* Only bits [4:0] are valid. */
+ xfercap &= 0x1f;
+ if (xfercap == 0) {
+ /* 0 means 4 GB max transfer size. */
+ ioat->max_xfer_size = 1ULL << 32;
+ } else if (xfercap < 12) {
+ /* XFERCAP must be at least 12 (4 KB) according to the spec. */
+ SPDK_ERRLOG("invalid XFERCAP value %u\n", xfercap);
+ return -1;
+ } else {
+ ioat->max_xfer_size = 1U << xfercap;
+ }
+
+ ioat->comp_update = spdk_zmalloc(sizeof(*ioat->comp_update), SPDK_IOAT_CHANCMP_ALIGN,
+ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (ioat->comp_update == NULL) {
+ return -1;
+ }
+
+ comp_update_bus_addr = spdk_vtophys((void *)ioat->comp_update, NULL);
+ if (comp_update_bus_addr == SPDK_VTOPHYS_ERROR) {
+ spdk_free((void *)ioat->comp_update);
+ return -1;
+ }
+
+ ioat->ring_size_order = IOAT_DEFAULT_ORDER;
+
+ num_descriptors = 1 << ioat->ring_size_order;
+
+ ioat->ring = calloc(num_descriptors, sizeof(struct ioat_descriptor));
+ if (!ioat->ring) {
+ return -1;
+ }
+
+ ioat->hw_ring = spdk_zmalloc(num_descriptors * sizeof(union spdk_ioat_hw_desc), 64,
+ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ioat->hw_ring) {
+ return -1;
+ }
+
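+ /* Chain the hardware descriptors into a circular ring: each
+ * descriptor's 'next' field points at the physical address of the
+ * following descriptor, and index i - 1 wraps to the last slot when
+ * i == 0.
+ */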
+ for (i = 0; i < num_descriptors; i++) {
+ phys_addr = spdk_vtophys(&ioat->hw_ring[i], NULL);
+ if (phys_addr == SPDK_VTOPHYS_ERROR) {
+ SPDK_ERRLOG("Failed to translate descriptor %u to physical address\n", i);
+ return -1;
+ }
+
+ ioat->ring[i].phys_addr = phys_addr;
+ ioat->hw_ring[ioat_get_ring_index(ioat, i - 1)].generic.next = phys_addr;
+ }
+
+ ioat->head = 0;
+ ioat->tail = 0;
+ ioat->last_seen = 0;
+
+ ioat_reset_hw(ioat);
+
+ ioat->regs->chanctrl = SPDK_IOAT_CHANCTRL_ANY_ERR_ABORT_EN;
+ ioat_write_chancmp(ioat, comp_update_bus_addr);
+ ioat_write_chainaddr(ioat, ioat->ring[0].phys_addr);
+
+ ioat_prep_null(ioat);
+ spdk_ioat_flush(ioat);
+
+ i = 100;
+ while (i-- > 0) {
+ spdk_delay_us(100);
+ status = ioat_get_chansts(ioat);
+ if (is_ioat_idle(status)) {
+ break;
+ }
+ }
+
+ if (is_ioat_idle(status)) {
+ ioat_process_channel_events(ioat);
+ } else {
+ SPDK_ERRLOG("could not start channel: status = %p\n error = %#x\n",
+ (void *)status, ioat->regs->chanerr);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Caller must hold g_ioat_driver.lock */
+static struct spdk_ioat_chan *
+ioat_attach(struct spdk_pci_device *device)
+{
+ struct spdk_ioat_chan *ioat;
+ uint32_t cmd_reg;
+
+ ioat = calloc(1, sizeof(struct spdk_ioat_chan));
+ if (ioat == NULL) {
+ return NULL;
+ }
+
+ /* Enable PCI busmaster. */
+ spdk_pci_device_cfg_read32(device, &cmd_reg, 4);
+ cmd_reg |= 0x4;
+ spdk_pci_device_cfg_write32(device, cmd_reg, 4);
+
+ ioat->device = device;
+
+ if (ioat_channel_start(ioat) != 0) {
+ ioat_channel_destruct(ioat);
+ free(ioat);
+ return NULL;
+ }
+
+ return ioat;
+}
+
+struct ioat_enum_ctx {
+ spdk_ioat_probe_cb probe_cb;
+ spdk_ioat_attach_cb attach_cb;
+ void *cb_ctx;
+};
+
+/* This function must only be called while holding g_ioat_driver.lock */
+static int
+ioat_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
+{
+ struct ioat_enum_ctx *enum_ctx = ctx;
+ struct spdk_ioat_chan *ioat;
+
+ /* Verify that this device is not already attached */
+ TAILQ_FOREACH(ioat, &g_ioat_driver.attached_chans, tailq) {
+ /*
+ * NOTE: This assumes that the PCI abstraction layer will use the same device handle
+ * across enumerations; we could compare by BDF instead if this is not true.
+ */
+ if (pci_dev == ioat->device) {
+ return 0;
+ }
+ }
+
+ if (enum_ctx->probe_cb(enum_ctx->cb_ctx, pci_dev)) {
+ /*
+ * Since I/OAT init is relatively quick, just perform the full init during probing.
+ * If this turns out to be a bottleneck later, this can be changed to work like
+ * NVMe with a list of devices to initialize in parallel.
+ */
+ ioat = ioat_attach(pci_dev);
+ if (ioat == NULL) {
+ SPDK_ERRLOG("ioat_attach() failed\n");
+ return -1;
+ }
+
+ TAILQ_INSERT_TAIL(&g_ioat_driver.attached_chans, ioat, tailq);
+
+ enum_ctx->attach_cb(enum_ctx->cb_ctx, pci_dev, ioat);
+ }
+
+ return 0;
+}
+
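+/*
+ * A minimal sketch of how a caller might drive probe/attach (the callback
+ * names below are illustrative, not part of this library):
+ *
+ *   static bool my_probe_cb(void *cb_ctx, struct spdk_pci_device *pci_dev)
+ *   {
+ *           return true;    // claim every I/OAT channel found
+ *   }
+ *
+ *   static void my_attach_cb(void *cb_ctx, struct spdk_pci_device *pci_dev,
+ *                            struct spdk_ioat_chan *chan)
+ *   {
+ *           // save chan; later use spdk_ioat_submit_copy() and
+ *           // spdk_ioat_process_events() on it
+ *   }
+ *
+ *   spdk_ioat_probe(NULL, my_probe_cb, my_attach_cb);
+ */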
+int
+spdk_ioat_probe(void *cb_ctx, spdk_ioat_probe_cb probe_cb, spdk_ioat_attach_cb attach_cb)
+{
+ int rc;
+ struct ioat_enum_ctx enum_ctx;
+
+ pthread_mutex_lock(&g_ioat_driver.lock);
+
+ enum_ctx.probe_cb = probe_cb;
+ enum_ctx.attach_cb = attach_cb;
+ enum_ctx.cb_ctx = cb_ctx;
+
+ rc = spdk_pci_enumerate(spdk_pci_ioat_get_driver(), ioat_enum_cb, &enum_ctx);
+
+ pthread_mutex_unlock(&g_ioat_driver.lock);
+
+ return rc;
+}
+
+void
+spdk_ioat_detach(struct spdk_ioat_chan *ioat)
+{
+ struct ioat_driver *driver = &g_ioat_driver;
+
+ /* The channel must no longer be in use by any thread
+ * when spdk_ioat_detach() is called.
+ */
+ pthread_mutex_lock(&driver->lock);
+ TAILQ_REMOVE(&driver->attached_chans, ioat, tailq);
+ pthread_mutex_unlock(&driver->lock);
+
+ ioat_channel_destruct(ioat);
+ free(ioat);
+}
+
+int
+spdk_ioat_build_copy(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn,
+ void *dst, const void *src, uint64_t nbytes)
+{
+ struct ioat_descriptor *last_desc;
+ uint64_t remaining, op_size;
+ uint64_t vdst, vsrc;
+ uint64_t vdst_page, vsrc_page;
+ uint64_t pdst_page, psrc_page;
+ uint32_t orig_head;
+
+ if (!ioat) {
+ return -EINVAL;
+ }
+
+ orig_head = ioat->head;
+
+ vdst = (uint64_t)dst;
+ vsrc = (uint64_t)src;
+ vdst_page = vsrc_page = 0;
+ pdst_page = psrc_page = SPDK_VTOPHYS_ERROR;
+
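+ /* Split the copy so that no single descriptor crosses a 2 MB huge-page
+ * boundary (the virtual-to-physical translation is only valid within
+ * one page) or exceeds the channel's max transfer size.
+ */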
+ remaining = nbytes;
+ while (remaining) {
+ if (_2MB_PAGE(vsrc) != vsrc_page) {
+ vsrc_page = _2MB_PAGE(vsrc);
+ psrc_page = spdk_vtophys((void *)vsrc_page, NULL);
+ }
+
+ if (_2MB_PAGE(vdst) != vdst_page) {
+ vdst_page = _2MB_PAGE(vdst);
+ pdst_page = spdk_vtophys((void *)vdst_page, NULL);
+ }
+ op_size = remaining;
+ op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vsrc)));
+ op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vdst)));
+ op_size = spdk_min(op_size, ioat->max_xfer_size);
+ remaining -= op_size;
+
+ last_desc = ioat_prep_copy(ioat,
+ pdst_page + _2MB_OFFSET(vdst),
+ psrc_page + _2MB_OFFSET(vsrc),
+ op_size);
+
+ if (remaining == 0 || last_desc == NULL) {
+ break;
+ }
+
+ vsrc += op_size;
+ vdst += op_size;
+
+ }
+ /* Issue null descriptor for null transfer */
+ if (nbytes == 0) {
+ last_desc = ioat_prep_null(ioat);
+ }
+
+ if (last_desc) {
+ last_desc->callback_fn = cb_fn;
+ last_desc->callback_arg = cb_arg;
+ } else {
+ /*
+ * Ran out of descriptors in the ring - reset head to leave things as they were
+ * in case we managed to fill out any descriptors.
+ */
+ ioat->head = orig_head;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int
+spdk_ioat_submit_copy(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn,
+ void *dst, const void *src, uint64_t nbytes)
+{
+ int rc;
+
+ rc = spdk_ioat_build_copy(ioat, cb_arg, cb_fn, dst, src, nbytes);
+ if (rc != 0) {
+ return rc;
+ }
+
+ spdk_ioat_flush(ioat);
+ return 0;
+}
+
+int
+spdk_ioat_build_fill(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn,
+ void *dst, uint64_t fill_pattern, uint64_t nbytes)
+{
+ struct ioat_descriptor *last_desc = NULL;
+ uint64_t remaining, op_size;
+ uint64_t vdst;
+ uint32_t orig_head;
+
+ if (!ioat) {
+ return -EINVAL;
+ }
+
+ if (!(ioat->dma_capabilities & SPDK_IOAT_ENGINE_FILL_SUPPORTED)) {
+ SPDK_ERRLOG("Channel does not support memory fill\n");
+ return -1;
+ }
+
+ orig_head = ioat->head;
+
+ vdst = (uint64_t)dst;
+ remaining = nbytes;
+
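+ /* As in the copy path, split the fill at 2 MB page boundaries and at
+ * the channel's max transfer size.
+ */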
+ while (remaining) {
+ op_size = remaining;
+ op_size = spdk_min(op_size, (VALUE_2MB - _2MB_OFFSET(vdst)));
+ op_size = spdk_min(op_size, ioat->max_xfer_size);
+ remaining -= op_size;
+
+ last_desc = ioat_prep_fill(ioat,
+ spdk_vtophys((void *)vdst, NULL),
+ fill_pattern,
+ op_size);
+
+ if (remaining == 0 || last_desc == NULL) {
+ break;
+ }
+
+ vdst += op_size;
+ }
+
+ if (last_desc) {
+ last_desc->callback_fn = cb_fn;
+ last_desc->callback_arg = cb_arg;
+ } else {
+ /*
+ * Ran out of descriptors in the ring - reset head to leave things as they were
+ * in case we managed to fill out any descriptors.
+ */
+ ioat->head = orig_head;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int
+spdk_ioat_submit_fill(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn,
+ void *dst, uint64_t fill_pattern, uint64_t nbytes)
+{
+ int rc;
+
+ rc = spdk_ioat_build_fill(ioat, cb_arg, cb_fn, dst, fill_pattern, nbytes);
+ if (rc != 0) {
+ return rc;
+ }
+
+ spdk_ioat_flush(ioat);
+ return 0;
+}
+
+uint32_t
+spdk_ioat_get_dma_capabilities(struct spdk_ioat_chan *ioat)
+{
+ if (!ioat) {
+ return 0;
+ }
+ return ioat->dma_capabilities;
+}
+
+int
+spdk_ioat_process_events(struct spdk_ioat_chan *ioat)
+{
+ return ioat_process_channel_events(ioat);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("ioat", SPDK_LOG_IOAT)
diff --git a/src/spdk/lib/ioat/ioat_internal.h b/src/spdk/lib/ioat/ioat_internal.h
new file mode 100644
index 000000000..19593bb00
--- /dev/null
+++ b/src/spdk/lib/ioat/ioat_internal.h
@@ -0,0 +1,100 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __IOAT_INTERNAL_H__
+#define __IOAT_INTERNAL_H__
+
+#include "spdk/stdinc.h"
+
+#include "spdk/ioat.h"
+#include "spdk/ioat_spec.h"
+#include "spdk/queue.h"
+#include "spdk/mmio.h"
+
+/* Allocate 1 << 15 (32K) descriptors per channel by default. */
+#define IOAT_DEFAULT_ORDER 15
+
+struct ioat_descriptor {
+ uint64_t phys_addr;
+ spdk_ioat_req_cb callback_fn;
+ void *callback_arg;
+};
+
+/* One of these per allocated PCI device. */
+struct spdk_ioat_chan {
+ /* Opaque handle to upper layer */
+ struct spdk_pci_device *device;
+ uint64_t max_xfer_size;
+ volatile struct spdk_ioat_registers *regs;
+
+ volatile uint64_t *comp_update;
+
+ uint32_t head;
+ uint32_t tail;
+
+ uint32_t ring_size_order;
+ uint64_t last_seen;
+
+ struct ioat_descriptor *ring;
+ union spdk_ioat_hw_desc *hw_ring;
+ uint32_t dma_capabilities;
+
+ /* tailq entry for attached_chans */
+ TAILQ_ENTRY(spdk_ioat_chan) tailq;
+};
+
+static inline uint32_t
+is_ioat_active(uint64_t status)
+{
+ return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_ACTIVE;
+}
+
+static inline uint32_t
+is_ioat_idle(uint64_t status)
+{
+ return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_IDLE;
+}
+
+static inline uint32_t
+is_ioat_halted(uint64_t status)
+{
+ return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_HALTED;
+}
+
+static inline uint32_t
+is_ioat_suspended(uint64_t status)
+{
+ return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_SUSPENDED;
+}
+
+#endif /* __IOAT_INTERNAL_H__ */
diff --git a/src/spdk/lib/ioat/spdk_ioat.map b/src/spdk/lib/ioat/spdk_ioat.map
new file mode 100644
index 000000000..f467da817
--- /dev/null
+++ b/src/spdk/lib/ioat/spdk_ioat.map
@@ -0,0 +1,17 @@
+{
+ global:
+
+ # public functions
+ spdk_ioat_probe;
+ spdk_ioat_detach;
+ spdk_ioat_build_copy;
+ spdk_ioat_submit_copy;
+ spdk_ioat_build_fill;
+ spdk_ioat_submit_fill;
+ spdk_ioat_flush;
+ spdk_ioat_process_events;
+ spdk_ioat_get_dma_capabilities;
+ spdk_ioat_get_max_descriptors;
+
+ local: *;
+};
diff --git a/src/spdk/lib/iscsi/Makefile b/src/spdk/lib/iscsi/Makefile
new file mode 100644
index 000000000..2c663d880
--- /dev/null
+++ b/src/spdk/lib/iscsi/Makefile
@@ -0,0 +1,50 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+CFLAGS += -I$(SPDK_ROOT_DIR)/lib
+C_SRCS = conn.c \
+ init_grp.c iscsi.c md5.c param.c portal_grp.c \
+ tgt_node.c iscsi_subsystem.c \
+ iscsi_rpc.c task.c
+LIBNAME = iscsi
+LOCAL_SYS_LIBS = -lcrypto
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_iscsi.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/iscsi/conn.c b/src/spdk/lib/iscsi/conn.c
new file mode 100644
index 000000000..4c7a54fcf
--- /dev/null
+++ b/src/spdk/lib/iscsi/conn.c
@@ -0,0 +1,1714 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/event.h"
+#include "spdk/likely.h"
+#include "spdk/thread.h"
+#include "spdk/queue.h"
+#include "spdk/trace.h"
+#include "spdk/net.h"
+#include "spdk/sock.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+#include "iscsi/task.h"
+#include "iscsi/conn.h"
+#include "iscsi/tgt_node.h"
+#include "iscsi/portal_grp.h"
+
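+/* Store the 32-bit CRC32C digest into BUF least-significant byte first. */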
+#define MAKE_DIGEST_WORD(BUF, CRC32C) \
+ ( ((*((uint8_t *)(BUF)+0)) = (uint8_t)((uint32_t)(CRC32C) >> 0)), \
+ ((*((uint8_t *)(BUF)+1)) = (uint8_t)((uint32_t)(CRC32C) >> 8)), \
+ ((*((uint8_t *)(BUF)+2)) = (uint8_t)((uint32_t)(CRC32C) >> 16)), \
+ ((*((uint8_t *)(BUF)+3)) = (uint8_t)((uint32_t)(CRC32C) >> 24)))
+
+#define SPDK_ISCSI_CONNECTION_MEMSET(conn) \
+ memset(&(conn)->portal, 0, sizeof(*(conn)) - \
+ offsetof(struct spdk_iscsi_conn, portal));
+
+struct spdk_iscsi_conn *g_conns_array = MAP_FAILED;
+static int g_conns_array_fd = -1;
+static char g_shm_name[64];
+
+static TAILQ_HEAD(, spdk_iscsi_conn) g_free_conns = TAILQ_HEAD_INITIALIZER(g_free_conns);
+static TAILQ_HEAD(, spdk_iscsi_conn) g_active_conns = TAILQ_HEAD_INITIALIZER(g_active_conns);
+
+static pthread_mutex_t g_conns_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static struct spdk_poller *g_shutdown_timer = NULL;
+
+static void iscsi_conn_sock_cb(void *arg, struct spdk_sock_group *group,
+ struct spdk_sock *sock);
+
+static struct spdk_iscsi_conn *
+allocate_conn(void)
+{
+ struct spdk_iscsi_conn *conn;
+
+ pthread_mutex_lock(&g_conns_mutex);
+ conn = TAILQ_FIRST(&g_free_conns);
+ if (conn != NULL) {
+ assert(!conn->is_valid);
+ TAILQ_REMOVE(&g_free_conns, conn, conn_link);
+ SPDK_ISCSI_CONNECTION_MEMSET(conn);
+ conn->is_valid = 1;
+
+ TAILQ_INSERT_TAIL(&g_active_conns, conn, conn_link);
+ }
+ pthread_mutex_unlock(&g_conns_mutex);
+
+ return conn;
+}
+
+static void
+_free_conn(struct spdk_iscsi_conn *conn)
+{
+ TAILQ_REMOVE(&g_active_conns, conn, conn_link);
+
+ memset(conn->portal_host, 0, sizeof(conn->portal_host));
+ memset(conn->portal_port, 0, sizeof(conn->portal_port));
+ conn->is_valid = 0;
+
+ TAILQ_INSERT_TAIL(&g_free_conns, conn, conn_link);
+}
+
+static void
+free_conn(struct spdk_iscsi_conn *conn)
+{
+ pthread_mutex_lock(&g_conns_mutex);
+ _free_conn(conn);
+ pthread_mutex_unlock(&g_conns_mutex);
+}
+
+static void
+_iscsi_conns_cleanup(void)
+{
+ if (g_conns_array != MAP_FAILED) {
+ munmap(g_conns_array, sizeof(struct spdk_iscsi_conn) *
+ MAX_ISCSI_CONNECTIONS);
+ g_conns_array = MAP_FAILED;
+ }
+
+ if (g_conns_array_fd >= 0) {
+ close(g_conns_array_fd);
+ g_conns_array_fd = -1;
+ shm_unlink(g_shm_name);
+ }
+}
+
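+/*
+ * The connection array lives in a POSIX shared-memory object (named per
+ * shm id) so it can be inspected from outside the target process, e.g. by
+ * monitoring tools.
+ */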
+int initialize_iscsi_conns(void)
+{
+ size_t conns_size = sizeof(struct spdk_iscsi_conn) * MAX_ISCSI_CONNECTIONS;
+ uint32_t i;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_init\n");
+
+ snprintf(g_shm_name, sizeof(g_shm_name), "/spdk_iscsi_conns.%d", spdk_app_get_shm_id());
+ g_conns_array_fd = shm_open(g_shm_name, O_RDWR | O_CREAT, 0600);
+ if (g_conns_array_fd < 0) {
+ SPDK_ERRLOG("could not shm_open %s\n", g_shm_name);
+ goto err;
+ }
+
+ if (ftruncate(g_conns_array_fd, conns_size) != 0) {
+ SPDK_ERRLOG("could not ftruncate\n");
+ goto err;
+ }
+ g_conns_array = mmap(0, conns_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ g_conns_array_fd, 0);
+
+ if (g_conns_array == MAP_FAILED) {
+ SPDK_ERRLOG("could not mmap cons array file %s (%d)\n", g_shm_name, errno);
+ goto err;
+ }
+
+ memset(g_conns_array, 0, conns_size);
+
+ for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) {
+ g_conns_array[i].id = i;
+ TAILQ_INSERT_TAIL(&g_free_conns, &g_conns_array[i], conn_link);
+ }
+
+ return 0;
+
+err:
+ _iscsi_conns_cleanup();
+
+ return -1;
+}
+
+static void
+iscsi_poll_group_add_conn(struct spdk_iscsi_poll_group *pg, struct spdk_iscsi_conn *conn)
+{
+ int rc;
+
+ rc = spdk_sock_group_add_sock(pg->sock_group, conn->sock, iscsi_conn_sock_cb, conn);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to add sock=%p of conn=%p\n", conn->sock, conn);
+ return;
+ }
+
+ conn->is_stopped = false;
+ STAILQ_INSERT_TAIL(&pg->connections, conn, pg_link);
+}
+
+static void
+iscsi_poll_group_remove_conn(struct spdk_iscsi_poll_group *pg, struct spdk_iscsi_conn *conn)
+{
+ int rc;
+
+ assert(conn->sock != NULL);
+ rc = spdk_sock_group_remove_sock(pg->sock_group, conn->sock);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to remove sock=%p of conn=%p\n", conn->sock, conn);
+ }
+
+ conn->is_stopped = true;
+ STAILQ_REMOVE(&pg->connections, conn, spdk_iscsi_conn, pg_link);
+}
+
+static void
+iscsi_conn_start(void *ctx)
+{
+ struct spdk_iscsi_conn *conn = ctx;
+
+ iscsi_poll_group_add_conn(conn->pg, conn);
+}
+
+int
+iscsi_conn_construct(struct spdk_iscsi_portal *portal,
+ struct spdk_sock *sock)
+{
+ struct spdk_iscsi_poll_group *pg;
+ struct spdk_iscsi_conn *conn;
+ int i, rc;
+
+ conn = allocate_conn();
+ if (conn == NULL) {
+ SPDK_ERRLOG("Could not allocate connection.\n");
+ return -1;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ conn->timeout = g_iscsi.timeout * spdk_get_ticks_hz(); /* seconds to TSC */
+ conn->nopininterval = g_iscsi.nopininterval;
+ conn->nopininterval *= spdk_get_ticks_hz(); /* seconds to TSC */
+ conn->nop_outstanding = false;
+ conn->data_out_cnt = 0;
+ conn->data_in_cnt = 0;
+ conn->disable_chap = portal->group->disable_chap;
+ conn->require_chap = portal->group->require_chap;
+ conn->mutual_chap = portal->group->mutual_chap;
+ conn->chap_group = portal->group->chap_group;
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ conn->MaxRecvDataSegmentLength = 8192; /* RFC3720(12.12) */
+
+ conn->portal = portal;
+ conn->pg_tag = portal->group->tag;
+ memcpy(conn->portal_host, portal->host, strlen(portal->host));
+ memcpy(conn->portal_port, portal->port, strlen(portal->port));
+ conn->sock = sock;
+
+ conn->state = ISCSI_CONN_STATE_INVALID;
+ conn->login_phase = ISCSI_SECURITY_NEGOTIATION_PHASE;
+ conn->ttt = 0;
+
+ conn->partial_text_parameter = NULL;
+
+ for (i = 0; i < MAX_CONNECTION_PARAMS; i++) {
+ conn->conn_param_state_negotiated[i] = false;
+ }
+
+ for (i = 0; i < MAX_SESSION_PARAMS; i++) {
+ conn->sess_param_state_negotiated[i] = false;
+ }
+
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY;
+
+ TAILQ_INIT(&conn->write_pdu_list);
+ TAILQ_INIT(&conn->snack_pdu_list);
+ TAILQ_INIT(&conn->queued_r2t_tasks);
+ TAILQ_INIT(&conn->active_r2t_tasks);
+ TAILQ_INIT(&conn->queued_datain_tasks);
+ memset(&conn->luns, 0, sizeof(conn->luns));
+
+ rc = spdk_sock_getaddr(sock, conn->target_addr, sizeof conn->target_addr, NULL,
+ conn->initiator_addr, sizeof conn->initiator_addr, NULL);
+ if (rc < 0) {
+ SPDK_ERRLOG("spdk_sock_getaddr() failed\n");
+ goto error_return;
+ }
+
+ /* set low water mark */
+ rc = spdk_sock_set_recvlowat(conn->sock, 1);
+ if (rc != 0) {
+ SPDK_ERRLOG("spdk_sock_set_recvlowat() failed\n");
+ goto error_return;
+ }
+
+ /* set default params */
+ rc = iscsi_conn_params_init(&conn->params);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_conn_params_init() failed\n");
+ goto error_return;
+ }
+ conn->logout_request_timer = NULL;
+ conn->logout_timer = NULL;
+ conn->shutdown_timer = NULL;
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Launching connection on acceptor thread\n");
+ conn->pending_task_cnt = 0;
+
+ /* Get the first poll group. */
+ pg = TAILQ_FIRST(&g_iscsi.poll_group_head);
+ if (pg == NULL) {
+ SPDK_ERRLOG("There is no poll group.\n");
+ assert(false);
+ goto error_return;
+ }
+
+ conn->pg = pg;
+ spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(pg)),
+ iscsi_conn_start, conn);
+ return 0;
+
+error_return:
+ iscsi_param_free(conn->params);
+ free_conn(conn);
+ return -1;
+}
+
+void
+iscsi_conn_free_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ iscsi_conn_xfer_complete_cb cb_fn;
+ void *cb_arg;
+
+ cb_fn = pdu->cb_fn;
+ cb_arg = pdu->cb_arg;
+
+ assert(cb_fn != NULL);
+ pdu->cb_fn = NULL;
+
+ if (pdu->task) {
+ iscsi_task_put(pdu->task);
+ }
+ iscsi_put_pdu(pdu);
+
+ cb_fn(cb_arg);
+}
+
+static int
+iscsi_conn_free_tasks(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_pdu *pdu, *tmp_pdu;
+ struct spdk_iscsi_task *iscsi_task, *tmp_iscsi_task;
+
+ TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, tmp_pdu) {
+ TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq);
+ iscsi_conn_free_pdu(conn, pdu);
+ }
+
+ TAILQ_FOREACH_SAFE(iscsi_task, &conn->queued_datain_tasks, link, tmp_iscsi_task) {
+ if (!iscsi_task->is_queued) {
+ TAILQ_REMOVE(&conn->queued_datain_tasks, iscsi_task, link);
+ iscsi_task_put(iscsi_task);
+ }
+ }
+
+ /* conn->write_pdu_list must be processed last. iscsi_conn_free_pdu() may call
+ * iscsi_conn_handle_queued_datain_tasks(), which walks conn->queued_datain_tasks
+ * and may queue additional PDUs onto conn->write_pdu_list. Hence, by the time we
+ * get here, we have to ensure there is no associated task left in
+ * conn->queued_datain_tasks.
+ */
+ TAILQ_FOREACH_SAFE(pdu, &conn->write_pdu_list, tailq, tmp_pdu) {
+ TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq);
+ iscsi_conn_free_pdu(conn, pdu);
+ }
+
+ if (conn->pending_task_cnt) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+iscsi_conn_cleanup_backend(struct spdk_iscsi_conn *conn)
+{
+ int rc;
+ struct spdk_iscsi_tgt_node *target;
+
+ if (conn->sess->connections > 1) {
+ /* connection specific cleanup */
+ } else if (!g_iscsi.AllowDuplicateIsid) {
+ /* clean up all tasks to all LUNs for session */
+ target = conn->sess->target;
+ if (target != NULL) {
+ rc = iscsi_tgt_node_cleanup_luns(conn, target);
+ if (rc < 0) {
+ SPDK_ERRLOG("target abort failed\n");
+ }
+ }
+ }
+}
+
+static void
+iscsi_conn_free(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_sess *sess;
+ int idx;
+ uint32_t i;
+
+ pthread_mutex_lock(&g_conns_mutex);
+
+ if (conn->sess == NULL) {
+ goto end;
+ }
+
+ idx = -1;
+ sess = conn->sess;
+ conn->sess = NULL;
+
+ for (i = 0; i < sess->connections; i++) {
+ if (sess->conns[i] == conn) {
+ idx = i;
+ break;
+ }
+ }
+
+ if (idx < 0) {
+ SPDK_ERRLOG("remove conn not found\n");
+ } else {
+ for (i = idx; i < sess->connections - 1; i++) {
+ sess->conns[i] = sess->conns[i + 1];
+ }
+ sess->conns[sess->connections - 1] = NULL;
+ sess->connections--;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Terminating connections(tsih %d): %d\n",
+ sess->tsih, sess->connections);
+
+ if (sess->connections == 0) {
+ /* cleanup last connection. iscsi_free_sess() releases the session,
+ * so sess must not be dereferenced after this call.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "cleanup last conn free sess\n");
+ iscsi_free_sess(sess);
+ }
+ }
+
+end:
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "cleanup free conn\n");
+ iscsi_param_free(conn->params);
+ _free_conn(conn);
+
+ pthread_mutex_unlock(&g_conns_mutex);
+}
+
+static void
+iscsi_conn_close_lun(struct spdk_iscsi_conn *conn, int lun_id)
+{
+ struct spdk_iscsi_lun *iscsi_lun;
+
+ iscsi_lun = conn->luns[lun_id];
+ if (iscsi_lun == NULL) {
+ return;
+ }
+
+ spdk_scsi_lun_free_io_channel(iscsi_lun->desc);
+ spdk_scsi_lun_close(iscsi_lun->desc);
+ spdk_poller_unregister(&iscsi_lun->remove_poller);
+ free(iscsi_lun);
+
+ conn->luns[lun_id] = NULL;
+}
+
+static void
+iscsi_conn_close_luns(struct spdk_iscsi_conn *conn)
+{
+ int i;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ iscsi_conn_close_lun(conn, i);
+ }
+}
+
+static bool
+iscsi_conn_check_tasks_for_lun(struct spdk_iscsi_conn *conn,
+ struct spdk_scsi_lun *lun)
+{
+ struct spdk_iscsi_pdu *pdu, *tmp_pdu;
+ struct spdk_iscsi_task *task;
+
+ assert(lun != NULL);
+
+ /* We can remove deferred PDUs safely because they are already flushed. */
+ TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, tmp_pdu) {
+ if (lun == pdu->task->scsi.lun) {
+ TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq);
+ iscsi_conn_free_pdu(conn, pdu);
+ }
+ }
+
+ TAILQ_FOREACH(task, &conn->queued_datain_tasks, link) {
+ if (lun == task->scsi.lun) {
+ return false;
+ }
+ }
+
+ /* This check loop works even when the connection exits in the middle of a LUN
+ * hotplug, because all PDUs in write_pdu_list are removed in iscsi_conn_free_tasks().
+ */
+ TAILQ_FOREACH(pdu, &conn->write_pdu_list, tailq) {
+ if (pdu->task && lun == pdu->task->scsi.lun) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
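+/* Poller callback armed on LUN hot-remove: wait until no task references the
+ * LUN any longer, then close it on this connection.
+ */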
+static int
+iscsi_conn_remove_lun(void *ctx)
+{
+ struct spdk_iscsi_lun *iscsi_lun = ctx;
+ struct spdk_iscsi_conn *conn = iscsi_lun->conn;
+ struct spdk_scsi_lun *lun = iscsi_lun->lun;
+ int lun_id = spdk_scsi_lun_get_id(lun);
+
+ if (!iscsi_conn_check_tasks_for_lun(conn, lun)) {
+ return SPDK_POLLER_BUSY;
+ }
+ iscsi_conn_close_lun(conn, lun_id);
+ return SPDK_POLLER_BUSY;
+}
+
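+/* Runs on the connection's poll group thread. Aborts outstanding transfer
+ * tasks for the hot-removed LUN and registers a poller that closes the LUN
+ * once the remaining tasks have drained.
+ */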
+static void
+_iscsi_conn_hotremove_lun(void *ctx)
+{
+ struct spdk_iscsi_lun *iscsi_lun = ctx;
+ struct spdk_iscsi_conn *conn = iscsi_lun->conn;
+ struct spdk_scsi_lun *lun = iscsi_lun->lun;
+
+ assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) ==
+ spdk_get_thread());
+
+ /* If the connection has already started exiting, just return */
+ if (conn->state >= ISCSI_CONN_STATE_EXITING) {
+ return;
+ }
+
+ iscsi_clear_all_transfer_task(conn, lun, NULL);
+
+ iscsi_lun->remove_poller = SPDK_POLLER_REGISTER(iscsi_conn_remove_lun, iscsi_lun,
+ 1000);
+}
+
+static void
+iscsi_conn_hotremove_lun(struct spdk_scsi_lun *lun, void *remove_ctx)
+{
+ struct spdk_iscsi_conn *conn = remove_ctx;
+ int lun_id = spdk_scsi_lun_get_id(lun);
+ struct spdk_iscsi_lun *iscsi_lun;
+
+ iscsi_lun = conn->luns[lun_id];
+ if (iscsi_lun == NULL) {
+ SPDK_ERRLOG("LUN hotplug was notified to the unallocated LUN %d.\n", lun_id);
+ return;
+ }
+
+ spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)),
+ _iscsi_conn_hotremove_lun, iscsi_lun);
+}
+
+static int
+iscsi_conn_open_lun(struct spdk_iscsi_conn *conn, int lun_id,
+ struct spdk_scsi_lun *lun)
+{
+ int rc;
+ struct spdk_iscsi_lun *iscsi_lun;
+
+ iscsi_lun = calloc(1, sizeof(*iscsi_lun));
+ if (iscsi_lun == NULL) {
+ return -ENOMEM;
+ }
+
+ iscsi_lun->conn = conn;
+ iscsi_lun->lun = lun;
+
+ rc = spdk_scsi_lun_open(lun, iscsi_conn_hotremove_lun, conn, &iscsi_lun->desc);
+ if (rc != 0) {
+ free(iscsi_lun);
+ return rc;
+ }
+
+ rc = spdk_scsi_lun_allocate_io_channel(iscsi_lun->desc);
+ if (rc != 0) {
+ spdk_scsi_lun_close(iscsi_lun->desc);
+ free(iscsi_lun);
+ return rc;
+ }
+
+ conn->luns[lun_id] = iscsi_lun;
+
+ return 0;
+}
+
+static void
+iscsi_conn_open_luns(struct spdk_iscsi_conn *conn)
+{
+ int i, rc;
+ struct spdk_scsi_lun *lun;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ lun = spdk_scsi_dev_get_lun(conn->dev, i);
+ if (lun == NULL) {
+ continue;
+ }
+
+ rc = iscsi_conn_open_lun(conn, i, lun);
+ if (rc != 0) {
+ goto error;
+ }
+ }
+
+ return;
+
+error:
+ iscsi_conn_close_luns(conn);
+}
+
+/**
+ * This function will stop executing the specified connection.
+ */
+static void
+iscsi_conn_stop(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_tgt_node *target;
+
+ assert(conn->state == ISCSI_CONN_STATE_EXITED);
+ assert(conn->data_in_cnt == 0);
+ assert(conn->data_out_cnt == 0);
+
+ if (conn->sess != NULL &&
+ conn->sess->session_type == SESSION_TYPE_NORMAL &&
+ conn->full_feature) {
+ target = conn->sess->target;
+ pthread_mutex_lock(&target->mutex);
+ target->num_active_conns--;
+ pthread_mutex_unlock(&target->mutex);
+
+ iscsi_conn_close_luns(conn);
+ }
+
+ assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) ==
+ spdk_get_thread());
+}
+
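+/* Poller callback: retry freeing the connection's tasks until none remain,
+ * then stop and free the connection.
+ */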
+static int
+_iscsi_conn_check_shutdown(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+ int rc;
+
+ rc = iscsi_conn_free_tasks(conn);
+ if (rc < 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ spdk_poller_unregister(&conn->shutdown_timer);
+
+ iscsi_conn_stop(conn);
+ iscsi_conn_free(conn);
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+_iscsi_conn_destruct(struct spdk_iscsi_conn *conn)
+{
+ int rc;
+
+ iscsi_poll_group_remove_conn(conn->pg, conn);
+ spdk_sock_close(&conn->sock);
+ iscsi_clear_all_transfer_task(conn, NULL, NULL);
+ spdk_poller_unregister(&conn->logout_request_timer);
+ spdk_poller_unregister(&conn->logout_timer);
+
+ rc = iscsi_conn_free_tasks(conn);
+ if (rc < 0) {
+ /* The connection cannot be freed yet. Check back later. */
+ conn->shutdown_timer = SPDK_POLLER_REGISTER(_iscsi_conn_check_shutdown, conn, 1000);
+ } else {
+ iscsi_conn_stop(conn);
+ iscsi_conn_free(conn);
+ }
+}
+
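+/* Poller callback: wait until the SCSI layer has no pending tasks for this
+ * initiator port, then tear down the connection.
+ */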
+static int
+_iscsi_conn_check_pending_tasks(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ if (conn->dev != NULL &&
+ spdk_scsi_dev_has_pending_tasks(conn->dev, conn->initiator_port)) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ spdk_poller_unregister(&conn->shutdown_timer);
+
+ _iscsi_conn_destruct(conn);
+
+ return SPDK_POLLER_BUSY;
+}
+
+void
+iscsi_conn_destruct(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_pdu *pdu;
+ struct spdk_iscsi_task *task;
+ int opcode;
+
+ /* If a connection is already in exited status, just return */
+ if (conn->state >= ISCSI_CONN_STATE_EXITED) {
+ return;
+ }
+
+ conn->state = ISCSI_CONN_STATE_EXITED;
+
+ /*
+ * Each connection pre-allocates its next PDU - make sure these get
+ * freed here.
+ */
+ pdu = conn->pdu_in_progress;
+ if (pdu) {
+ /* remove the task left in the PDU too. */
+ task = pdu->task;
+ if (task) {
+ opcode = pdu->bhs.opcode;
+ switch (opcode) {
+ case ISCSI_OP_SCSI:
+ case ISCSI_OP_SCSI_DATAOUT:
+ spdk_scsi_task_process_abort(&task->scsi);
+ iscsi_task_cpl(&task->scsi);
+ break;
+ default:
+ SPDK_ERRLOG("unexpected opcode %x\n", opcode);
+ iscsi_task_put(task);
+ break;
+ }
+ }
+ iscsi_put_pdu(pdu);
+ conn->pdu_in_progress = NULL;
+ }
+
+ if (conn->sess != NULL && conn->pending_task_cnt > 0) {
+ iscsi_conn_cleanup_backend(conn);
+ }
+
+ if (conn->dev != NULL &&
+ spdk_scsi_dev_has_pending_tasks(conn->dev, conn->initiator_port)) {
+ conn->shutdown_timer = SPDK_POLLER_REGISTER(_iscsi_conn_check_pending_tasks, conn, 1000);
+ } else {
+ _iscsi_conn_destruct(conn);
+ }
+}
+
+int
+iscsi_get_active_conns(struct spdk_iscsi_tgt_node *target)
+{
+ struct spdk_iscsi_conn *conn;
+ int num = 0;
+
+ if (g_conns_array == MAP_FAILED) {
+ return 0;
+ }
+
+ pthread_mutex_lock(&g_conns_mutex);
+ TAILQ_FOREACH(conn, &g_active_conns, conn_link) {
+ if (target == NULL || conn->target == target) {
+ num++;
+ }
+ }
+ pthread_mutex_unlock(&g_conns_mutex);
+ return num;
+}
+
+static void
+iscsi_conn_check_shutdown_cb(void *arg1)
+{
+ _iscsi_conns_cleanup();
+ shutdown_iscsi_conns_done();
+}
+
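+/* Poller used during shutdown: once no active connections remain, finish the
+ * global connection cleanup on the current thread.
+ */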
+static int
+iscsi_conn_check_shutdown(void *arg)
+{
+ if (iscsi_get_active_conns(NULL) != 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ spdk_poller_unregister(&g_shutdown_timer);
+
+ spdk_thread_send_msg(spdk_get_thread(), iscsi_conn_check_shutdown_cb, NULL);
+
+ return SPDK_POLLER_BUSY;
+}
+
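+/* Build and send an asynchronous event PDU (AsyncEvent=1) that requests the
+ * initiator to log out, with Parameter3 set to ISCSI_LOGOUT_REQUEST_TIMEOUT.
+ */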
+static void
+iscsi_send_logout_request(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_bhs_async *rsph;
+
+ rsp_pdu = iscsi_get_pdu(conn);
+ assert(rsp_pdu != NULL);
+
+ rsph = (struct iscsi_bhs_async *)&rsp_pdu->bhs;
+ rsp_pdu->data = NULL;
+
+ rsph->opcode = ISCSI_OP_ASYNC;
+ to_be32(&rsph->ffffffff, 0xFFFFFFFF);
+ rsph->async_event = 1;
+ to_be16(&rsph->param3, ISCSI_LOGOUT_REQUEST_TIMEOUT);
+
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL);
+}
+
+static int
+logout_request_timeout(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ if (conn->state < ISCSI_CONN_STATE_EXITING) {
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+/* If the connection is running and logout is not requested yet, request logout
+ * to initiator and wait for the logout process to start.
+ */
+static void
+_iscsi_conn_request_logout(void *ctx)
+{
+ struct spdk_iscsi_conn *conn = ctx;
+
+ if (conn->state > ISCSI_CONN_STATE_RUNNING ||
+ conn->logout_request_timer != NULL) {
+ return;
+ }
+
+ iscsi_send_logout_request(conn);
+
+ conn->logout_request_timer = SPDK_POLLER_REGISTER(logout_request_timeout,
+ conn, ISCSI_LOGOUT_REQUEST_TIMEOUT * 1000000);
+}
+
+static void
+iscsi_conn_request_logout(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_thread *thread;
+
+ if (conn->state == ISCSI_CONN_STATE_INVALID) {
+ /* Move it to EXITING state if the connection is in login. */
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ } else if (conn->state == ISCSI_CONN_STATE_RUNNING &&
+ conn->logout_request_timer == NULL) {
+ thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg));
+ spdk_thread_send_msg(thread, _iscsi_conn_request_logout, conn);
+ }
+}
+
+void
+iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target)
+{
+ struct spdk_iscsi_conn *conn;
+
+ if (g_conns_array == MAP_FAILED) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_conns_mutex);
+ TAILQ_FOREACH(conn, &g_active_conns, conn_link) {
+ if (target == NULL || conn->target == target) {
+ iscsi_conn_request_logout(conn);
+ }
+ }
+ pthread_mutex_unlock(&g_conns_mutex);
+}
+
+void
+shutdown_iscsi_conns(void)
+{
+ iscsi_conns_request_logout(NULL);
+
+ g_shutdown_timer = SPDK_POLLER_REGISTER(iscsi_conn_check_shutdown, NULL, 1000);
+}
+
+/* Do not set conn->state if the connection has already started exiting.
+ * This ensures we do not move a connection from EXITED state back to EXITING.
+ */
+static void
+_iscsi_conn_drop(void *ctx)
+{
+ struct spdk_iscsi_conn *conn = ctx;
+
+ if (conn->state < ISCSI_CONN_STATE_EXITING) {
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ }
+}
+
+int
+iscsi_drop_conns(struct spdk_iscsi_conn *conn, const char *conn_match,
+ int drop_all)
+{
+ struct spdk_iscsi_conn *xconn;
+ const char *xconn_match;
+ struct spdk_thread *thread;
+ int num;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_drop_conns\n");
+
+ num = 0;
+ pthread_mutex_lock(&g_conns_mutex);
+ if (g_conns_array == MAP_FAILED) {
+ goto exit;
+ }
+
+ TAILQ_FOREACH(xconn, &g_active_conns, conn_link) {
+ if (xconn == conn) {
+ continue;
+ }
+
+ if (!drop_all && xconn->initiator_port == NULL) {
+ continue;
+ }
+
+ xconn_match =
+ drop_all ? xconn->initiator_name : spdk_scsi_port_get_name(xconn->initiator_port);
+
+ if (!strcasecmp(conn_match, xconn_match) &&
+ conn->target == xconn->target) {
+
+ if (num == 0) {
+ /*
+ * Only print this message before we report the
+ * first dropped connection.
+ */
+ SPDK_ERRLOG("drop old connections %s by %s\n",
+ conn->target->name, conn_match);
+ }
+
+ SPDK_ERRLOG("exiting conn by %s (%s)\n",
+ xconn_match, xconn->initiator_addr);
+ if (xconn->sess != NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TSIH=%u\n", xconn->sess->tsih);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TSIH=xx\n");
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CID=%u\n", xconn->cid);
+
+ thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(xconn->pg));
+ spdk_thread_send_msg(thread, _iscsi_conn_drop, xconn);
+
+ num++;
+ }
+ }
+
+exit:
+ pthread_mutex_unlock(&g_conns_mutex);
+
+ if (num != 0) {
+ SPDK_ERRLOG("exiting %d conns\n", num);
+ }
+
+ return 0;
+}
+
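+/* Abort the unsubmitted remainder of a queued Data-IN task and remove the
+ * primary task from the queue. Returns -1 if the connection has no Data-IN
+ * credit left and the abort must be retried later.
+ */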
+static int
+_iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task)
+{
+ struct spdk_iscsi_task *subtask;
+ uint32_t remaining_size;
+
+ if (conn->data_in_cnt >= MAX_LARGE_DATAIN_PER_CONNECTION) {
+ return -1;
+ }
+
+ assert(task->current_datain_offset <= task->scsi.transfer_len);
+ /* Stop split and abort read I/O for remaining data. */
+ if (task->current_datain_offset < task->scsi.transfer_len) {
+ remaining_size = task->scsi.transfer_len - task->current_datain_offset;
+ subtask = iscsi_task_get(conn, task, iscsi_task_cpl);
+ assert(subtask != NULL);
+ subtask->scsi.offset = task->current_datain_offset;
+ subtask->scsi.length = remaining_size;
+ spdk_scsi_task_set_data(&subtask->scsi, NULL, 0);
+ task->current_datain_offset += subtask->scsi.length;
+
+ subtask->scsi.transfer_len = subtask->scsi.length;
+ spdk_scsi_task_process_abort(&subtask->scsi);
+ iscsi_task_cpl(&subtask->scsi);
+ }
+
+ /* Remove the primary task from the list because all subtasks are submitted
+ * or aborted.
+ */
+ assert(task->current_datain_offset == task->scsi.transfer_len);
+ TAILQ_REMOVE(&conn->queued_datain_tasks, task, link);
+ return 0;
+}
+
+int
+iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn,
+ uint32_t ref_task_tag)
+{
+ struct spdk_iscsi_task *task;
+
+ TAILQ_FOREACH(task, &conn->queued_datain_tasks, link) {
+ if (task->tag == ref_task_tag) {
+ return _iscsi_conn_abort_queued_datain_task(conn, task);
+ }
+ }
+
+ return 0;
+}
+
+int
+iscsi_conn_abort_queued_datain_tasks(struct spdk_iscsi_conn *conn,
+ struct spdk_scsi_lun *lun,
+ struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_task *task, *task_tmp;
+ struct spdk_iscsi_pdu *pdu_tmp;
+ int rc;
+
+ TAILQ_FOREACH_SAFE(task, &conn->queued_datain_tasks, link, task_tmp) {
+ pdu_tmp = iscsi_task_get_pdu(task);
+ if ((lun == NULL || lun == task->scsi.lun) &&
+ (pdu == NULL || (spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn)))) {
+ rc = _iscsi_conn_abort_queued_datain_task(conn, task);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+ }
+
+ return 0;
+}
+
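+/* Submit queued Data-IN tasks as read subtasks of at most
+ * SPDK_BDEV_LARGE_BUF_MAX_SIZE bytes while the connection has Data-IN credit.
+ * If the LUN has been removed, the remaining data is completed as a null-LUN
+ * read instead.
+ */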
+int
+iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_task *task;
+
+ while (!TAILQ_EMPTY(&conn->queued_datain_tasks) &&
+ conn->data_in_cnt < MAX_LARGE_DATAIN_PER_CONNECTION) {
+ task = TAILQ_FIRST(&conn->queued_datain_tasks);
+ assert(task->current_datain_offset <= task->scsi.transfer_len);
+ if (task->current_datain_offset < task->scsi.transfer_len) {
+ struct spdk_iscsi_task *subtask;
+ uint32_t remaining_size = 0;
+
+ remaining_size = task->scsi.transfer_len - task->current_datain_offset;
+ subtask = iscsi_task_get(conn, task, iscsi_task_cpl);
+ assert(subtask != NULL);
+ subtask->scsi.offset = task->current_datain_offset;
+ spdk_scsi_task_set_data(&subtask->scsi, NULL, 0);
+
+ if (spdk_scsi_dev_get_lun(conn->dev, task->lun_id) == NULL) {
+ /* Stop submitting split read I/Os for remaining data. */
+ TAILQ_REMOVE(&conn->queued_datain_tasks, task, link);
+ task->current_datain_offset += remaining_size;
+ assert(task->current_datain_offset == task->scsi.transfer_len);
+ subtask->scsi.transfer_len = remaining_size;
+ spdk_scsi_task_process_null_lun(&subtask->scsi);
+ iscsi_task_cpl(&subtask->scsi);
+ return 0;
+ }
+
+ subtask->scsi.length = spdk_min(SPDK_BDEV_LARGE_BUF_MAX_SIZE, remaining_size);
+ task->current_datain_offset += subtask->scsi.length;
+ iscsi_queue_task(conn, subtask);
+ }
+ if (task->current_datain_offset == task->scsi.transfer_len) {
+ TAILQ_REMOVE(&conn->queued_datain_tasks, task, link);
+ }
+ }
+ return 0;
+}
+
+void
+iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task);
+
+ iscsi_task_mgmt_response(task->conn, task);
+ iscsi_task_put(task);
+}
+
+static void
+iscsi_task_copy_to_rsp_scsi_status(struct spdk_iscsi_task *primary,
+ struct spdk_scsi_task *task)
+{
+ memcpy(primary->rsp_sense_data, task->sense_data, task->sense_data_len);
+ primary->rsp_sense_data_len = task->sense_data_len;
+ primary->rsp_scsi_status = task->status;
+}
+
+static void
+iscsi_task_copy_from_rsp_scsi_status(struct spdk_scsi_task *task,
+ struct spdk_iscsi_task *primary)
+{
+ memcpy(task->sense_data, primary->rsp_sense_data,
+ primary->rsp_sense_data_len);
+ task->sense_data_len = primary->rsp_sense_data_len;
+ task->status = primary->rsp_scsi_status;
+}
+
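+/* Flush completed read subtasks to the initiator in offset order. Subtasks
+ * that completed out of order stay on primary->subtask_list until the
+ * preceding data has been sent.
+ */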
+static void
+process_completed_read_subtask_list(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *primary)
+{
+ struct spdk_iscsi_task *subtask, *tmp;
+
+ TAILQ_FOREACH_SAFE(subtask, &primary->subtask_list, subtask_link, tmp) {
+ if (subtask->scsi.offset == primary->bytes_completed) {
+ TAILQ_REMOVE(&primary->subtask_list, subtask, subtask_link);
+ primary->bytes_completed += subtask->scsi.length;
+ iscsi_task_response(conn, subtask);
+ iscsi_task_put(subtask);
+ } else {
+ break;
+ }
+ }
+
+ if (primary->bytes_completed == primary->scsi.transfer_len) {
+ iscsi_task_put(primary);
+ }
+}
+
+static void
+process_read_task_completion(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task,
+ struct spdk_iscsi_task *primary)
+{
+ struct spdk_iscsi_task *tmp;
+
+ /* If the completed subtask is the first failure, copy its status to the
+ * out-of-order subtasks already queued and remember it as the status of
+ * the command.
+ *
+ * Even if the completed subtask succeeded, copy the first failed status
+ * to it if another subtask has already failed.
+ */
+ if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) {
+ if (primary->rsp_scsi_status == SPDK_SCSI_STATUS_GOOD) {
+ TAILQ_FOREACH(tmp, &primary->subtask_list, subtask_link) {
+ spdk_scsi_task_copy_status(&tmp->scsi, &task->scsi);
+ }
+ iscsi_task_copy_to_rsp_scsi_status(primary, &task->scsi);
+ }
+ } else if (primary->rsp_scsi_status != SPDK_SCSI_STATUS_GOOD) {
+ iscsi_task_copy_from_rsp_scsi_status(&task->scsi, primary);
+ }
+
+ if (task == primary) {
+ primary->bytes_completed = task->scsi.length;
+ /* For non split read I/O */
+ assert(primary->bytes_completed == task->scsi.transfer_len);
+ iscsi_task_response(conn, task);
+ iscsi_task_put(task);
+ } else {
+ if (task->scsi.offset != primary->bytes_completed) {
+ TAILQ_FOREACH(tmp, &primary->subtask_list, subtask_link) {
+ if (task->scsi.offset < tmp->scsi.offset) {
+ TAILQ_INSERT_BEFORE(tmp, task, subtask_link);
+ return;
+ }
+ }
+
+ TAILQ_INSERT_TAIL(&primary->subtask_list, task, subtask_link);
+ } else {
+ TAILQ_INSERT_HEAD(&primary->subtask_list, task, subtask_link);
+ process_completed_read_subtask_list(conn, primary);
+ }
+ }
+}
+
+static void
+process_non_read_task_completion(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task,
+ struct spdk_iscsi_task *primary)
+{
+ primary->bytes_completed += task->scsi.length;
+
+ /* If the status of the subtask is the first failure, remember it as
+ * the status of the command and set it to the status of the primary
+ * task later.
+ *
+ * If the first failed task is the primary, two copies can be avoided
+ * but code simplicity is prioritized.
+ */
+ if (task->scsi.status == SPDK_SCSI_STATUS_GOOD) {
+ if (task != primary) {
+ primary->scsi.data_transferred += task->scsi.data_transferred;
+ }
+ } else if (primary->rsp_scsi_status == SPDK_SCSI_STATUS_GOOD) {
+ iscsi_task_copy_to_rsp_scsi_status(primary, &task->scsi);
+ }
+
+ if (primary->bytes_completed == primary->scsi.transfer_len) {
+ /*
+ * Check if this is the last task completed for an iSCSI write
+ * that required child subtasks. If task != primary, we know
+ * for sure that it was part of an iSCSI write with child subtasks.
+ * The trickier case is when the last task completed was the initial
+ * task - in this case the task will have a smaller length than
+ * the overall transfer length.
+ */
+ if (task != primary || task->scsi.length != task->scsi.transfer_len) {
+ /* If the LUN is removed in the middle of an iSCSI write sequence, the
+ * primary task may already have been completed to the initiator, because
+ * the initiator is not guaranteed to send all the data requested by R2Ts.
+ *
+ * Check for that case and skip the response below if the primary has
+ * already been completed. (See iscsi_clear_all_transfer_task() in iscsi.c.)
+ */
+ if (primary->is_r2t_active) {
+ if (primary->rsp_scsi_status != SPDK_SCSI_STATUS_GOOD) {
+ iscsi_task_copy_from_rsp_scsi_status(&primary->scsi, primary);
+ }
+ iscsi_task_response(conn, primary);
+ iscsi_del_transfer_task(conn, primary->tag);
+ }
+ } else {
+ iscsi_task_response(conn, task);
+ }
+ }
+ iscsi_task_put(task);
+}
+
+void
+iscsi_task_cpl(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_iscsi_task *primary;
+ struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task);
+ struct spdk_iscsi_conn *conn = task->conn;
+ struct spdk_iscsi_pdu *pdu = task->pdu;
+
+ spdk_trace_record(TRACE_ISCSI_TASK_DONE, conn->id, 0, (uintptr_t)task, 0);
+
+ task->is_queued = false;
+ primary = iscsi_task_get_primary(task);
+
+ if (iscsi_task_is_read(primary)) {
+ process_read_task_completion(conn, task, primary);
+ } else {
+ process_non_read_task_completion(conn, task, primary);
+ }
+ if (!task->parent) {
+ spdk_trace_record(TRACE_ISCSI_PDU_COMPLETED, 0, 0, (uintptr_t)pdu, 0);
+ }
+}
+
+static void
+iscsi_conn_send_nopin(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_bhs_nop_in *rsp;
+ /* Only send nopin if we have logged in and are in a normal session. */
+ if (conn->sess == NULL ||
+ !conn->full_feature ||
+ !iscsi_param_eq_val(conn->sess->params, "SessionType", "Normal")) {
+ return;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "send NOPIN isid=%"PRIx64", tsih=%u, cid=%u\n",
+ conn->sess->isid, conn->sess->tsih, conn->cid);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n",
+ conn->StatSN, conn->sess->ExpCmdSN,
+ conn->sess->MaxCmdSN);
+ rsp_pdu = iscsi_get_pdu(conn);
+ rsp = (struct iscsi_bhs_nop_in *) &rsp_pdu->bhs;
+ rsp_pdu->data = NULL;
+ /*
+ * iscsi_get_pdu() memsets the PDU for us, so only fill out the needed
+ * fields.
+ */
+ rsp->opcode = ISCSI_OP_NOPIN;
+ rsp->flags = 0x80;
+ /*
+ * Technically the to_be32() is not needed here, since
+ * to_be32(0xFFFFFFFFU) returns 0xFFFFFFFFU.
+ */
+ to_be32(&rsp->itt, 0xFFFFFFFFU);
+ to_be32(&rsp->ttt, conn->id);
+ to_be32(&rsp->stat_sn, conn->StatSN);
+ to_be32(&rsp->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsp->max_cmd_sn, conn->sess->MaxCmdSN);
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL);
+ conn->last_nopin = spdk_get_ticks();
+ conn->nop_outstanding = true;
+}
+
+void
+iscsi_conn_handle_nop(struct spdk_iscsi_conn *conn)
+{
+ uint64_t tsc;
+
+ /**
+ * This function is executed by the iSCSI poll group's nop_poller, so check
+ * the connection state first and then check whether the NOP interval has
+ * expired.
+ */
+ if ((conn->state == ISCSI_CONN_STATE_EXITED) ||
+ (conn->state == ISCSI_CONN_STATE_EXITING)) {
+ return;
+ }
+
+ /* Check for nop interval expiration */
+ tsc = spdk_get_ticks();
+ if (conn->nop_outstanding) {
+ if ((tsc - conn->last_nopin) > conn->timeout) {
+ SPDK_ERRLOG("Timed out waiting for NOP-Out response from initiator\n");
+ SPDK_ERRLOG(" tsc=0x%" PRIx64 ", last_nopin=0x%" PRIx64 "\n", tsc, conn->last_nopin);
+ SPDK_ERRLOG(" initiator=%s, target=%s\n", conn->initiator_name,
+ conn->target_short_name);
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ }
+ } else if (tsc - conn->last_nopin > conn->nopininterval) {
+ iscsi_conn_send_nopin(conn);
+ }
+}
+
+/**
+ * \brief Reads data for the specified iSCSI connection from its TCP socket.
+ *
+ * The TCP socket is marked as non-blocking, so this function may not read
+ * all data requested.
+ *
+ * Returns SPDK_ISCSI_CONNECTION_FATAL if the recv() operation indicates a fatal
+ * error with the TCP connection (including if the TCP connection was closed
+ * unexpectedly).
+ *
+ * Otherwise returns the number of bytes successfully read.
+ */
+int
+iscsi_conn_read_data(struct spdk_iscsi_conn *conn, int bytes,
+ void *buf)
+{
+ int ret;
+
+ if (bytes == 0) {
+ return 0;
+ }
+
+ ret = spdk_sock_recv(conn->sock, buf, bytes);
+
+ if (ret > 0) {
+ spdk_trace_record(TRACE_ISCSI_READ_FROM_SOCKET_DONE, conn->id, ret, 0, 0);
+ return ret;
+ }
+
+ if (ret < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return 0;
+ }
+
+ /* Do not log an error for a connection reset */
+ if (errno == ECONNRESET) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_sock_recv() failed, errno %d: %s\n",
+ errno, spdk_strerror(errno));
+ } else {
+ SPDK_ERRLOG("spdk_sock_recv() failed, errno %d: %s\n",
+ errno, spdk_strerror(errno));
+ }
+ }
+
+ /* connection closed */
+ return SPDK_ISCSI_CONNECTION_FATAL;
+}
+
+int
+iscsi_conn_readv_data(struct spdk_iscsi_conn *conn,
+ struct iovec *iov, int iovcnt)
+{
+ int ret;
+
+ if (iov == NULL || iovcnt == 0) {
+ return 0;
+ }
+
+ if (iovcnt == 1) {
+ return iscsi_conn_read_data(conn, iov[0].iov_len,
+ iov[0].iov_base);
+ }
+
+ ret = spdk_sock_readv(conn->sock, iov, iovcnt);
+
+ if (ret > 0) {
+ spdk_trace_record(TRACE_ISCSI_READ_FROM_SOCKET_DONE, conn->id, ret, 0, 0);
+ return ret;
+ }
+
+ if (ret < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return 0;
+ }
+
+ /* Do not log an error for a connection reset */
+ if (errno == ECONNRESET) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_sock_readv() failed, errno %d: %s\n",
+ errno, spdk_strerror(errno));
+ } else {
+ SPDK_ERRLOG("spdk_sock_readv() failed, errno %d: %s\n",
+ errno, spdk_strerror(errno));
+ }
+ }
+
+ /* connection closed */
+ return SPDK_ISCSI_CONNECTION_FATAL;
+}
+
+static bool
+iscsi_is_free_pdu_deferred(struct spdk_iscsi_pdu *pdu)
+{
+ if (pdu == NULL) {
+ return false;
+ }
+
+ if (pdu->bhs.opcode == ISCSI_OP_R2T ||
+ pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) {
+ return true;
+ }
+
+ return false;
+}
+
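+/* Verify the DIF metadata of the PDU payload before the PDU is queued for
+ * transmission.
+ */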
+static int
+iscsi_dif_verify(struct spdk_iscsi_pdu *pdu, struct spdk_dif_ctx *dif_ctx)
+{
+ struct iovec iov;
+ struct spdk_dif_error err_blk = {};
+ uint32_t num_blocks;
+ int rc;
+
+ iov.iov_base = pdu->data;
+ iov.iov_len = pdu->data_buf_len;
+ num_blocks = pdu->data_buf_len / dif_ctx->block_size;
+
+ rc = spdk_dif_verify(&iov, 1, num_blocks, dif_ctx, &err_blk);
+ if (rc != 0) {
+ SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
+ err_blk.err_type, err_blk.err_offset);
+ }
+
+ return rc;
+}
+
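+/* Completion callback for asynchronous PDU writes. On error the connection is
+ * moved to EXITING. With ErrorRecoveryLevel >= 1, R2T and Data-IN PDUs are
+ * kept on the SNACK list for possible retransmission instead of being freed.
+ */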
+static void
+_iscsi_conn_pdu_write_done(void *cb_arg, int err)
+{
+ struct spdk_iscsi_pdu *pdu = cb_arg;
+ struct spdk_iscsi_conn *conn = pdu->conn;
+
+ assert(conn != NULL);
+
+ if (spdk_unlikely(conn->state >= ISCSI_CONN_STATE_EXITING)) {
+ /* The connection is already exiting; the shutdown path will reclaim this PDU */
+ return;
+ }
+
+ TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq);
+
+ if (err != 0) {
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ } else {
+ spdk_trace_record(TRACE_ISCSI_FLUSH_WRITEBUF_DONE, conn->id, pdu->mapped_length, (uintptr_t)pdu, 0);
+ }
+
+ if ((conn->full_feature) &&
+ (conn->sess->ErrorRecoveryLevel >= 1) &&
+ iscsi_is_free_pdu_deferred(pdu)) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "stat_sn=%d\n",
+ from_be32(&pdu->bhs.stat_sn));
+ TAILQ_INSERT_TAIL(&conn->snack_pdu_list, pdu,
+ tailq);
+ } else {
+ iscsi_conn_free_pdu(conn, pdu);
+ }
+}
+
+void
+iscsi_conn_pdu_generic_complete(void *cb_arg)
+{
+}
+
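+/* Queue a PDU for transmission: verify DIF if enabled, compute header/data
+ * digests when negotiated (digests are skipped for login responses), append
+ * the PDU to write_pdu_list and start an asynchronous writev.
+ */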
+void
+iscsi_conn_write_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu,
+ iscsi_conn_xfer_complete_cb cb_fn,
+ void *cb_arg)
+{
+ uint32_t crc32c;
+ ssize_t rc;
+
+ if (spdk_unlikely(pdu->dif_insert_or_strip)) {
+ rc = iscsi_dif_verify(pdu, &pdu->dif_ctx);
+ if (rc != 0) {
+ iscsi_conn_free_pdu(conn, pdu);
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ return;
+ }
+ }
+
+ if (pdu->bhs.opcode != ISCSI_OP_LOGIN_RSP) {
+ /* Header Digest */
+ if (conn->header_digest) {
+ crc32c = iscsi_pdu_calc_header_digest(pdu);
+ MAKE_DIGEST_WORD(pdu->header_digest, crc32c);
+ }
+
+ /* Data Digest */
+ if (conn->data_digest && DGET24(pdu->bhs.data_segment_len) != 0) {
+ crc32c = iscsi_pdu_calc_data_digest(pdu);
+ MAKE_DIGEST_WORD(pdu->data_digest, crc32c);
+ }
+ }
+
+ pdu->cb_fn = cb_fn;
+ pdu->cb_arg = cb_arg;
+ TAILQ_INSERT_TAIL(&conn->write_pdu_list, pdu, tailq);
+
+ if (spdk_unlikely(conn->state >= ISCSI_CONN_STATE_EXITING)) {
+ return;
+ }
+ pdu->sock_req.iovcnt = iscsi_build_iovs(conn, pdu->iov, SPDK_COUNTOF(pdu->iov), pdu,
+ &pdu->mapped_length);
+ pdu->sock_req.cb_fn = _iscsi_conn_pdu_write_done;
+ pdu->sock_req.cb_arg = pdu;
+
+ spdk_trace_record(TRACE_ISCSI_FLUSH_WRITEBUF_START, conn->id, pdu->mapped_length, (uintptr_t)pdu,
+ pdu->sock_req.iovcnt);
+ spdk_sock_writev_async(conn->sock, &pdu->sock_req);
+}
+
+static void
+iscsi_conn_sock_cb(void *arg, struct spdk_sock_group *group, struct spdk_sock *sock)
+{
+ struct spdk_iscsi_conn *conn = arg;
+ int rc;
+
+ assert(conn != NULL);
+
+ if ((conn->state == ISCSI_CONN_STATE_EXITED) ||
+ (conn->state == ISCSI_CONN_STATE_EXITING)) {
+ return;
+ }
+
+ /* Handle incoming PDUs */
+ rc = iscsi_handle_incoming_pdus(conn);
+ if (rc < 0) {
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ }
+}
+
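+/* Executed on the poll group thread chosen by iscsi_conn_schedule(): open the
+ * LUNs for normal sessions and add the connection to its new poll group.
+ */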
+static void
+iscsi_conn_full_feature_migrate(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ if (conn->state >= ISCSI_CONN_STATE_EXITING) {
+ /* Connection is being exited before this callback is executed. */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connection is already exited.\n");
+ return;
+ }
+
+ if (conn->sess->session_type == SESSION_TYPE_NORMAL) {
+ iscsi_conn_open_luns(conn);
+ }
+
+ /* Add this connection to the assigned poll group. */
+ iscsi_poll_group_add_conn(conn->pg, conn);
+}
+
+static struct spdk_iscsi_poll_group *g_next_pg = NULL;
+
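+/* Assign a full-feature connection to a poll group. The first connection to a
+ * target node picks the next poll group round-robin; later connections to the
+ * same target node reuse that poll group.
+ */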
+void
+iscsi_conn_schedule(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_poll_group *pg;
+ struct spdk_iscsi_tgt_node *target;
+
+ if (conn->sess->session_type != SESSION_TYPE_NORMAL) {
+ /* Leave all non-normal sessions on the acceptor
+ * thread. */
+ return;
+ }
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ target = conn->sess->target;
+ pthread_mutex_lock(&target->mutex);
+ target->num_active_conns++;
+ if (target->num_active_conns == 1) {
+ /**
+ * This is the only active connection for this target node.
+ * Pick a poll group using round-robin.
+ */
+ if (g_next_pg == NULL) {
+ g_next_pg = TAILQ_FIRST(&g_iscsi.poll_group_head);
+ assert(g_next_pg != NULL);
+ }
+
+ pg = g_next_pg;
+ g_next_pg = TAILQ_NEXT(g_next_pg, link);
+
+ /* Save the pg in the target node so it can be used for any other connections to this target node. */
+ target->pg = pg;
+ } else {
+ /**
+ * There are other active connections for this target node.
+ */
+ pg = target->pg;
+ }
+
+ pthread_mutex_unlock(&target->mutex);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ assert(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(conn->pg)) ==
+ spdk_get_thread());
+
+ /* Remove this connection from the previous poll group */
+ iscsi_poll_group_remove_conn(conn->pg, conn);
+
+ conn->last_nopin = spdk_get_ticks();
+ conn->pg = pg;
+
+ spdk_thread_send_msg(spdk_io_channel_get_thread(spdk_io_channel_from_ctx(pg)),
+ iscsi_conn_full_feature_migrate, conn);
+}
+
+static int
+logout_timeout(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ if (conn->state < ISCSI_CONN_STATE_EXITING) {
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+void
+iscsi_conn_logout(struct spdk_iscsi_conn *conn)
+{
+ conn->is_logged_out = true;
+ conn->logout_timer = SPDK_POLLER_REGISTER(logout_timeout, conn, ISCSI_LOGOUT_TIMEOUT * 1000000);
+}
+
+SPDK_TRACE_REGISTER_FN(iscsi_conn_trace, "iscsi_conn", TRACE_GROUP_ISCSI)
+{
+ spdk_trace_register_owner(OWNER_ISCSI_CONN, 'c');
+ spdk_trace_register_object(OBJECT_ISCSI_PDU, 'p');
+ spdk_trace_register_description("ISCSI_READ_DONE", TRACE_ISCSI_READ_FROM_SOCKET_DONE,
+ OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("ISCSI_WRITE_START", TRACE_ISCSI_FLUSH_WRITEBUF_START,
+ OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, "iovec: ");
+ spdk_trace_register_description("ISCSI_WRITE_DONE", TRACE_ISCSI_FLUSH_WRITEBUF_DONE,
+ OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("ISCSI_READ_PDU", TRACE_ISCSI_READ_PDU,
+ OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 1, 0, "opc: ");
+ spdk_trace_register_description("ISCSI_TASK_DONE", TRACE_ISCSI_TASK_DONE,
+ OWNER_ISCSI_CONN, OBJECT_SCSI_TASK, 0, 0, "");
+ spdk_trace_register_description("ISCSI_TASK_QUEUE", TRACE_ISCSI_TASK_QUEUE,
+ OWNER_ISCSI_CONN, OBJECT_SCSI_TASK, 1, 1, "pdu: ");
+ spdk_trace_register_description("ISCSI_TASK_EXECUTED", TRACE_ISCSI_TASK_EXECUTED,
+ OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 0, 0, "");
+ spdk_trace_register_description("ISCSI_PDU_COMPLETED", TRACE_ISCSI_PDU_COMPLETED,
+ OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 0, 0, "");
+}
+
+void
+iscsi_conn_info_json(struct spdk_json_write_ctx *w, struct spdk_iscsi_conn *conn)
+{
+ uint16_t tsih;
+
+ if (!conn->is_valid) {
+ return;
+ }
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_int32(w, "id", conn->id);
+
+ spdk_json_write_named_int32(w, "cid", conn->cid);
+
+ /*
+ * If we try to return data for a connection that has not
+ * logged in yet, the session will not be set. So in this
+ * case, return -1 for the tsih rather than segfaulting
+ * on the null conn->sess.
+ */
+ if (conn->sess == NULL) {
+ tsih = -1;
+ } else {
+ tsih = conn->sess->tsih;
+ }
+ spdk_json_write_named_int32(w, "tsih", tsih);
+
+ spdk_json_write_named_string(w, "initiator_addr", conn->initiator_addr);
+
+ spdk_json_write_named_string(w, "target_addr", conn->target_addr);
+
+ spdk_json_write_named_string(w, "target_node_name", conn->target_short_name);
+
+ spdk_json_write_named_string(w, "thread_name",
+ spdk_thread_get_name(spdk_get_thread()));
+
+ spdk_json_write_object_end(w);
+}
diff --git a/src/spdk/lib/iscsi/conn.h b/src/spdk/lib/iscsi/conn.h
new file mode 100644
index 000000000..a85d2ddeb
--- /dev/null
+++ b/src/spdk/lib/iscsi/conn.h
@@ -0,0 +1,237 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ISCSI_CONN_H
+#define SPDK_ISCSI_CONN_H
+
+#include "spdk/stdinc.h"
+
+#include "iscsi/iscsi.h"
+#include "spdk/queue.h"
+#include "spdk/cpuset.h"
+#include "spdk/scsi.h"
+
+/*
+ * MAX_CONNECTION_PARAMS: The number of parameters in conn_param_table
+ * MAX_SESSION_PARAMS: The number of parameters in sess_param_table
+ */
+#define MAX_CONNECTION_PARAMS 14
+#define MAX_SESSION_PARAMS 19
+
+#define MAX_ADDRBUF 64
+#define MAX_INITIATOR_ADDR (MAX_ADDRBUF)
+#define MAX_TARGET_ADDR (MAX_ADDRBUF)
+
+#define OWNER_ISCSI_CONN 0x1
+
+#define OBJECT_ISCSI_PDU 0x1
+
+#define TRACE_GROUP_ISCSI 0x1
+#define TRACE_ISCSI_READ_FROM_SOCKET_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x0)
+#define TRACE_ISCSI_FLUSH_WRITEBUF_START SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x1)
+#define TRACE_ISCSI_FLUSH_WRITEBUF_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x2)
+#define TRACE_ISCSI_READ_PDU SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x3)
+#define TRACE_ISCSI_TASK_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x4)
+#define TRACE_ISCSI_TASK_QUEUE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x5)
+#define TRACE_ISCSI_TASK_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x6)
+#define TRACE_ISCSI_PDU_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x7)
+
+enum iscsi_pdu_recv_state {
+ /* Ready to wait for PDU */
+ ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY,
+
+ /* Active connection waiting for any PDU header */
+ ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR,
+
+ /* Active connection waiting for payload */
+ ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD,
+
+ /* An error occurred while receiving the PDU */
+ ISCSI_PDU_RECV_STATE_ERROR,
+};
+
+struct spdk_poller;
+struct spdk_iscsi_conn;
+
+struct spdk_iscsi_lun {
+ struct spdk_iscsi_conn *conn;
+ struct spdk_scsi_lun *lun;
+ struct spdk_scsi_lun_desc *desc;
+ struct spdk_poller *remove_poller;
+};
+
+struct spdk_iscsi_conn {
+ int id;
+ int is_valid;
+ /*
+ * All fields below this point are reinitialized each time the
+ * connection object is allocated. Make sure to update the
+ * SPDK_ISCSI_CONNECTION_MEMSET() macro if changing which fields
+ * are initialized when allocated.
+ */
+ struct spdk_iscsi_portal *portal;
+ int pg_tag;
+ char portal_host[MAX_PORTAL_ADDR + 1];
+ char portal_port[MAX_PORTAL_ADDR + 1];
+ struct spdk_iscsi_poll_group *pg;
+ struct spdk_sock *sock;
+ struct spdk_iscsi_sess *sess;
+
+ enum iscsi_connection_state state;
+ int login_phase;
+ bool is_logged_out;
+ struct spdk_iscsi_pdu *login_rsp_pdu;
+
+ uint64_t last_flush;
+ uint64_t last_fill;
+ uint64_t last_nopin;
+
+ /* Timer used to destroy connection after requesting logout if
+ * initiator does not send logout request.
+ */
+ struct spdk_poller *logout_request_timer;
+
+ /* Timer used to destroy connection after logout if initiator does
+ * not close the connection.
+ */
+ struct spdk_poller *logout_timer;
+
+ /* Timer used to wait for connection to close
+ */
+ struct spdk_poller *shutdown_timer;
+
+ struct spdk_iscsi_pdu *pdu_in_progress;
+ enum iscsi_pdu_recv_state pdu_recv_state;
+
+ TAILQ_HEAD(, spdk_iscsi_pdu) write_pdu_list;
+ TAILQ_HEAD(, spdk_iscsi_pdu) snack_pdu_list;
+
+ int pending_r2t;
+
+ uint16_t cid;
+
+ /* IP address */
+ char initiator_addr[MAX_INITIATOR_ADDR];
+ char target_addr[MAX_TARGET_ADDR];
+
+ /* Initiator/Target port binds */
+ char initiator_name[MAX_INITIATOR_NAME];
+ struct spdk_scsi_port *initiator_port;
+ char target_short_name[MAX_TARGET_NAME];
+ struct spdk_scsi_port *target_port;
+ struct spdk_iscsi_tgt_node *target;
+ struct spdk_scsi_dev *dev;
+
+ /* for fast access */
+ int header_digest;
+ int data_digest;
+ int full_feature;
+
+ struct iscsi_param *params;
+ bool sess_param_state_negotiated[MAX_SESSION_PARAMS];
+ bool conn_param_state_negotiated[MAX_CONNECTION_PARAMS];
+ struct iscsi_chap_auth auth;
+ bool authenticated;
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int32_t chap_group;
+ uint32_t pending_task_cnt;
+ uint32_t data_out_cnt;
+ uint32_t data_in_cnt;
+
+ uint64_t timeout;
+ uint64_t nopininterval;
+ bool nop_outstanding;
+
+ /*
+ * This is the maximum data segment length that the iSCSI target can send
+ * to the initiator on this connection. Not to be confused with the
+ * maximum data segment length that initiators can send to the iSCSI target,
+ * which is statically defined as SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH.
+ */
+ int MaxRecvDataSegmentLength;
+
+ uint32_t StatSN;
+ uint32_t exp_statsn;
+ uint32_t ttt; /* target transfer tag */
+ char *partial_text_parameter;
+
+ STAILQ_ENTRY(spdk_iscsi_conn) pg_link;
+ bool is_stopped; /* Set true when connection is stopped for migration */
+ TAILQ_HEAD(queued_r2t_tasks, spdk_iscsi_task) queued_r2t_tasks;
+ TAILQ_HEAD(active_r2t_tasks, spdk_iscsi_task) active_r2t_tasks;
+ TAILQ_HEAD(queued_datain_tasks, spdk_iscsi_task) queued_datain_tasks;
+
+ struct spdk_iscsi_lun *luns[SPDK_SCSI_DEV_MAX_LUN];
+
+ TAILQ_ENTRY(spdk_iscsi_conn) conn_link;
+};
+
+extern struct spdk_iscsi_conn *g_conns_array;
+
+void iscsi_task_cpl(struct spdk_scsi_task *scsi_task);
+void iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task);
+
+int initialize_iscsi_conns(void);
+void shutdown_iscsi_conns(void);
+void iscsi_conns_request_logout(struct spdk_iscsi_tgt_node *target);
+int iscsi_get_active_conns(struct spdk_iscsi_tgt_node *target);
+
+int iscsi_conn_construct(struct spdk_iscsi_portal *portal, struct spdk_sock *sock);
+void iscsi_conn_destruct(struct spdk_iscsi_conn *conn);
+void iscsi_conn_handle_nop(struct spdk_iscsi_conn *conn);
+void iscsi_conn_schedule(struct spdk_iscsi_conn *conn);
+void iscsi_conn_logout(struct spdk_iscsi_conn *conn);
+int iscsi_drop_conns(struct spdk_iscsi_conn *conn,
+ const char *conn_match, int drop_all);
+int iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn);
+int iscsi_conn_abort_queued_datain_task(struct spdk_iscsi_conn *conn,
+ uint32_t ref_task_tag);
+int iscsi_conn_abort_queued_datain_tasks(struct spdk_iscsi_conn *conn,
+ struct spdk_scsi_lun *lun,
+ struct spdk_iscsi_pdu *pdu);
+
+int iscsi_conn_read_data(struct spdk_iscsi_conn *conn, int len, void *buf);
+int iscsi_conn_readv_data(struct spdk_iscsi_conn *conn,
+ struct iovec *iov, int iovcnt);
+void iscsi_conn_write_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu,
+ iscsi_conn_xfer_complete_cb cb_fn,
+ void *cb_arg);
+
+void iscsi_conn_free_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu);
+
+void iscsi_conn_info_json(struct spdk_json_write_ctx *w, struct spdk_iscsi_conn *conn);
+void iscsi_conn_pdu_generic_complete(void *cb_arg);
+#endif /* SPDK_ISCSI_CONN_H */
diff --git a/src/spdk/lib/iscsi/init_grp.c b/src/spdk/lib/iscsi/init_grp.c
new file mode 100644
index 000000000..49e78d89d
--- /dev/null
+++ b/src/spdk/lib/iscsi/init_grp.c
@@ -0,0 +1,787 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/conf.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+#include "iscsi/iscsi.h"
+#include "iscsi/init_grp.h"
+
+static struct spdk_iscsi_init_grp *
+iscsi_init_grp_create(int tag)
+{
+ struct spdk_iscsi_init_grp *ig;
+
+ ig = calloc(1, sizeof(*ig));
+ if (ig == NULL) {
+ SPDK_ERRLOG("calloc() failed for initiator group\n");
+ return NULL;
+ }
+
+ ig->tag = tag;
+ TAILQ_INIT(&ig->initiator_head);
+ TAILQ_INIT(&ig->netmask_head);
+ return ig;
+}
+
+static struct spdk_iscsi_initiator_name *
+iscsi_init_grp_find_initiator(struct spdk_iscsi_init_grp *ig, char *name)
+{
+ struct spdk_iscsi_initiator_name *iname;
+
+ TAILQ_FOREACH(iname, &ig->initiator_head, tailq) {
+ if (!strcmp(iname->name, name)) {
+ return iname;
+ }
+ }
+ return NULL;
+}
+
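+/* Add an initiator name to the group after checking the group's capacity and
+ * the name's length and uniqueness. The legacy "ALL" wildcard is converted to
+ * "ANY".
+ */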
+static int
+iscsi_init_grp_add_initiator(struct spdk_iscsi_init_grp *ig, char *name)
+{
+ struct spdk_iscsi_initiator_name *iname;
+ char *p;
+ size_t len;
+
+ if (ig->ninitiators >= MAX_INITIATOR) {
+ SPDK_ERRLOG("> MAX_INITIATOR(=%d) is not allowed\n", MAX_INITIATOR);
+ return -EPERM;
+ }
+
+ len = strlen(name);
+ if (len > MAX_INITIATOR_NAME) {
+ SPDK_ERRLOG("Initiator Name is larger than 223 bytes\n");
+ return -EINVAL;
+ }
+
+ iname = iscsi_init_grp_find_initiator(ig, name);
+ if (iname != NULL) {
+ return -EEXIST;
+ }
+
+ iname = calloc(1, sizeof(*iname));
+ if (iname == NULL) {
+ SPDK_ERRLOG("calloc() failed for initiator name str\n");
+ return -ENOMEM;
+ }
+
+ memcpy(iname->name, name, len);
+
+ /* Replace "ALL" by "ANY" if set */
+ p = strstr(iname->name, "ALL");
+ if (p != NULL) {
+ SPDK_WARNLOG("Please use \"%s\" instead of \"%s\"\n", "ANY", "ALL");
+ SPDK_WARNLOG("Converting \"%s\" to \"%s\" automatically\n", "ALL", "ANY");
+ memcpy(p, "ANY", 3);
+ }
+
+ TAILQ_INSERT_TAIL(&ig->initiator_head, iname, tailq);
+ ig->ninitiators++;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "InitiatorName %s\n", name);
+ return 0;
+}
+
+static int
+iscsi_init_grp_delete_initiator(struct spdk_iscsi_init_grp *ig, char *name)
+{
+ struct spdk_iscsi_initiator_name *iname;
+
+ iname = iscsi_init_grp_find_initiator(ig, name);
+ if (iname == NULL) {
+ return -ENOENT;
+ }
+
+ TAILQ_REMOVE(&ig->initiator_head, iname, tailq);
+ ig->ninitiators--;
+ free(iname);
+ return 0;
+}
+
+static int
+iscsi_init_grp_add_initiators(struct spdk_iscsi_init_grp *ig, int num_inames,
+ char **inames)
+{
+ int i;
+ int rc;
+
+ for (i = 0; i < num_inames; i++) {
+ rc = iscsi_init_grp_add_initiator(ig, inames[i]);
+ if (rc < 0) {
+ goto cleanup;
+ }
+ }
+ return 0;
+
+cleanup:
+ for (; i > 0; --i) {
+ iscsi_init_grp_delete_initiator(ig, inames[i - 1]);
+ }
+ return rc;
+}
+
+static void
+iscsi_init_grp_delete_all_initiators(struct spdk_iscsi_init_grp *ig)
+{
+ struct spdk_iscsi_initiator_name *iname, *tmp;
+
+ TAILQ_FOREACH_SAFE(iname, &ig->initiator_head, tailq, tmp) {
+ TAILQ_REMOVE(&ig->initiator_head, iname, tailq);
+ ig->ninitiators--;
+ free(iname);
+ }
+}
+
+static int
+iscsi_init_grp_delete_initiators(struct spdk_iscsi_init_grp *ig, int num_inames, char **inames)
+{
+ int i;
+ int rc;
+
+ for (i = 0; i < num_inames; i++) {
+ rc = iscsi_init_grp_delete_initiator(ig, inames[i]);
+ if (rc < 0) {
+ goto cleanup;
+ }
+ }
+ return 0;
+
+cleanup:
+ for (; i > 0; --i) {
+ rc = iscsi_init_grp_add_initiator(ig, inames[i - 1]);
+ if (rc != 0) {
+ iscsi_init_grp_delete_all_initiators(ig);
+ break;
+ }
+ }
+ return -1;
+}
+
+static struct spdk_iscsi_initiator_netmask *
+iscsi_init_grp_find_netmask(struct spdk_iscsi_init_grp *ig, const char *mask)
+{
+ struct spdk_iscsi_initiator_netmask *netmask;
+
+ TAILQ_FOREACH(netmask, &ig->netmask_head, tailq) {
+ if (!strcmp(netmask->mask, mask)) {
+ return netmask;
+ }
+ }
+ return NULL;
+}
+
+static int
+iscsi_init_grp_add_netmask(struct spdk_iscsi_init_grp *ig, char *mask)
+{
+ struct spdk_iscsi_initiator_netmask *imask;
+ char *p;
+ size_t len;
+
+ if (ig->nnetmasks >= MAX_NETMASK) {
+ SPDK_ERRLOG("> MAX_NETMASK(=%d) is not allowed\n", MAX_NETMASK);
+ return -EPERM;
+ }
+
+ len = strlen(mask);
+ if (len > MAX_INITIATOR_ADDR) {
+ SPDK_ERRLOG("Netmask string is larger than %d bytes\n", MAX_INITIATOR_ADDR);
+ return -EINVAL;
+ }
+
+ imask = iscsi_init_grp_find_netmask(ig, mask);
+ if (imask != NULL) {
+ return -EEXIST;
+ }
+
+ imask = calloc(1, sizeof(*imask));
+ if (imask == NULL) {
+ SPDK_ERRLOG("calloc() failed for initiator netmask str\n");
+ return -ENOMEM;
+ }
+
+ memcpy(imask->mask, mask, len);
+
+ /* Replace "ALL" by "ANY" if set */
+ p = strstr(imask->mask, "ALL");
+ if (p != NULL) {
+ SPDK_WARNLOG("Please use \"%s\" instead of \"%s\"\n", "ANY", "ALL");
+ SPDK_WARNLOG("Converting \"%s\" to \"%s\" automatically\n", "ALL", "ANY");
+ memcpy(p, "ANY", 3);
+ }
+
+ TAILQ_INSERT_TAIL(&ig->netmask_head, imask, tailq);
+ ig->nnetmasks++;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Netmask %s\n", mask);
+ return 0;
+}
+
+static int
+iscsi_init_grp_delete_netmask(struct spdk_iscsi_init_grp *ig, char *mask)
+{
+ struct spdk_iscsi_initiator_netmask *imask;
+
+ imask = iscsi_init_grp_find_netmask(ig, mask);
+ if (imask == NULL) {
+ return -ENOENT;
+ }
+
+ TAILQ_REMOVE(&ig->netmask_head, imask, tailq);
+ ig->nnetmasks--;
+ free(imask);
+ return 0;
+}
+
+static int
+iscsi_init_grp_add_netmasks(struct spdk_iscsi_init_grp *ig, int num_imasks, char **imasks)
+{
+ int i;
+ int rc;
+
+ for (i = 0; i < num_imasks; i++) {
+ rc = iscsi_init_grp_add_netmask(ig, imasks[i]);
+ if (rc != 0) {
+ goto cleanup;
+ }
+ }
+ return 0;
+
+cleanup:
+ for (; i > 0; --i) {
+ iscsi_init_grp_delete_netmask(ig, imasks[i - 1]);
+ }
+ return rc;
+}
+
+static void
+iscsi_init_grp_delete_all_netmasks(struct spdk_iscsi_init_grp *ig)
+{
+ struct spdk_iscsi_initiator_netmask *imask, *tmp;
+
+ TAILQ_FOREACH_SAFE(imask, &ig->netmask_head, tailq, tmp) {
+ TAILQ_REMOVE(&ig->netmask_head, imask, tailq);
+ ig->nnetmasks--;
+ free(imask);
+ }
+}
+
+static int
+iscsi_init_grp_delete_netmasks(struct spdk_iscsi_init_grp *ig, int num_imasks, char **imasks)
+{
+ int i;
+ int rc;
+
+ for (i = 0; i < num_imasks; i++) {
+ rc = iscsi_init_grp_delete_netmask(ig, imasks[i]);
+ if (rc != 0) {
+ goto cleanup;
+ }
+ }
+ return 0;
+
+cleanup:
+ for (; i > 0; --i) {
+ rc = iscsi_init_grp_add_netmask(ig, imasks[i - 1]);
+ if (rc != 0) {
+ iscsi_init_grp_delete_all_netmasks(ig);
+ break;
+ }
+ }
+ return -1;
+}
+
+/* Read spdk iscsi target's config file and create initiator group */
+static int
+iscsi_parse_init_grp(struct spdk_conf_section *sp)
+{
+ int i, rc = 0;
+ const char *val = NULL;
+ int num_initiator_names;
+ int num_initiator_masks;
+ char **initiators = NULL, **netmasks = NULL;
+ int tag = spdk_conf_section_get_num(sp);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add initiator group %d\n", tag);
+
+ val = spdk_conf_section_get_val(sp, "Comment");
+ if (val != NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val);
+ }
+
+ /* count the number of InitiatorName and Netmask definitions */
+ for (i = 0; ; i++) {
+ val = spdk_conf_section_get_nval(sp, "InitiatorName", i);
+ if (val == NULL) {
+ break;
+ }
+ }
+ if (i == 0) {
+ SPDK_ERRLOG("num_initiator_names = 0\n");
+ return -EINVAL;
+ }
+ num_initiator_names = i;
+ if (num_initiator_names > MAX_INITIATOR) {
+ SPDK_ERRLOG("%d > MAX_INITIATOR\n", num_initiator_names);
+ return -E2BIG;
+ }
+ for (i = 0; ; i++) {
+ val = spdk_conf_section_get_nval(sp, "Netmask", i);
+ if (val == NULL) {
+ break;
+ }
+ }
+ if (i == 0) {
+ SPDK_ERRLOG("num_initiator_mask = 0\n");
+ return -EINVAL;
+ }
+ num_initiator_masks = i;
+ if (num_initiator_masks > MAX_NETMASK) {
+ SPDK_ERRLOG("%d > MAX_NETMASK\n", num_initiator_masks);
+ return -E2BIG;
+ }
+
+ initiators = calloc(num_initiator_names, sizeof(char *));
+ if (!initiators) {
+ SPDK_ERRLOG("calloc() failed for temp initiator name array\n");
+ return -ENOMEM;
+ }
+ for (i = 0; i < num_initiator_names; i++) {
+ val = spdk_conf_section_get_nval(sp, "InitiatorName", i);
+ if (!val) {
+ SPDK_ERRLOG("InitiatorName %d not found\n", i);
+ rc = -EINVAL;
+ goto cleanup;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "InitiatorName %s\n", val);
+ initiators[i] = strdup(val);
+ if (!initiators[i]) {
+ SPDK_ERRLOG("strdup() failed for temp initiator name\n");
+ rc = -ENOMEM;
+ goto cleanup;
+ }
+ }
+ netmasks = calloc(num_initiator_masks, sizeof(char *));
+ if (!netmasks) {
+ SPDK_ERRLOG("calloc() failed for temp netmask array\n");
+ rc = -ENOMEM;
+ goto cleanup;
+ }
+ for (i = 0; i < num_initiator_masks; i++) {
+ val = spdk_conf_section_get_nval(sp, "Netmask", i);
+ if (!val) {
+ SPDK_ERRLOG("Netmask %d not found\n", i);
+ rc = -EINVAL;
+ goto cleanup;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Netmask %s\n", val);
+ netmasks[i] = strdup(val);
+ if (!netmasks[i]) {
+ SPDK_ERRLOG("strdup() failed for temp initiator mask\n");
+ rc = -ENOMEM;
+ goto cleanup;
+ }
+ }
+
+ rc = iscsi_init_grp_create_from_initiator_list(tag,
+ num_initiator_names, initiators, num_initiator_masks, netmasks);
+
+cleanup:
+ if (initiators) {
+ for (i = 0; i < num_initiator_names; i++) {
+ if (initiators[i]) {
+ free(initiators[i]);
+ }
+ }
+ free(initiators);
+ }
+ if (netmasks) {
+ for (i = 0; i < num_initiator_masks; i++) {
+ if (netmasks[i]) {
+ free(netmasks[i]);
+ }
+ }
+ free(netmasks);
+ }
+ return rc;
+}
+
+int
+iscsi_init_grp_register(struct spdk_iscsi_init_grp *ig)
+{
+ struct spdk_iscsi_init_grp *tmp;
+ int rc = -1;
+
+ assert(ig != NULL);
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ tmp = iscsi_init_grp_find_by_tag(ig->tag);
+ if (tmp == NULL) {
+ TAILQ_INSERT_TAIL(&g_iscsi.ig_head, ig, tailq);
+ rc = 0;
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ return rc;
+}
+
+/*
+ * Create an initiator group from lists of initiator IPs/hostnames and netmasks.
+ * The initiator hostname/netmask lists are allocated by the caller on the
+ * heap and are freed later by the common initiator_group_destroy() code.
+ */
+int
+iscsi_init_grp_create_from_initiator_list(int tag,
+ int num_initiator_names,
+ char **initiator_names,
+ int num_initiator_masks,
+ char **initiator_masks)
+{
+ int rc = -1;
+ struct spdk_iscsi_init_grp *ig = NULL;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "add initiator group (from initiator list) tag=%d, #initiators=%d, #masks=%d\n",
+ tag, num_initiator_names, num_initiator_masks);
+
+ ig = iscsi_init_grp_create(tag);
+ if (!ig) {
+ SPDK_ERRLOG("initiator group create error (%d)\n", tag);
+ return rc;
+ }
+
+ rc = iscsi_init_grp_add_initiators(ig, num_initiator_names,
+ initiator_names);
+ if (rc < 0) {
+ SPDK_ERRLOG("add initiator name error\n");
+ goto cleanup;
+ }
+
+ rc = iscsi_init_grp_add_netmasks(ig, num_initiator_masks,
+ initiator_masks);
+ if (rc < 0) {
+ SPDK_ERRLOG("add initiator netmask error\n");
+ goto cleanup;
+ }
+
+ rc = iscsi_init_grp_register(ig);
+ if (rc < 0) {
+ SPDK_ERRLOG("initiator group register error (%d)\n", tag);
+ goto cleanup;
+ }
+ return 0;
+
+cleanup:
+ iscsi_init_grp_destroy(ig);
+ return rc;
+}
+
+int
+iscsi_init_grp_add_initiators_from_initiator_list(int tag,
+ int num_initiator_names,
+ char **initiator_names,
+ int num_initiator_masks,
+ char **initiator_masks)
+{
+ int rc = -1;
+ struct spdk_iscsi_init_grp *ig;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "add initiator to initiator group: tag=%d, #initiators=%d, #masks=%d\n",
+ tag, num_initiator_names, num_initiator_masks);
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ ig = iscsi_init_grp_find_by_tag(tag);
+ if (!ig) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ SPDK_ERRLOG("initiator group (%d) is not found\n", tag);
+ return rc;
+ }
+
+ rc = iscsi_init_grp_add_initiators(ig, num_initiator_names,
+ initiator_names);
+ if (rc < 0) {
+ SPDK_ERRLOG("add initiator name error\n");
+ goto error;
+ }
+
+ rc = iscsi_init_grp_add_netmasks(ig, num_initiator_masks,
+ initiator_masks);
+ if (rc < 0) {
+ SPDK_ERRLOG("add initiator netmask error\n");
+ iscsi_init_grp_delete_initiators(ig, num_initiator_names,
+ initiator_names);
+ }
+
+error:
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return rc;
+}
+
+int
+iscsi_init_grp_delete_initiators_from_initiator_list(int tag,
+ int num_initiator_names,
+ char **initiator_names,
+ int num_initiator_masks,
+ char **initiator_masks)
+{
+ int rc = -1;
+ struct spdk_iscsi_init_grp *ig;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "delete initiator from initiator group: tag=%d, #initiators=%d, #masks=%d\n",
+ tag, num_initiator_names, num_initiator_masks);
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ ig = iscsi_init_grp_find_by_tag(tag);
+ if (!ig) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ SPDK_ERRLOG("initiator group (%d) is not found\n", tag);
+ return rc;
+ }
+
+ rc = iscsi_init_grp_delete_initiators(ig, num_initiator_names,
+ initiator_names);
+ if (rc < 0) {
+ SPDK_ERRLOG("delete initiator name error\n");
+ goto error;
+ }
+
+ rc = iscsi_init_grp_delete_netmasks(ig, num_initiator_masks,
+ initiator_masks);
+ if (rc < 0) {
+ SPDK_ERRLOG("delete initiator netmask error\n");
+ iscsi_init_grp_add_initiators(ig, num_initiator_names,
+ initiator_names);
+ goto error;
+ }
+
+error:
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return rc;
+}
+
+void
+iscsi_init_grp_destroy(struct spdk_iscsi_init_grp *ig)
+{
+ if (!ig) {
+ return;
+ }
+
+ iscsi_init_grp_delete_all_initiators(ig);
+ iscsi_init_grp_delete_all_netmasks(ig);
+ free(ig);
+}
+
+struct spdk_iscsi_init_grp *
+iscsi_init_grp_find_by_tag(int tag)
+{
+ struct spdk_iscsi_init_grp *ig;
+
+ TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) {
+ if (ig->tag == tag) {
+ return ig;
+ }
+ }
+
+ return NULL;
+}
+
+int
+iscsi_parse_init_grps(void)
+{
+ struct spdk_conf_section *sp;
+ int rc;
+
+ sp = spdk_conf_first_section(NULL);
+ while (sp != NULL) {
+ if (spdk_conf_section_match_prefix(sp, "InitiatorGroup")) {
+ if (spdk_conf_section_get_num(sp) == 0) {
+ SPDK_ERRLOG("Group 0 is invalid\n");
+ return -1;
+ }
+ rc = iscsi_parse_init_grp(sp);
+ if (rc < 0) {
+ SPDK_ERRLOG("parse_init_group() failed\n");
+ return -1;
+ }
+ }
+ sp = spdk_conf_next_section(sp);
+ }
+ return 0;
+}
+
+void
+iscsi_init_grps_destroy(void)
+{
+ struct spdk_iscsi_init_grp *ig, *tmp;
+
+	SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_init_grps_destroy\n");
+ pthread_mutex_lock(&g_iscsi.mutex);
+ TAILQ_FOREACH_SAFE(ig, &g_iscsi.ig_head, tailq, tmp) {
+ TAILQ_REMOVE(&g_iscsi.ig_head, ig, tailq);
+ iscsi_init_grp_destroy(ig);
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+}
+
+struct spdk_iscsi_init_grp *
+iscsi_init_grp_unregister(int tag)
+{
+ struct spdk_iscsi_init_grp *ig;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) {
+ if (ig->tag == tag) {
+ TAILQ_REMOVE(&g_iscsi.ig_head, ig, tailq);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return ig;
+ }
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return NULL;
+}
+
+static const char *initiator_group_section = \
+ "\n"
+ "# Users must change the InitiatorGroup section(s) to match the IP\n"
+ "# addresses and initiator configuration in their environment.\n"
+ "# Netmask can be used to specify a single IP address or a range of IP addresses\n"
+ "# Netmask 192.168.1.20 <== single IP address\n"
+ "# Netmask 192.168.1.0/24 <== IP range 192.168.1.*\n";
+
+#define INITIATOR_GROUP_TMPL \
+"[InitiatorGroup%d]\n" \
+" Comment \"Initiator Group%d\"\n"
+
+#define INITIATOR_TMPL \
+" InitiatorName "
+
+#define NETMASK_TMPL \
+" Netmask "
+
+void
+iscsi_init_grps_config_text(FILE *fp)
+{
+ struct spdk_iscsi_init_grp *ig;
+ struct spdk_iscsi_initiator_name *iname;
+ struct spdk_iscsi_initiator_netmask *imask;
+
+ /* Create initiator group section */
+ fprintf(fp, "%s", initiator_group_section);
+
+ /* Dump initiator groups */
+ TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) {
+ fprintf(fp, INITIATOR_GROUP_TMPL, ig->tag, ig->tag);
+
+ /* Dump initiators */
+ fprintf(fp, INITIATOR_TMPL);
+ TAILQ_FOREACH(iname, &ig->initiator_head, tailq) {
+ fprintf(fp, "%s ", iname->name);
+ }
+ fprintf(fp, "\n");
+
+ /* Dump netmasks */
+ fprintf(fp, NETMASK_TMPL);
+ TAILQ_FOREACH(imask, &ig->netmask_head, tailq) {
+ fprintf(fp, "%s ", imask->mask);
+ }
+ fprintf(fp, "\n");
+ }
+}
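+
+/*
+ * Illustrative, standalone sketch (not part of this file): the config text
+ * above documents that a Netmask entry is either a single IPv4 address or a
+ * CIDR range such as 192.168.1.0/24.  This is a minimal example of that kind
+ * of match using only inet_pton(); it is not SPDK's matching code and the
+ * helper name is made up.  Compile it on its own.
+ */
+#include <arpa/inet.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static bool
+ipv4_in_netmask(const char *mask, const char *addr)
+{
+	char base[32];
+	const char *slash;
+	struct in_addr m, a;
+	uint32_t prefix = 32, bits;
+
+	/* Split "a.b.c.d/len"; a bare address means a /32 match. */
+	slash = strchr(mask, '/');
+	if (slash != NULL) {
+		if ((size_t)(slash - mask) >= sizeof(base)) {
+			return false;
+		}
+		memcpy(base, mask, slash - mask);
+		base[slash - mask] = '\0';
+		prefix = (uint32_t)atoi(slash + 1);
+		if (prefix > 32) {
+			return false;
+		}
+	} else {
+		snprintf(base, sizeof(base), "%s", mask);
+	}
+
+	if (inet_pton(AF_INET, base, &m) != 1 || inet_pton(AF_INET, addr, &a) != 1) {
+		return false;
+	}
+
+	bits = (prefix == 0) ? 0 : htonl(~0U << (32 - prefix));
+	return (m.s_addr & bits) == (a.s_addr & bits);
+}
+
+int
+main(void)
+{
+	printf("%d\n", ipv4_in_netmask("192.168.1.0/24", "192.168.1.55")); /* 1 */
+	printf("%d\n", ipv4_in_netmask("192.168.1.20", "192.168.1.21"));   /* 0 */
+	return 0;
+}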
+
+static void
+iscsi_init_grp_info_json(struct spdk_iscsi_init_grp *ig,
+ struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_initiator_name *iname;
+ struct spdk_iscsi_initiator_netmask *imask;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_int32(w, "tag", ig->tag);
+
+ spdk_json_write_named_array_begin(w, "initiators");
+ TAILQ_FOREACH(iname, &ig->initiator_head, tailq) {
+ spdk_json_write_string(w, iname->name);
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_named_array_begin(w, "netmasks");
+ TAILQ_FOREACH(imask, &ig->netmask_head, tailq) {
+ spdk_json_write_string(w, imask->mask);
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+iscsi_init_grp_config_json(struct spdk_iscsi_init_grp *ig,
+ struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "iscsi_create_initiator_group");
+
+ spdk_json_write_name(w, "params");
+ iscsi_init_grp_info_json(ig, w);
+
+ spdk_json_write_object_end(w);
+}
+
+void
+iscsi_init_grps_info_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_init_grp *ig;
+
+ TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) {
+ iscsi_init_grp_info_json(ig, w);
+ }
+}
+
+void
+iscsi_init_grps_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_init_grp *ig;
+
+ TAILQ_FOREACH(ig, &g_iscsi.ig_head, tailq) {
+ iscsi_init_grp_config_json(ig, w);
+ }
+}
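+
+/*
+ * Illustrative, standalone sketch (not part of this file): for one group,
+ * iscsi_init_grp_config_json() above emits a JSON-RPC entry whose "method" is
+ * "iscsi_create_initiator_group" and whose "params" carry the "tag",
+ * "initiators" and "netmasks" written by iscsi_init_grp_info_json().  This
+ * just prints that shape for a hypothetical group; the sample initiator name
+ * and netmask are made up.  Compile it on its own.
+ */
+#include <stdio.h>
+
+int
+main(void)
+{
+	printf("{\n"
+	       "  \"method\": \"iscsi_create_initiator_group\",\n"
+	       "  \"params\": {\n"
+	       "    \"tag\": 1,\n"
+	       "    \"initiators\": [\"iqn.2016-06.io.spdk:host1\"],\n"
+	       "    \"netmasks\": [\"192.168.1.0/24\"]\n"
+	       "  }\n"
+	       "}\n");
+	return 0;
+}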
diff --git a/src/spdk/lib/iscsi/init_grp.h b/src/spdk/lib/iscsi/init_grp.h
new file mode 100644
index 000000000..8913c98cd
--- /dev/null
+++ b/src/spdk/lib/iscsi/init_grp.h
@@ -0,0 +1,81 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_INIT_GRP_H
+#define SPDK_INIT_GRP_H
+
+#include "spdk/conf.h"
+#include "iscsi/iscsi.h"
+#include "iscsi/conn.h"
+
+struct spdk_iscsi_initiator_name {
+ char name[MAX_INITIATOR_NAME + 1];
+ TAILQ_ENTRY(spdk_iscsi_initiator_name) tailq;
+};
+
+struct spdk_iscsi_initiator_netmask {
+ char mask[MAX_INITIATOR_ADDR + 1];
+ TAILQ_ENTRY(spdk_iscsi_initiator_netmask) tailq;
+};
+
+struct spdk_iscsi_init_grp {
+ int ninitiators;
+ TAILQ_HEAD(, spdk_iscsi_initiator_name) initiator_head;
+ int nnetmasks;
+ TAILQ_HEAD(, spdk_iscsi_initiator_netmask) netmask_head;
+ int ref;
+ int tag;
+ TAILQ_ENTRY(spdk_iscsi_init_grp) tailq;
+};
+
+/* SPDK iSCSI Initiator Group management API */
+int iscsi_init_grp_create_from_initiator_list(int tag,
+ int num_initiator_names, char **initiator_names,
+ int num_initiator_masks, char **initiator_masks);
+int iscsi_init_grp_add_initiators_from_initiator_list(int tag,
+ int num_initiator_names, char **initiator_names,
+ int num_initiator_masks, char **initiator_masks);
+int iscsi_init_grp_delete_initiators_from_initiator_list(int tag,
+ int num_initiator_names, char **initiator_names,
+ int num_initiator_masks, char **initiator_masks);
+int iscsi_init_grp_register(struct spdk_iscsi_init_grp *ig);
+struct spdk_iscsi_init_grp *iscsi_init_grp_unregister(int tag);
+struct spdk_iscsi_init_grp *iscsi_init_grp_find_by_tag(int tag);
+void iscsi_init_grp_destroy(struct spdk_iscsi_init_grp *ig);
+int iscsi_parse_init_grps(void);
+void iscsi_init_grps_destroy(void);
+void iscsi_init_grps_config_text(FILE *fp);
+void iscsi_init_grps_info_json(struct spdk_json_write_ctx *w);
+void iscsi_init_grps_config_json(struct spdk_json_write_ctx *w);
+#endif /* SPDK_INIT_GRP_H */
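+
+/*
+ * Illustrative, standalone sketch (not part of this header): the structures
+ * above keep initiator names and netmasks on BSD-style TAILQ lists.  This toy
+ * program shows the same TAILQ_HEAD/TAILQ_ENTRY pattern of adding, walking
+ * and freeing entries; the types here are made up and it assumes a
+ * <sys/queue.h> that provides the usual TAILQ macros.  Compile it on its own.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/queue.h>
+
+struct name_entry {
+	char name[256];
+	TAILQ_ENTRY(name_entry) tailq;
+};
+
+TAILQ_HEAD(name_list, name_entry);
+
+int
+main(void)
+{
+	struct name_list head = TAILQ_HEAD_INITIALIZER(head);
+	struct name_entry *e;
+	const char *names[] = {"iqn.2016-06.io.spdk:host1", "iqn.2016-06.io.spdk:host2"};
+	size_t i;
+
+	/* Add entries, much like iscsi_init_grp_add_initiators() fills a group. */
+	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+		e = calloc(1, sizeof(*e));
+		if (e == NULL) {
+			return 1;
+		}
+		snprintf(e->name, sizeof(e->name), "%s", names[i]);
+		TAILQ_INSERT_TAIL(&head, e, tailq);
+	}
+
+	/* Walk and free; SPDK itself uses TAILQ_FOREACH_SAFE for this. */
+	while ((e = TAILQ_FIRST(&head)) != NULL) {
+		printf("%s\n", e->name);
+		TAILQ_REMOVE(&head, e, tailq);
+		free(e);
+	}
+	return 0;
+}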
diff --git a/src/spdk/lib/iscsi/iscsi.c b/src/spdk/lib/iscsi/iscsi.c
new file mode 100644
index 000000000..febf4cac4
--- /dev/null
+++ b/src/spdk/lib/iscsi/iscsi.c
@@ -0,0 +1,4797 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/base64.h"
+#include "spdk/crc32.h"
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/trace.h"
+#include "spdk/sock.h"
+#include "spdk/string.h"
+#include "spdk/queue.h"
+#include "spdk/net.h"
+
+#include "iscsi/md5.h"
+#include "iscsi/iscsi.h"
+#include "iscsi/param.h"
+#include "iscsi/tgt_node.h"
+#include "iscsi/task.h"
+#include "iscsi/conn.h"
+#include "spdk/scsi.h"
+#include "spdk/bdev.h"
+#include "iscsi/portal_grp.h"
+
+#include "spdk_internal/log.h"
+
+#define MAX_TMPBUF 1024
+
+#define SPDK_CRC32C_INITIAL 0xffffffffUL
+#define SPDK_CRC32C_XOR 0xffffffffUL
+
+#ifdef __FreeBSD__
+#define HAVE_SRANDOMDEV 1
+#define HAVE_ARC4RANDOM 1
+#endif
+
+struct spdk_iscsi_globals g_iscsi = {
+ .mutex = PTHREAD_MUTEX_INITIALIZER,
+ .portal_head = TAILQ_HEAD_INITIALIZER(g_iscsi.portal_head),
+ .pg_head = TAILQ_HEAD_INITIALIZER(g_iscsi.pg_head),
+ .ig_head = TAILQ_HEAD_INITIALIZER(g_iscsi.ig_head),
+ .target_head = TAILQ_HEAD_INITIALIZER(g_iscsi.target_head),
+ .auth_group_head = TAILQ_HEAD_INITIALIZER(g_iscsi.auth_group_head),
+ .poll_group_head = TAILQ_HEAD_INITIALIZER(g_iscsi.poll_group_head),
+};
+
+#define MATCH_DIGEST_WORD(BUF, CRC32C) \
+ ( ((((uint32_t) *((uint8_t *)(BUF)+0)) << 0) \
+ | (((uint32_t) *((uint8_t *)(BUF)+1)) << 8) \
+ | (((uint32_t) *((uint8_t *)(BUF)+2)) << 16) \
+ | (((uint32_t) *((uint8_t *)(BUF)+3)) << 24)) \
+ == (CRC32C))
+
+#ifndef HAVE_SRANDOMDEV
+static void
+srandomdev(void)
+{
+ unsigned long seed;
+ time_t now;
+ pid_t pid;
+
+ pid = getpid();
+ now = time(NULL);
+ seed = pid ^ now;
+ srandom(seed);
+}
+#endif /* HAVE_SRANDOMDEV */
+
+#ifndef HAVE_ARC4RANDOM
+static int g_arc4random_initialized = 0;
+
+static uint32_t
+arc4random(void)
+{
+ uint32_t r;
+ uint32_t r1, r2;
+
+ if (!g_arc4random_initialized) {
+ srandomdev();
+ g_arc4random_initialized = 1;
+ }
+ r1 = (uint32_t)(random() & 0xffff);
+ r2 = (uint32_t)(random() & 0xffff);
+ r = (r1 << 16) | r2;
+ return r;
+}
+#endif /* HAVE_ARC4RANDOM */
+
+static void
+gen_random(uint8_t *buf, size_t len)
+{
+ uint32_t r;
+ size_t idx;
+
+ for (idx = 0; idx < len; idx++) {
+ r = arc4random();
+ buf[idx] = (uint8_t) r;
+ }
+}
+
+static uint64_t
+iscsi_get_isid(const uint8_t isid[6])
+{
+ return (uint64_t)isid[0] << 40 |
+ (uint64_t)isid[1] << 32 |
+ (uint64_t)isid[2] << 24 |
+ (uint64_t)isid[3] << 16 |
+ (uint64_t)isid[4] << 8 |
+ (uint64_t)isid[5];
+}
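+
+/*
+ * Illustrative, standalone sketch (not part of this file): iscsi_get_isid()
+ * above packs the 6-byte ISID from the login PDU big-endian into a uint64_t;
+ * the result is later printed as a 12-hex-digit value in the initiator port
+ * name.  The ISID bytes below are sample values.  Compile it on its own.
+ */
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+
+static uint64_t
+isid_to_u64(const uint8_t isid[6])
+{
+	/* isid[0] is the most significant byte. */
+	return (uint64_t)isid[0] << 40 | (uint64_t)isid[1] << 32 |
+	       (uint64_t)isid[2] << 24 | (uint64_t)isid[3] << 16 |
+	       (uint64_t)isid[4] << 8 | (uint64_t)isid[5];
+}
+
+int
+main(void)
+{
+	const uint8_t isid[6] = {0x80, 0x12, 0x34, 0x56, 0x78, 0x9a};
+
+	/* Prints 0x80123456789a. */
+	printf("0x%12.12" PRIx64 "\n", isid_to_u64(isid));
+	return 0;
+}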
+
+static int
+bin2hex(char *buf, size_t len, const uint8_t *data, size_t data_len)
+{
+ const char *digits = "0123456789ABCDEF";
+ size_t total = 0;
+ size_t idx;
+
+ if (len < 3) {
+ return -1;
+ }
+ buf[total] = '0';
+ total++;
+ buf[total] = 'x';
+ total++;
+ buf[total] = '\0';
+
+ for (idx = 0; idx < data_len; idx++) {
+ if (total + 3 > len) {
+ buf[total] = '\0';
+			return -1;
+ }
+ buf[total] = digits[(data[idx] >> 4) & 0x0fU];
+ total++;
+ buf[total] = digits[data[idx] & 0x0fU];
+ total++;
+ }
+ buf[total] = '\0';
+ return total;
+}
+
+static int
+hex2bin(uint8_t *data, size_t data_len, const char *str)
+{
+ const char *digits = "0123456789ABCDEF";
+ const char *dp;
+ const char *p;
+ size_t total = 0;
+ int n0, n1;
+
+ p = str;
+	/* Require a "0x" or "0X" prefix. */
+	if (p[0] != '0' || (p[1] != 'x' && p[1] != 'X')) {
+ return -1;
+ }
+ p += 2;
+
+ while (p[0] != '\0' && p[1] != '\0') {
+ if (total >= data_len) {
+ return -1;
+ }
+ dp = strchr(digits, toupper((int) p[0]));
+ if (dp == NULL) {
+ return -1;
+ }
+ n0 = (int)(dp - digits);
+ dp = strchr(digits, toupper((int) p[1]));
+ if (dp == NULL) {
+ return -1;
+ }
+ n1 = (int)(dp - digits);
+
+ data[total] = (uint8_t)(((n0 & 0x0fU) << 4) | (n1 & 0x0fU));
+ total++;
+ p += 2;
+ }
+ return total;
+}
+
+static int
+iscsi_reject(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu,
+ int reason)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_bhs_reject *rsph;
+ uint8_t *data;
+ int total_ahs_len;
+ int data_len;
+ int alloc_len;
+
+ pdu->is_rejected = true;
+
+ total_ahs_len = pdu->bhs.total_ahs_len;
+ data_len = 0;
+ alloc_len = ISCSI_BHS_LEN + (4 * total_ahs_len);
+
+ if (conn->header_digest) {
+ alloc_len += ISCSI_DIGEST_LEN;
+ }
+
+ data = calloc(1, alloc_len);
+ if (!data) {
+ SPDK_ERRLOG("calloc() failed for data segment\n");
+ return -ENOMEM;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Reject PDU reason=%d\n", reason);
+
+ if (conn->sess != NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n",
+ conn->StatSN, conn->sess->ExpCmdSN,
+ conn->sess->MaxCmdSN);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u\n", conn->StatSN);
+ }
+
+ memcpy(data, &pdu->bhs, ISCSI_BHS_LEN);
+ data_len += ISCSI_BHS_LEN;
+
+ if (total_ahs_len != 0) {
+ total_ahs_len = spdk_min((4 * total_ahs_len), ISCSI_AHS_LEN);
+ memcpy(data + data_len, pdu->ahs, total_ahs_len);
+ data_len += total_ahs_len;
+ }
+
+ if (conn->header_digest) {
+ memcpy(data + data_len, pdu->header_digest, ISCSI_DIGEST_LEN);
+ data_len += ISCSI_DIGEST_LEN;
+ }
+
+ rsp_pdu = iscsi_get_pdu(conn);
+ if (rsp_pdu == NULL) {
+ free(data);
+ return -ENOMEM;
+ }
+
+ rsph = (struct iscsi_bhs_reject *)&rsp_pdu->bhs;
+ rsp_pdu->data = data;
+ rsph->opcode = ISCSI_OP_REJECT;
+ rsph->flags |= 0x80; /* bit 0 is default to 1 */
+ rsph->reason = reason;
+ DSET24(rsph->data_segment_len, data_len);
+
+ rsph->ffffffff = 0xffffffffU;
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+
+ if (conn->sess != NULL) {
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+ } else {
+ to_be32(&rsph->exp_cmd_sn, 1);
+ to_be32(&rsph->max_cmd_sn, 1);
+ }
+
+ SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (void *)&rsp_pdu->bhs, ISCSI_BHS_LEN);
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL);
+
+ return 0;
+}
+
+uint32_t
+iscsi_pdu_calc_header_digest(struct spdk_iscsi_pdu *pdu)
+{
+ uint32_t crc32c;
+ uint32_t ahs_len_bytes = pdu->bhs.total_ahs_len * 4;
+
+ crc32c = SPDK_CRC32C_INITIAL;
+ crc32c = spdk_crc32c_update(&pdu->bhs, ISCSI_BHS_LEN, crc32c);
+
+ if (ahs_len_bytes) {
+ crc32c = spdk_crc32c_update(pdu->ahs, ahs_len_bytes, crc32c);
+ }
+
+ /* BHS and AHS are always 4-byte multiples in length, so no padding is necessary. */
+ crc32c = crc32c ^ SPDK_CRC32C_XOR;
+ return crc32c;
+}
+
+uint32_t
+iscsi_pdu_calc_data_digest(struct spdk_iscsi_pdu *pdu)
+{
+ uint32_t data_len = DGET24(pdu->bhs.data_segment_len);
+ uint32_t crc32c;
+ uint32_t mod;
+ struct iovec iov;
+ uint32_t num_blocks;
+
+ crc32c = SPDK_CRC32C_INITIAL;
+ if (spdk_likely(!pdu->dif_insert_or_strip)) {
+ crc32c = spdk_crc32c_update(pdu->data, data_len, crc32c);
+ } else {
+ iov.iov_base = pdu->data_buf;
+ iov.iov_len = pdu->data_buf_len;
+ num_blocks = pdu->data_buf_len / pdu->dif_ctx.block_size;
+
+ spdk_dif_update_crc32c(&iov, 1, num_blocks, &crc32c, &pdu->dif_ctx);
+ }
+
+ mod = data_len % ISCSI_ALIGNMENT;
+ if (mod != 0) {
+ uint32_t pad_length = ISCSI_ALIGNMENT - mod;
+ uint8_t pad[3] = {0, 0, 0};
+
+ assert(pad_length > 0);
+ assert(pad_length <= sizeof(pad));
+ crc32c = spdk_crc32c_update(pad, pad_length, crc32c);
+ }
+
+ crc32c = crc32c ^ SPDK_CRC32C_XOR;
+ return crc32c;
+}
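+
+/*
+ * Illustrative, standalone sketch (not part of this file): the data digest
+ * above is computed over the data segment plus zero padding up to the next
+ * 4-byte boundary (4 is assumed here to be the value of ISCSI_ALIGNMENT).
+ * This shows the padded-length arithmetic for a few sample segment lengths.
+ * Compile it on its own.
+ */
+#include <stdint.h>
+#include <stdio.h>
+
+#define ALIGNMENT 4u
+
+int
+main(void)
+{
+	const uint32_t lengths[] = {0, 1, 4, 509, 8192};
+	size_t i;
+
+	for (i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++) {
+		uint32_t data_len = lengths[i];
+		uint32_t mod = data_len % ALIGNMENT;
+		uint32_t pad = (mod != 0) ? ALIGNMENT - mod : 0;
+
+		/* e.g. 509 -> pad 3, so the CRC32C covers 512 bytes. */
+		printf("data_len=%u pad=%u digest covers %u bytes\n",
+		       data_len, pad, data_len + pad);
+	}
+	return 0;
+}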
+
+static int
+iscsi_conn_read_data_segment(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *pdu,
+ uint32_t segment_len)
+{
+ struct iovec buf_iov, iovs[32];
+ int rc, _rc;
+
+ if (spdk_likely(!pdu->dif_insert_or_strip)) {
+ return iscsi_conn_read_data(conn,
+ segment_len - pdu->data_valid_bytes,
+ pdu->data_buf + pdu->data_valid_bytes);
+ } else {
+ buf_iov.iov_base = pdu->data_buf;
+ buf_iov.iov_len = pdu->data_buf_len;
+ rc = spdk_dif_set_md_interleave_iovs(iovs, 32, &buf_iov, 1,
+ pdu->data_valid_bytes,
+ segment_len - pdu->data_valid_bytes, NULL,
+ &pdu->dif_ctx);
+ if (rc > 0) {
+ rc = iscsi_conn_readv_data(conn, iovs, rc);
+ if (rc > 0) {
+ _rc = spdk_dif_generate_stream(&buf_iov, 1,
+ pdu->data_valid_bytes, rc,
+ &pdu->dif_ctx);
+ if (_rc != 0) {
+ SPDK_ERRLOG("DIF generate failed\n");
+ rc = _rc;
+ }
+ }
+ } else {
+ SPDK_ERRLOG("Setup iovs for interleaved metadata failed\n");
+ }
+ return rc;
+ }
+}
+
+struct _iscsi_sgl {
+ struct iovec *iov;
+ int iovcnt;
+ uint32_t iov_offset;
+ uint32_t total_size;
+};
+
+static inline void
+_iscsi_sgl_init(struct _iscsi_sgl *s, struct iovec *iovs, int iovcnt,
+ uint32_t iov_offset)
+{
+ s->iov = iovs;
+ s->iovcnt = iovcnt;
+ s->iov_offset = iov_offset;
+ s->total_size = 0;
+}
+
+static inline bool
+_iscsi_sgl_append(struct _iscsi_sgl *s, uint8_t *data, uint32_t data_len)
+{
+ if (s->iov_offset >= data_len) {
+ s->iov_offset -= data_len;
+ } else {
+ assert(s->iovcnt > 0);
+ s->iov->iov_base = data + s->iov_offset;
+ s->iov->iov_len = data_len - s->iov_offset;
+ s->total_size += data_len - s->iov_offset;
+ s->iov_offset = 0;
+ s->iov++;
+ s->iovcnt--;
+ if (s->iovcnt == 0) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Build iovec array to leave metadata space for every data block
+ * when reading data segment from socket.
+ */
+static inline bool
+_iscsi_sgl_append_with_md(struct _iscsi_sgl *s,
+ void *buf, uint32_t buf_len, uint32_t data_len,
+ struct spdk_dif_ctx *dif_ctx)
+{
+ int rc;
+ uint32_t total_size = 0;
+ struct iovec buf_iov;
+
+ if (s->iov_offset >= data_len) {
+ s->iov_offset -= data_len;
+ } else {
+ buf_iov.iov_base = buf;
+ buf_iov.iov_len = buf_len;
+ rc = spdk_dif_set_md_interleave_iovs(s->iov, s->iovcnt, &buf_iov, 1,
+ s->iov_offset, data_len - s->iov_offset,
+ &total_size, dif_ctx);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to setup iovs for DIF strip\n");
+ return false;
+ }
+
+ s->total_size += total_size;
+ s->iov_offset = 0;
+ assert(s->iovcnt >= rc);
+ s->iovcnt -= rc;
+ s->iov += rc;
+
+ if (s->iovcnt == 0) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int
+iscsi_build_iovs(struct spdk_iscsi_conn *conn, struct iovec *iovs, int iovcnt,
+ struct spdk_iscsi_pdu *pdu, uint32_t *_mapped_length)
+{
+ struct _iscsi_sgl sgl;
+ int enable_digest;
+ uint32_t total_ahs_len;
+ uint32_t data_len;
+
+ if (iovcnt == 0) {
+ return 0;
+ }
+
+ total_ahs_len = pdu->bhs.total_ahs_len;
+ data_len = DGET24(pdu->bhs.data_segment_len);
+ data_len = ISCSI_ALIGN(data_len);
+
+ enable_digest = 1;
+ if (pdu->bhs.opcode == ISCSI_OP_LOGIN_RSP) {
+ /* this PDU should be sent without digest */
+ enable_digest = 0;
+ }
+
+ _iscsi_sgl_init(&sgl, iovs, iovcnt, pdu->writev_offset);
+
+ /* BHS */
+ if (!_iscsi_sgl_append(&sgl, (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN)) {
+ goto end;
+ }
+ /* AHS */
+ if (total_ahs_len > 0) {
+ if (!_iscsi_sgl_append(&sgl, pdu->ahs, 4 * total_ahs_len)) {
+ goto end;
+ }
+ }
+
+ /* Header Digest */
+ if (enable_digest && conn->header_digest) {
+ if (!_iscsi_sgl_append(&sgl, pdu->header_digest, ISCSI_DIGEST_LEN)) {
+ goto end;
+ }
+ }
+
+ /* Data Segment */
+ if (data_len > 0) {
+ if (!pdu->dif_insert_or_strip) {
+ if (!_iscsi_sgl_append(&sgl, pdu->data, data_len)) {
+ goto end;
+ }
+ } else {
+ if (!_iscsi_sgl_append_with_md(&sgl, pdu->data, pdu->data_buf_len,
+ data_len, &pdu->dif_ctx)) {
+ goto end;
+ }
+ }
+ }
+
+ /* Data Digest */
+ if (enable_digest && conn->data_digest && data_len != 0) {
+ _iscsi_sgl_append(&sgl, pdu->data_digest, ISCSI_DIGEST_LEN);
+ }
+
+end:
+ if (_mapped_length != NULL) {
+ *_mapped_length = sgl.total_size;
+ }
+
+ return iovcnt - sgl.iovcnt;
+}
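+
+/*
+ * Illustrative, standalone sketch (not part of this file): iscsi_build_iovs()
+ * above resumes a partially written PDU by skipping pdu->writev_offset bytes
+ * across the BHS/AHS/digest/data regions before filling iovecs.  This is the
+ * same skip-then-append idea over two plain buffers; the struct and helper
+ * names are made up.  Compile it on its own.
+ */
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/uio.h>
+
+struct sgl {
+	struct iovec *iov;
+	int iovcnt;
+	uint32_t offset;     /* bytes already written, still to be skipped */
+	uint32_t total_size; /* bytes mapped into iovecs */
+};
+
+static int
+sgl_append(struct sgl *s, uint8_t *data, uint32_t len)
+{
+	if (s->offset >= len) {
+		/* This region was already sent in full; just consume the offset. */
+		s->offset -= len;
+		return 1;
+	}
+	s->iov->iov_base = data + s->offset;
+	s->iov->iov_len = len - s->offset;
+	s->total_size += len - s->offset;
+	s->offset = 0;
+	s->iov++;
+	return --s->iovcnt > 0;
+}
+
+int
+main(void)
+{
+	uint8_t header[48] = {0}, payload[512] = {0};
+	struct iovec iovs[4];
+	struct sgl s = {iovs, 4, 60, 0}; /* pretend 60 bytes were already sent */
+
+	sgl_append(&s, header, sizeof(header));   /* fully skipped */
+	sgl_append(&s, payload, sizeof(payload)); /* starts 12 bytes in */
+
+	/* Prints: mapped 500 bytes in 1 iovec(s) */
+	printf("mapped %u bytes in %d iovec(s)\n", s.total_size, 4 - s.iovcnt);
+	return 0;
+}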
+
+void
+iscsi_free_sess(struct spdk_iscsi_sess *sess)
+{
+ if (sess == NULL) {
+ return;
+ }
+
+ sess->tag = 0;
+ sess->target = NULL;
+ sess->session_type = SESSION_TYPE_INVALID;
+ iscsi_param_free(sess->params);
+ free(sess->conns);
+ spdk_scsi_port_free(&sess->initiator_port);
+ spdk_mempool_put(g_iscsi.session_pool, (void *)sess);
+}
+
+static int
+create_iscsi_sess(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_tgt_node *target,
+ enum session_type session_type)
+{
+ struct spdk_iscsi_sess *sess;
+ int rc;
+
+ sess = spdk_mempool_get(g_iscsi.session_pool);
+ if (!sess) {
+ SPDK_ERRLOG("Unable to get session object\n");
+ SPDK_ERRLOG("MaxSessions set to %d\n", g_iscsi.MaxSessions);
+ return -ENOMEM;
+ }
+
+ /* configuration values */
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ sess->MaxConnections = g_iscsi.MaxConnectionsPerSession;
+ sess->MaxOutstandingR2T = DEFAULT_MAXOUTSTANDINGR2T;
+
+ sess->DefaultTime2Wait = g_iscsi.DefaultTime2Wait;
+ sess->DefaultTime2Retain = g_iscsi.DefaultTime2Retain;
+ sess->FirstBurstLength = g_iscsi.FirstBurstLength;
+ sess->MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH;
+ sess->InitialR2T = DEFAULT_INITIALR2T;
+ sess->ImmediateData = g_iscsi.ImmediateData;
+ sess->DataPDUInOrder = DEFAULT_DATAPDUINORDER;
+ sess->DataSequenceInOrder = DEFAULT_DATASEQUENCEINORDER;
+ sess->ErrorRecoveryLevel = g_iscsi.ErrorRecoveryLevel;
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ sess->tag = conn->pg_tag;
+
+ sess->conns = calloc(sess->MaxConnections, sizeof(*sess->conns));
+ if (!sess->conns) {
+ SPDK_ERRLOG("calloc() failed for connection array\n");
+ return -ENOMEM;
+ }
+
+ sess->connections = 0;
+
+ sess->conns[sess->connections] = conn;
+ sess->connections++;
+
+ sess->params = NULL;
+ sess->target = target;
+ sess->isid = 0;
+ sess->session_type = session_type;
+ sess->current_text_itt = 0xffffffffU;
+
+ /* set default params */
+ rc = iscsi_sess_params_init(&sess->params);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_sess_params_init() failed\n");
+ goto error_return;
+ }
+ /* replace with config value */
+ rc = iscsi_param_set_int(sess->params, "MaxConnections",
+ sess->MaxConnections);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set_int(sess->params, "MaxOutstandingR2T",
+ sess->MaxOutstandingR2T);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set_int(sess->params, "DefaultTime2Wait",
+ sess->DefaultTime2Wait);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set_int(sess->params, "DefaultTime2Retain",
+ sess->DefaultTime2Retain);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set_int(sess->params, "FirstBurstLength",
+ sess->FirstBurstLength);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set_int(sess->params, "MaxBurstLength",
+ sess->MaxBurstLength);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set(sess->params, "InitialR2T",
+ sess->InitialR2T ? "Yes" : "No");
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set(sess->params, "ImmediateData",
+ sess->ImmediateData ? "Yes" : "No");
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set(sess->params, "DataPDUInOrder",
+ sess->DataPDUInOrder ? "Yes" : "No");
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set(sess->params, "DataSequenceInOrder",
+ sess->DataSequenceInOrder ? "Yes" : "No");
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set() failed\n");
+ goto error_return;
+ }
+
+ rc = iscsi_param_set_int(sess->params, "ErrorRecoveryLevel",
+ sess->ErrorRecoveryLevel);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ goto error_return;
+ }
+
+ /* realloc buffer */
+ rc = iscsi_param_set_int(conn->params, "MaxRecvDataSegmentLength",
+ conn->MaxRecvDataSegmentLength);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ goto error_return;
+ }
+
+ /* sess for first connection of session */
+ conn->sess = sess;
+ return 0;
+
+error_return:
+ iscsi_free_sess(sess);
+ conn->sess = NULL;
+ return -1;
+}
+
+static struct spdk_iscsi_sess *
+get_iscsi_sess_by_tsih(uint16_t tsih)
+{
+ struct spdk_iscsi_sess *session;
+
+ if (tsih == 0 || tsih > g_iscsi.MaxSessions) {
+ return NULL;
+ }
+
+ session = g_iscsi.session[tsih - 1];
+ assert(tsih == session->tsih);
+
+ return session;
+}
+
+static uint8_t
+append_iscsi_sess(struct spdk_iscsi_conn *conn,
+ const char *initiator_port_name, uint16_t tsih, uint16_t cid)
+{
+ struct spdk_iscsi_sess *sess;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "append session: init port name=%s, tsih=%u, cid=%u\n",
+ initiator_port_name, tsih, cid);
+
+ sess = get_iscsi_sess_by_tsih(tsih);
+ if (sess == NULL) {
+		SPDK_ERRLOG("get_iscsi_sess_by_tsih() failed\n");
+ return ISCSI_LOGIN_CONN_ADD_FAIL;
+ }
+ if ((conn->pg_tag != sess->tag) ||
+ (strcasecmp(initiator_port_name, spdk_scsi_port_get_name(sess->initiator_port)) != 0) ||
+ (conn->target != sess->target)) {
+ /* no match */
+ SPDK_ERRLOG("no MCS session for init port name=%s, tsih=%d, cid=%d\n",
+ initiator_port_name, tsih, cid);
+ return ISCSI_LOGIN_CONN_ADD_FAIL;
+ }
+
+ if (sess->connections >= sess->MaxConnections) {
+ /* no slot for connection */
+ SPDK_ERRLOG("too many connections for init port name=%s, tsih=%d, cid=%d\n",
+ initiator_port_name, tsih, cid);
+ return ISCSI_LOGIN_TOO_MANY_CONNECTIONS;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connections (tsih %d): %d\n", sess->tsih, sess->connections);
+ conn->sess = sess;
+
+ /*
+ * TODO: need a mutex or other sync mechanism to protect the session's
+ * connection list.
+ */
+ sess->conns[sess->connections] = conn;
+ sess->connections++;
+
+ return 0;
+}
+
+static int
+iscsi_append_text(struct spdk_iscsi_conn *conn __attribute__((__unused__)),
+ const char *key, const char *val, uint8_t *data,
+ int alloc_len, int data_len)
+{
+ int total;
+ int len;
+
+ total = data_len;
+ if (alloc_len < 1) {
+ return 0;
+ }
+ if (total > alloc_len) {
+ total = alloc_len;
+ data[total - 1] = '\0';
+ return total;
+ }
+
+ if (alloc_len - total < 1) {
+		SPDK_ERRLOG("data space too small (%d)\n", alloc_len);
+ return total;
+ }
+ len = snprintf((char *) data + total, alloc_len - total, "%s=%s", key, val);
+ total += len + 1;
+
+ return total;
+}
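+
+/*
+ * Illustrative, standalone sketch (not part of this file): login and text
+ * negotiation data segments are a sequence of "key=value" strings, each
+ * followed by a NUL byte, which is what iscsi_append_text() above builds
+ * (the "+ 1" after snprintf() keeps each pair's terminating NUL).  The helper
+ * below is a simplified stand-in without the truncation handling.  Compile it
+ * on its own.
+ */
+#include <stdio.h>
+#include <string.h>
+
+static int
+append_kv(char *buf, int alloc_len, int total, const char *key, const char *val)
+{
+	int len;
+
+	if (alloc_len - total < 1) {
+		return total;
+	}
+	len = snprintf(buf + total, alloc_len - total, "%s=%s", key, val);
+	return total + len + 1; /* +1 keeps the NUL separating the pairs */
+}
+
+int
+main(void)
+{
+	char data[256];
+	int total = 0, off = 0;
+
+	total = append_kv(data, (int)sizeof(data), total, "HeaderDigest", "None");
+	total = append_kv(data, (int)sizeof(data), total, "MaxRecvDataSegmentLength", "8192");
+
+	/* Walk the NUL-separated pairs the way a peer would parse them. */
+	while (off < total) {
+		printf("%s\n", data + off);
+		off += (int)strlen(data + off) + 1;
+	}
+	return 0;
+}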
+
+static int
+iscsi_append_param(struct spdk_iscsi_conn *conn, const char *key,
+ uint8_t *data, int alloc_len, int data_len)
+{
+ struct iscsi_param *param;
+ int rc;
+
+ param = iscsi_param_find(conn->params, key);
+ if (param == NULL) {
+ param = iscsi_param_find(conn->sess->params, key);
+ if (param == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "no key %.64s\n", key);
+ return data_len;
+ }
+ }
+ rc = iscsi_append_text(conn, param->key, param->val, data,
+ alloc_len, data_len);
+ return rc;
+}
+
+static int
+iscsi_auth_params(struct spdk_iscsi_conn *conn,
+ struct iscsi_param *params, const char *method, uint8_t *data,
+ int alloc_len, int data_len)
+{
+ char *in_val;
+ char *in_next;
+ char *new_val;
+ const char *algorithm;
+ const char *name;
+ const char *response;
+ const char *identifier;
+ const char *challenge;
+ int total;
+ int rc;
+
+ if (conn == NULL || params == NULL || method == NULL) {
+ return -1;
+ }
+ if (strcasecmp(method, "CHAP") == 0) {
+ /* method OK */
+ } else {
+ SPDK_ERRLOG("unsupported AuthMethod %.64s\n", method);
+ return -1;
+ }
+
+ total = data_len;
+ if (alloc_len < 1) {
+ return 0;
+ }
+ if (total > alloc_len) {
+ total = alloc_len;
+ data[total - 1] = '\0';
+ return total;
+ }
+
+ /* for temporary store */
+ in_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1);
+ if (!in_val) {
+ SPDK_ERRLOG("malloc() failed for temporary store\n");
+ return -ENOMEM;
+ }
+
+ /* CHAP method (RFC1994) */
+ if ((algorithm = iscsi_param_get_val(params, "CHAP_A")) != NULL) {
+ if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_WAIT_A) {
+ SPDK_ERRLOG("CHAP sequence error\n");
+ goto error_return;
+ }
+
+ /* CHAP_A is LIST type */
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", algorithm);
+ in_next = in_val;
+ while ((new_val = spdk_strsepq(&in_next, ",")) != NULL) {
+ if (strcasecmp(new_val, "5") == 0) {
+ /* CHAP with MD5 */
+ break;
+ }
+ }
+ if (new_val == NULL) {
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject");
+ new_val = in_val;
+ iscsi_append_text(conn, "CHAP_A", new_val,
+ data, alloc_len, total);
+ goto error_return;
+ }
+ /* selected algorithm is 5 (MD5) */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_A=%s\n", new_val);
+ total = iscsi_append_text(conn, "CHAP_A", new_val,
+ data, alloc_len, total);
+
+ /* Identifier is one octet */
+ gen_random(conn->auth.chap_id, 1);
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d",
+ (int) conn->auth.chap_id[0]);
+ total = iscsi_append_text(conn, "CHAP_I", in_val,
+ data, alloc_len, total);
+
+ /* Challenge Value is a variable stream of octets */
+ /* (binary length MUST not exceed 1024 bytes) */
+ conn->auth.chap_challenge_len = ISCSI_CHAP_CHALLENGE_LEN;
+ gen_random(conn->auth.chap_challenge, conn->auth.chap_challenge_len);
+ bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN,
+ conn->auth.chap_challenge, conn->auth.chap_challenge_len);
+ total = iscsi_append_text(conn, "CHAP_C", in_val,
+ data, alloc_len, total);
+
+ conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_NR;
+ } else if ((name = iscsi_param_get_val(params, "CHAP_N")) != NULL) {
+ uint8_t resmd5[SPDK_MD5DIGEST_LEN];
+ uint8_t tgtmd5[SPDK_MD5DIGEST_LEN];
+ struct spdk_md5ctx md5ctx;
+ size_t decoded_len = 0;
+
+ if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_WAIT_NR) {
+ SPDK_ERRLOG("CHAP sequence error\n");
+ goto error_return;
+ }
+
+ response = iscsi_param_get_val(params, "CHAP_R");
+ if (response == NULL) {
+ SPDK_ERRLOG("no response\n");
+ goto error_return;
+ }
+ if (response[0] == '0' &&
+ (response[1] == 'x' || response[1] == 'X')) {
+ rc = hex2bin(resmd5, SPDK_MD5DIGEST_LEN, response);
+ if (rc < 0 || rc != SPDK_MD5DIGEST_LEN) {
+ SPDK_ERRLOG("response format error\n");
+ goto error_return;
+ }
+ } else if (response[0] == '0' &&
+ (response[1] == 'b' || response[1] == 'B')) {
+ response += 2;
+ rc = spdk_base64_decode(resmd5, &decoded_len, response);
+ if (rc < 0 || decoded_len != SPDK_MD5DIGEST_LEN) {
+ SPDK_ERRLOG("response format error\n");
+ goto error_return;
+ }
+ } else {
+ SPDK_ERRLOG("response format error\n");
+ goto error_return;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_N/CHAP_R\n");
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ag_tag=%d\n", conn->chap_group);
+
+ rc = iscsi_chap_get_authinfo(&conn->auth, name, conn->chap_group);
+ if (rc < 0) {
+ /* SPDK_ERRLOG("auth user or secret is missing\n"); */
+ SPDK_ERRLOG("iscsi_chap_get_authinfo() failed\n");
+ goto error_return;
+ }
+ if (conn->auth.user[0] == '\0' || conn->auth.secret[0] == '\0') {
+ /* SPDK_ERRLOG("auth user or secret is missing\n"); */
+ SPDK_ERRLOG("auth failed (name %.64s)\n", name);
+ goto error_return;
+ }
+
+ md5init(&md5ctx);
+ /* Identifier */
+ md5update(&md5ctx, conn->auth.chap_id, 1);
+ /* followed by secret */
+ md5update(&md5ctx, conn->auth.secret,
+ strlen(conn->auth.secret));
+ /* followed by Challenge Value */
+ md5update(&md5ctx, conn->auth.chap_challenge,
+ conn->auth.chap_challenge_len);
+ /* tgtmd5 is expecting Response Value */
+ md5final(tgtmd5, &md5ctx);
+
+ bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, tgtmd5, SPDK_MD5DIGEST_LEN);
+
+#if 0
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "tgtmd5=%s, resmd5=%s\n", in_val, response);
+ spdk_dump("tgtmd5", tgtmd5, SPDK_MD5DIGEST_LEN);
+ spdk_dump("resmd5", resmd5, SPDK_MD5DIGEST_LEN);
+#endif
+
+ /* compare MD5 digest */
+ if (memcmp(tgtmd5, resmd5, SPDK_MD5DIGEST_LEN) != 0) {
+ /* not match */
+ /* SPDK_ERRLOG("auth user or secret is missing\n"); */
+ SPDK_ERRLOG("auth failed (name %.64s)\n", name);
+ goto error_return;
+ }
+ /* OK initiator's secret */
+ conn->authenticated = true;
+
+ /* mutual CHAP? */
+ identifier = iscsi_param_get_val(params, "CHAP_I");
+ if (identifier != NULL) {
+ conn->auth.chap_mid[0] = (uint8_t) strtol(identifier, NULL, 10);
+ challenge = iscsi_param_get_val(params, "CHAP_C");
+ if (challenge == NULL) {
+ SPDK_ERRLOG("CHAP sequence error\n");
+ goto error_return;
+ }
+ if (challenge[0] == '0' &&
+ (challenge[1] == 'x' || challenge[1] == 'X')) {
+ rc = hex2bin(conn->auth.chap_mchallenge,
+ ISCSI_CHAP_CHALLENGE_LEN, challenge);
+ if (rc < 0) {
+ SPDK_ERRLOG("challenge format error\n");
+ goto error_return;
+ }
+ conn->auth.chap_mchallenge_len = rc;
+ } else if (challenge[0] == '0' &&
+ (challenge[1] == 'b' || challenge[1] == 'B')) {
+ challenge += 2;
+ rc = spdk_base64_decode(conn->auth.chap_mchallenge,
+ &decoded_len, challenge);
+ if (rc < 0) {
+ SPDK_ERRLOG("challenge format error\n");
+ goto error_return;
+ }
+ conn->auth.chap_mchallenge_len = decoded_len;
+ } else {
+ SPDK_ERRLOG("challenge format error\n");
+ goto error_return;
+ }
+#if 0
+ spdk_dump("MChallenge", conn->auth.chap_mchallenge,
+ conn->auth.chap_mchallenge_len);
+#endif
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_I/CHAP_C\n");
+
+ if (conn->auth.muser[0] == '\0' || conn->auth.msecret[0] == '\0') {
+ /* SPDK_ERRLOG("mutual auth user or secret is missing\n"); */
+ SPDK_ERRLOG("auth failed (name %.64s)\n", name);
+ goto error_return;
+ }
+
+ md5init(&md5ctx);
+ /* Identifier */
+ md5update(&md5ctx, conn->auth.chap_mid, 1);
+ /* followed by secret */
+ md5update(&md5ctx, conn->auth.msecret,
+ strlen(conn->auth.msecret));
+ /* followed by Challenge Value */
+ md5update(&md5ctx, conn->auth.chap_mchallenge,
+ conn->auth.chap_mchallenge_len);
+ /* tgtmd5 is Response Value */
+ md5final(tgtmd5, &md5ctx);
+
+ bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, tgtmd5, SPDK_MD5DIGEST_LEN);
+
+ total = iscsi_append_text(conn, "CHAP_N",
+ conn->auth.muser, data, alloc_len, total);
+ total = iscsi_append_text(conn, "CHAP_R",
+ in_val, data, alloc_len, total);
+ } else {
+ /* not mutual */
+ if (conn->mutual_chap) {
+				SPDK_ERRLOG("mutual CHAP required\n");
+ goto error_return;
+ }
+ }
+
+ conn->auth.chap_phase = ISCSI_CHAP_PHASE_END;
+ } else {
+ /* not found CHAP keys */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "start CHAP\n");
+ conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A;
+ }
+
+ free(in_val);
+ return total;
+
+error_return:
+ conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A;
+ free(in_val);
+ return -1;
+}
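+
+/*
+ * Illustrative, standalone sketch (not part of this file): the CHAP_R value
+ * verified above is, per RFC 1994, MD5(Identifier || Secret || Challenge).
+ * This computes that digest with OpenSSL's MD5 routines as a stand-in for the
+ * md5init/md5update/md5final helpers from iscsi/md5.h (assumption: OpenSSL is
+ * installed; link with -lcrypto).  The identifier, secret and challenge below
+ * are sample values.  Compile it on its own.
+ */
+#include <openssl/md5.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+int
+main(void)
+{
+	uint8_t id = 0x01;
+	const char *secret = "example-chap-secret";
+	uint8_t challenge[16] = {0};
+	uint8_t digest[MD5_DIGEST_LENGTH];
+	MD5_CTX ctx;
+	int i;
+
+	MD5_Init(&ctx);
+	MD5_Update(&ctx, &id, 1);                       /* Identifier */
+	MD5_Update(&ctx, secret, strlen(secret));       /* followed by the secret */
+	MD5_Update(&ctx, challenge, sizeof(challenge)); /* followed by the challenge */
+	MD5_Final(digest, &ctx);
+
+	/* Print in the "0x..." hex form the code above sends back as CHAP_R. */
+	printf("CHAP_R=0x");
+	for (i = 0; i < MD5_DIGEST_LENGTH; i++) {
+		printf("%02X", digest[i]);
+	}
+	printf("\n");
+	return 0;
+}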
+
+static int
+iscsi_check_values(struct spdk_iscsi_conn *conn)
+{
+ if (conn->sess->FirstBurstLength > conn->sess->MaxBurstLength) {
+ SPDK_ERRLOG("FirstBurstLength(%d) > MaxBurstLength(%d)\n",
+ conn->sess->FirstBurstLength,
+ conn->sess->MaxBurstLength);
+ return -1;
+ }
+ if (conn->sess->FirstBurstLength > g_iscsi.FirstBurstLength) {
+ SPDK_ERRLOG("FirstBurstLength(%d) > iSCSI target restriction(%d)\n",
+ conn->sess->FirstBurstLength, g_iscsi.FirstBurstLength);
+ return -1;
+ }
+ if (conn->sess->MaxBurstLength > 0x00ffffff) {
+ SPDK_ERRLOG("MaxBurstLength(%d) > 0x00ffffff\n",
+ conn->sess->MaxBurstLength);
+ return -1;
+ }
+
+ if (conn->MaxRecvDataSegmentLength < 512) {
+ SPDK_ERRLOG("MaxRecvDataSegmentLength(%d) < 512\n",
+ conn->MaxRecvDataSegmentLength);
+ return -1;
+ }
+ if (conn->MaxRecvDataSegmentLength > 0x00ffffff) {
+ SPDK_ERRLOG("MaxRecvDataSegmentLength(%d) > 0x00ffffff\n",
+ conn->MaxRecvDataSegmentLength);
+ return -1;
+ }
+ return 0;
+}
+
+static int
+iscsi_conn_params_update(struct spdk_iscsi_conn *conn)
+{
+ int rc;
+ uint32_t recv_buf_size;
+
+ /* update internal variables */
+ rc = iscsi_copy_param2var(conn);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_copy_param2var() failed\n");
+ if (conn->state < ISCSI_CONN_STATE_EXITING) {
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ }
+ return rc;
+ }
+
+ /* check value */
+ rc = iscsi_check_values(conn);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_check_values() failed\n");
+ if (conn->state < ISCSI_CONN_STATE_EXITING) {
+ conn->state = ISCSI_CONN_STATE_EXITING;
+ }
+ }
+
+ /* The socket receive buffer may need to be adjusted based on the new parameters */
+
+ /* Don't allow the recv buffer to be 0 or very large. */
+ recv_buf_size = spdk_max(0x1000, spdk_min(0x2000, conn->sess->FirstBurstLength));
+
+ /* Add in extra space for the PDU */
+ recv_buf_size += ISCSI_BHS_LEN + ISCSI_AHS_LEN;
+
+ if (conn->header_digest) {
+ recv_buf_size += ISCSI_DIGEST_LEN;
+ }
+
+ if (conn->data_digest) {
+ recv_buf_size += ISCSI_DIGEST_LEN;
+ }
+
+ /* Set up to buffer up to 4 commands with immediate data at once */
+ if (spdk_sock_set_recvbuf(conn->sock, recv_buf_size * 4) < 0) {
+ /* Not fatal. */
+ }
+
+ return rc;
+}
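+
+/*
+ * Illustrative, standalone sketch (not part of this file): a worked version
+ * of the receive-buffer sizing above.  FirstBurstLength is clamped into
+ * [0x1000, 0x2000], header/AHS/digest overhead is added, and space for four
+ * such PDUs is requested from the socket.  The *_LEN values below are
+ * assumptions for the sketch (the BHS is 48 bytes and a digest 4 bytes per
+ * the iSCSI spec; the AHS allowance is a placeholder, see SPDK's iscsi.h for
+ * the real constants).  Compile it on its own.
+ */
+#include <stdint.h>
+#include <stdio.h>
+
+#define BHS_LEN    48u
+#define AHS_LEN    256u /* placeholder */
+#define DIGEST_LEN 4u
+
+static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }
+static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }
+
+int
+main(void)
+{
+	uint32_t first_burst_length = 8192; /* sample negotiated value */
+	int header_digest = 1, data_digest = 0;
+	uint32_t recv_buf_size;
+
+	recv_buf_size = max_u32(0x1000, min_u32(0x2000, first_burst_length));
+	recv_buf_size += BHS_LEN + AHS_LEN;
+	if (header_digest) {
+		recv_buf_size += DIGEST_LEN;
+	}
+	if (data_digest) {
+		recv_buf_size += DIGEST_LEN;
+	}
+
+	/* Buffer up to 4 commands with immediate data at once. */
+	printf("per-PDU %u bytes, socket recv buffer %u bytes\n",
+	       recv_buf_size, recv_buf_size * 4);
+	return 0;
+}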
+
+static void
+iscsi_conn_login_pdu_err_complete(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ if (conn->full_feature) {
+ iscsi_conn_params_update(conn);
+ }
+}
+
+static void
+iscsi_conn_login_pdu_success_complete(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ if (conn->state >= ISCSI_CONN_STATE_EXITING) {
+ /* Connection is being exited before this callback is executed. */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connection is already exited.\n");
+ return;
+ }
+ if (conn->full_feature) {
+ if (iscsi_conn_params_update(conn) != 0) {
+ return;
+ }
+ }
+ conn->state = ISCSI_CONN_STATE_RUNNING;
+ if (conn->full_feature != 0) {
+ iscsi_conn_schedule(conn);
+ }
+}
+
+/*
+ * The response function for the iSCSI login operation.
+ */
+static void
+iscsi_op_login_response(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu, struct iscsi_param *params,
+ iscsi_conn_xfer_complete_cb cb_fn)
+{
+ struct iscsi_bhs_login_rsp *rsph;
+
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ rsph->version_max = ISCSI_VERSION;
+ rsph->version_act = ISCSI_VERSION;
+ DSET24(rsph->data_segment_len, rsp_pdu->data_segment_len);
+
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+
+ if (conn->sess != NULL) {
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+ } else {
+ to_be32(&rsph->exp_cmd_sn, rsp_pdu->cmd_sn);
+ to_be32(&rsph->max_cmd_sn, rsp_pdu->cmd_sn);
+ }
+
+ SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (uint8_t *)rsph, ISCSI_BHS_LEN);
+ SPDK_LOGDUMP(SPDK_LOG_ISCSI, "DATA", rsp_pdu->data, rsp_pdu->data_segment_len);
+
+ /* Set T/CSG/NSG to reserved if login error. */
+ if (rsph->status_class != 0) {
+ rsph->flags &= ~ISCSI_LOGIN_TRANSIT;
+ rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK;
+ rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK;
+ }
+ iscsi_param_free(params);
+ iscsi_conn_write_pdu(conn, rsp_pdu, cb_fn, conn);
+}
+
+/*
+ * Initialize the internal response data structure for the iSCSI login
+ * function.
+ * return:
+ * 0, success;
+ * otherwise, error;
+ */
+static int
+iscsi_op_login_rsp_init(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu)
+{
+ struct iscsi_bhs_login_req *reqh;
+ struct iscsi_bhs_login_rsp *rsph;
+
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ rsph->opcode = ISCSI_OP_LOGIN_RSP;
+ rsph->status_class = ISCSI_CLASS_SUCCESS;
+ rsph->status_detail = ISCSI_LOGIN_ACCEPT;
+ rsp_pdu->data_segment_len = 0;
+
+ /* The default MaxRecvDataSegmentLength 8192 is used during login. - RFC3720 */
+ rsp_pdu->data = calloc(1, 8192);
+ if (!rsp_pdu->data) {
+ SPDK_ERRLOG("calloc() failed for data segment\n");
+ rsph->status_class = ISCSI_CLASS_TARGET_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ rsp_pdu->data_buf_len = 8192;
+
+ reqh = (struct iscsi_bhs_login_req *)&pdu->bhs;
+ rsph->flags |= (reqh->flags & ISCSI_LOGIN_TRANSIT);
+ rsph->flags |= (reqh->flags & ISCSI_LOGIN_CONTINUE);
+ rsph->flags |= (reqh->flags & ISCSI_LOGIN_CURRENT_STAGE_MASK);
+ if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) {
+ rsph->flags |= (reqh->flags & ISCSI_LOGIN_NEXT_STAGE_MASK);
+ }
+
+ /* We don't need to convert from network byte order. Just store it */
+ memcpy(&rsph->isid, reqh->isid, 6);
+ rsph->tsih = reqh->tsih;
+ rsph->itt = reqh->itt;
+ rsp_pdu->cmd_sn = from_be32(&reqh->cmd_sn);
+
+ if (rsph->tsih) {
+ rsph->stat_sn = reqh->exp_stat_sn;
+ }
+
+ SPDK_LOGDUMP(SPDK_LOG_ISCSI, "PDU", (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "T=%d, C=%d, CSG=%d, NSG=%d, Min=%d, Max=%d, ITT=%x\n",
+ ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags),
+ ISCSI_BHS_LOGIN_GET_CBIT(rsph->flags),
+ ISCSI_BHS_LOGIN_GET_CSG(rsph->flags),
+ ISCSI_BHS_LOGIN_GET_NSG(rsph->flags),
+ reqh->version_min, reqh->version_max, from_be32(&rsph->itt));
+
+ if (conn->sess != NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u,"
+ "MaxCmdSN=%u\n", rsp_pdu->cmd_sn,
+ from_be32(&rsph->stat_sn), conn->StatSN,
+ conn->sess->ExpCmdSN,
+ conn->sess->MaxCmdSN);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "CmdSN=%u, ExpStatSN=%u, StatSN=%u\n",
+ rsp_pdu->cmd_sn, from_be32(&rsph->stat_sn),
+ conn->StatSN);
+ }
+
+ if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags) &&
+ ISCSI_BHS_LOGIN_GET_CBIT(rsph->flags)) {
+ SPDK_ERRLOG("transit error\n");
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+	/* Reject the login if the initiator's minimum version exceeds the version we support. */
+ if (reqh->version_min > ISCSI_VERSION) {
+ SPDK_ERRLOG("unsupported version min %d/max %d, expecting %d\n", reqh->version_min,
+ reqh->version_max, ISCSI_VERSION);
+ /* Unsupported version */
+ /* set all reserved flag to zero */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_UNSUPPORTED_VERSION;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ if ((ISCSI_BHS_LOGIN_GET_NSG(rsph->flags) == ISCSI_NSG_RESERVED_CODE) &&
+ ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) {
+ /* set NSG to zero */
+ rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK;
+ /* also set other bits to zero */
+ rsph->flags &= ~ISCSI_LOGIN_TRANSIT;
+ rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK;
+ SPDK_ERRLOG("Received reserved NSG code: %d\n", ISCSI_NSG_RESERVED_CODE);
+ /* Initiator error */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ return 0;
+}
+
+static int
+iscsi_op_login_store_incoming_params(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu,
+ struct iscsi_param **params)
+{
+ struct iscsi_bhs_login_req *reqh;
+ struct iscsi_bhs_login_rsp *rsph;
+ int rc;
+
+ reqh = (struct iscsi_bhs_login_req *)&pdu->bhs;
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+
+ rc = iscsi_parse_params(params, pdu->data,
+ pdu->data_segment_len, ISCSI_BHS_LOGIN_GET_CBIT(reqh->flags),
+ &conn->partial_text_parameter);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_parse_params() failed\n");
+ iscsi_param_free(*params);
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR;
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+
+ return 0;
+}
+
+/*
+ * This function is used to initialize the port info
+ * return
+ * 0: success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_initialize_port(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ char *initiator_port_name,
+ uint32_t name_length,
+ struct iscsi_param *params)
+{
+ const char *val;
+ struct iscsi_bhs_login_rsp *rsph;
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+
+ /* Initiator Name and Port */
+ val = iscsi_param_get_val(params, "InitiatorName");
+ if (val == NULL) {
+ SPDK_ERRLOG("InitiatorName is empty\n");
+ /* Missing parameter */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ snprintf(conn->initiator_name, sizeof(conn->initiator_name), "%s", val);
+ snprintf(initiator_port_name, name_length,
+ "%s,i,0x%12.12" PRIx64, val, iscsi_get_isid(rsph->isid));
+ spdk_strlwr(conn->initiator_name);
+ spdk_strlwr(initiator_port_name);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Initiator name: %s\n", conn->initiator_name);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Initiator port: %s\n", initiator_port_name);
+
+ return 0;
+}
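+
+/*
+ * Illustrative, standalone sketch (not part of this file): the initiator port
+ * name built above has the form "<InitiatorName>,i,0x<12-hex-digit ISID>" and
+ * is then lower-cased.  The IQN and ISID below are sample values, and the
+ * loop stands in for spdk_strlwr().  Compile it on its own.
+ */
+#include <ctype.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+
+int
+main(void)
+{
+	char port_name[256];
+	uint64_t isid = 0x80123456789aULL;
+	int i;
+
+	snprintf(port_name, sizeof(port_name), "%s,i,0x%12.12" PRIx64,
+		 "iqn.2016-06.io.spdk:Host1", isid);
+
+	for (i = 0; port_name[i] != '\0'; i++) {
+		port_name[i] = (char)tolower((unsigned char)port_name[i]);
+	}
+
+	/* Prints: iqn.2016-06.io.spdk:host1,i,0x80123456789a */
+	printf("%s\n", port_name);
+	return 0;
+}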
+
+/*
+ * This function is used to determine the session type.
+ * return
+ * 0: success
+ * Other value: error
+ */
+static int
+iscsi_op_login_session_type(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ enum session_type *session_type,
+ struct iscsi_param *params)
+{
+ const char *session_type_str;
+ struct iscsi_bhs_login_rsp *rsph;
+
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ session_type_str = iscsi_param_get_val(params, "SessionType");
+ if (session_type_str == NULL) {
+ if (rsph->tsih != 0) {
+ *session_type = SESSION_TYPE_NORMAL;
+ } else {
+ SPDK_ERRLOG("SessionType is empty\n");
+ /* Missing parameter */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ } else {
+ if (strcasecmp(session_type_str, "Discovery") == 0) {
+ *session_type = SESSION_TYPE_DISCOVERY;
+ } else if (strcasecmp(session_type_str, "Normal") == 0) {
+ *session_type = SESSION_TYPE_NORMAL;
+ } else {
+ *session_type = SESSION_TYPE_INVALID;
+ SPDK_ERRLOG("SessionType is invalid\n");
+ /* Missing parameter */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ }
+	SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Session Type: %s\n",
+		      session_type_str ? session_type_str : "Normal (implicit)");
+
+ return 0;
+}
+
+/*
+ * This function is used to check the target info
+ * return:
+ * 0: success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_check_target(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ const char *target_name,
+ struct spdk_iscsi_tgt_node **target)
+{
+ bool result;
+ struct iscsi_bhs_login_rsp *rsph;
+
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ *target = iscsi_find_tgt_node(target_name);
+ if (*target == NULL) {
+ SPDK_WARNLOG("target %s not found\n", target_name);
+ /* Not found */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_TARGET_NOT_FOUND;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ if (iscsi_tgt_node_is_destructed(*target)) {
+ SPDK_ERRLOG("target %s is removed\n", target_name);
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_TARGET_REMOVED;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ result = iscsi_tgt_node_access(conn, *target,
+ conn->initiator_name,
+ conn->initiator_addr);
+ if (!result) {
+ SPDK_ERRLOG("access denied\n");
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_AUTHORIZATION_FAIL;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ return 0;
+}
+
+/*
+ * This function is used to check the session.
+ * return:
+ * 0, success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_check_session(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ char *initiator_port_name, int cid)
+{
+ int rc = 0;
+ struct iscsi_bhs_login_rsp *rsph;
+
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ /* check existing session */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "isid=%"PRIx64", tsih=%u, cid=%u\n",
+ iscsi_get_isid(rsph->isid), from_be16(&rsph->tsih), cid);
+ if (rsph->tsih != 0) {
+ /* multiple connections */
+ rc = append_iscsi_sess(conn, initiator_port_name,
+ from_be16(&rsph->tsih), cid);
+ if (rc != 0) {
+ SPDK_ERRLOG("isid=%"PRIx64", tsih=%u, cid=%u:"
+				    "append_iscsi_sess() failed\n",
+ iscsi_get_isid(rsph->isid), from_be16(&rsph->tsih),
+ cid);
+ /* Can't include in session */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = rc;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ } else if (!g_iscsi.AllowDuplicateIsid) {
+ /* new session, drop old sess by the initiator */
+ iscsi_drop_conns(conn, initiator_port_name, 0 /* drop old */);
+ }
+
+ return rc;
+}
+
+/*
+ * This function is used to delete the original param and re-add it with the
+ * new value.
+ * return:
+ * 0: success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_update_param(struct spdk_iscsi_conn *conn,
+ const char *key, const char *value,
+ const char *list)
+{
+ int rc = 0;
+ struct iscsi_param *new_param, *orig_param;
+ int index;
+
+ orig_param = iscsi_param_find(conn->params, key);
+ if (orig_param == NULL) {
+ SPDK_ERRLOG("orig_param %s not found\n", key);
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+
+ index = orig_param->state_index;
+ rc = iscsi_param_del(&conn->params, key);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_del(%s) failed\n", key);
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+ rc = iscsi_param_add(&conn->params, key, value, list, ISPT_LIST);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_add() failed\n");
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+ new_param = iscsi_param_find(conn->params, key);
+ if (new_param == NULL) {
+ SPDK_ERRLOG("iscsi_param_find() failed\n");
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+ new_param->state_index = index;
+ return rc;
+}
+
+static int
+iscsi_negotiate_chap_param(struct spdk_iscsi_conn *conn)
+{
+ int rc = 0;
+
+ if (conn->disable_chap) {
+ rc = iscsi_op_login_update_param(conn, "AuthMethod", "None", "None");
+ } else if (conn->require_chap) {
+ rc = iscsi_op_login_update_param(conn, "AuthMethod", "CHAP", "CHAP");
+ }
+
+ return rc;
+}
+
+/*
+ * This function handles the CHAP negotiation part of a discovery session.
+ * return:
+ * 0, success;
+ * otherwise: error;
+ */
+static int
+iscsi_op_login_session_discovery_chap(struct spdk_iscsi_conn *conn)
+{
+ return iscsi_negotiate_chap_param(conn);
+}
+
+/*
+ * This function is used to update the CHAP-related parameters.
+ * return:
+ * 0: success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_negotiate_chap_param(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_tgt_node *target)
+{
+ conn->disable_chap = target->disable_chap;
+ conn->require_chap = target->require_chap;
+ conn->mutual_chap = target->mutual_chap;
+ conn->chap_group = target->chap_group;
+
+ return iscsi_negotiate_chap_param(conn);
+}
+
+static int
+iscsi_op_login_negotiate_digest_param(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_tgt_node *target)
+{
+ int rc;
+
+ if (target->header_digest) {
+ /*
+ * User specified header digests, so update the list of
+ * HeaderDigest values to remove "None" so that only
+ * initiators who support CRC32C can connect.
+ */
+ rc = iscsi_op_login_update_param(conn, "HeaderDigest", "CRC32C", "CRC32C");
+ if (rc < 0) {
+ return rc;
+ }
+ }
+
+ if (target->data_digest) {
+ /*
+ * User specified data digests, so update the list of
+ * DataDigest values to remove "None" so that only
+ * initiators who support CRC32C can connect.
+ */
+ rc = iscsi_op_login_update_param(conn, "DataDigest", "CRC32C", "CRC32C");
+ if (rc < 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * This function handles the normal-session part of a login.
+ * return:
+ * 0, success;
+ * SPDK_ISCSI_LOGIN_ERROR_PARAMETER, parameter error;
+ */
+static int
+iscsi_op_login_session_normal(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ char *initiator_port_name,
+ struct iscsi_param *params,
+ int cid)
+{
+ struct spdk_iscsi_tgt_node *target = NULL;
+ const char *target_name;
+ const char *target_short_name;
+ struct iscsi_bhs_login_rsp *rsph;
+ int rc = 0;
+
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ target_name = iscsi_param_get_val(params, "TargetName");
+
+ if (target_name == NULL) {
+ SPDK_ERRLOG("TargetName is empty\n");
+ /* Missing parameter */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ memset(conn->target_short_name, 0, MAX_TARGET_NAME);
+ target_short_name = strstr(target_name, ":");
+ if (target_short_name != NULL) {
+ target_short_name++; /* Advance past the ':' */
+ if (strlen(target_short_name) >= MAX_TARGET_NAME) {
+ SPDK_ERRLOG("Target Short Name (%s) is more than %u characters\n",
+ target_short_name, MAX_TARGET_NAME);
+ /* Invalid request */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ snprintf(conn->target_short_name, MAX_TARGET_NAME, "%s",
+ target_short_name);
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ rc = iscsi_op_login_check_target(conn, rsp_pdu, target_name, &target);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ if (rc < 0) {
+ return rc;
+ }
+
+ conn->target = target;
+ conn->dev = target->dev;
+ conn->target_port = spdk_scsi_dev_find_port_by_id(target->dev,
+ conn->pg_tag);
+
+ rc = iscsi_op_login_check_session(conn, rsp_pdu,
+ initiator_port_name, cid);
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* force target flags */
+ pthread_mutex_lock(&target->mutex);
+ rc = iscsi_op_login_negotiate_chap_param(conn, target);
+ pthread_mutex_unlock(&target->mutex);
+
+ if (rc == 0) {
+ rc = iscsi_op_login_negotiate_digest_param(conn, target);
+ }
+
+ if (rc != 0) {
+ /* Invalid request */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST;
+ }
+
+ return rc;
+}
+
+/*
+ * This function is used to set the info in the connection data structure
+ * return
+ * 0: success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_set_conn_info(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ char *initiator_port_name,
+ enum session_type session_type, int cid)
+{
+ int rc = 0;
+ struct spdk_iscsi_tgt_node *target;
+ struct iscsi_bhs_login_rsp *rsph;
+ struct spdk_scsi_port *initiator_port;
+
+ target = conn->target;
+
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ conn->authenticated = false;
+ conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A;
+ conn->cid = cid;
+
+ if (conn->sess == NULL) {
+ /* create initiator port */
+ initiator_port = spdk_scsi_port_create(iscsi_get_isid(rsph->isid), 0, initiator_port_name);
+ if (initiator_port == NULL) {
+ SPDK_ERRLOG("create_port() failed\n");
+ rsph->status_class = ISCSI_CLASS_TARGET_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ /* new session */
+ rc = create_iscsi_sess(conn, target, session_type);
+ if (rc < 0) {
+ spdk_scsi_port_free(&initiator_port);
+ SPDK_ERRLOG("create_sess() failed\n");
+ rsph->status_class = ISCSI_CLASS_TARGET_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ /* initialize parameters */
+ conn->sess->initiator_port = initiator_port;
+ conn->StatSN = from_be32(&rsph->stat_sn);
+ conn->sess->isid = iscsi_get_isid(rsph->isid);
+
+ /* Initiator port TransportID */
+ spdk_scsi_port_set_iscsi_transport_id(conn->sess->initiator_port,
+ conn->initiator_name,
+ conn->sess->isid);
+
+ /* Discovery sessions will not have a target. */
+ if (target != NULL) {
+ conn->sess->queue_depth = target->queue_depth;
+ } else {
+ /*
+ * Assume discovery sessions have an effective command
+ * windows size of 1.
+ */
+ conn->sess->queue_depth = 1;
+ }
+ conn->sess->ExpCmdSN = rsp_pdu->cmd_sn;
+ conn->sess->MaxCmdSN = rsp_pdu->cmd_sn + conn->sess->queue_depth - 1;
+ }
+
+ conn->initiator_port = conn->sess->initiator_port;
+
+ return 0;
+}
+
+/*
+ * This function is used to set the target info
+ * return
+ * 0: success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_set_target_info(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ enum session_type session_type)
+{
+ char buf[MAX_TMPBUF];
+ const char *val;
+ int rc = 0;
+ struct spdk_iscsi_tgt_node *target = conn->target;
+
+ /* declarative parameters */
+ if (target != NULL) {
+ pthread_mutex_lock(&target->mutex);
+ if (target->alias[0] != '\0') {
+ snprintf(buf, sizeof buf, "%s", target->alias);
+ } else {
+ snprintf(buf, sizeof buf, "%s", "");
+ }
+ pthread_mutex_unlock(&target->mutex);
+ rc = iscsi_param_set(conn->sess->params, "TargetAlias", buf);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set() failed\n");
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+ }
+ snprintf(buf, sizeof buf, "%s:%s,%d", conn->portal_host, conn->portal_port,
+ conn->pg_tag);
+ rc = iscsi_param_set(conn->sess->params, "TargetAddress", buf);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set() failed\n");
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+ snprintf(buf, sizeof buf, "%d", conn->pg_tag);
+ rc = iscsi_param_set(conn->sess->params, "TargetPortalGroupTag", buf);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set() failed\n");
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+
+ /* write in response */
+ if (target != NULL) {
+ val = iscsi_param_get_val(conn->sess->params, "TargetAlias");
+ if (val != NULL && strlen(val) != 0) {
+ rsp_pdu->data_segment_len = iscsi_append_param(conn,
+ "TargetAlias",
+ rsp_pdu->data,
+ rsp_pdu->data_buf_len,
+ rsp_pdu->data_segment_len);
+ }
+ if (session_type == SESSION_TYPE_DISCOVERY) {
+ rsp_pdu->data_segment_len = iscsi_append_param(conn,
+ "TargetAddress",
+ rsp_pdu->data,
+ rsp_pdu->data_buf_len,
+ rsp_pdu->data_segment_len);
+ }
+ rsp_pdu->data_segment_len = iscsi_append_param(conn,
+ "TargetPortalGroupTag",
+ rsp_pdu->data,
+ rsp_pdu->data_buf_len,
+ rsp_pdu->data_segment_len);
+ }
+
+ return rc;
+}
+
+/*
+ * This function is used to handle the login of an iSCSI initiator when there is
+ * no existing session.
+ * return:
+ * 0, success;
+ * SPDK_ISCSI_LOGIN_ERROR_PARAMETER, parameter error;
+ * SPDK_ISCSI_LOGIN_ERROR_RESPONSE, used to indicate a login failure.
+ */
+static int
+iscsi_op_login_phase_none(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ struct iscsi_param *params, int cid)
+{
+ enum session_type session_type;
+ char initiator_port_name[MAX_INITIATOR_PORT_NAME];
+ struct iscsi_bhs_login_rsp *rsph;
+ int rc = 0;
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+
+ conn->target = NULL;
+ conn->dev = NULL;
+
+ rc = iscsi_op_login_initialize_port(conn, rsp_pdu, initiator_port_name,
+ MAX_INITIATOR_PORT_NAME, params);
+ if (rc < 0) {
+ return rc;
+ }
+
+ rc = iscsi_op_login_session_type(conn, rsp_pdu, &session_type, params);
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* Target Name and Port */
+ if (session_type == SESSION_TYPE_NORMAL) {
+ rc = iscsi_op_login_session_normal(conn, rsp_pdu,
+ initiator_port_name,
+ params, cid);
+ if (rc < 0) {
+ return rc;
+ }
+
+ } else if (session_type == SESSION_TYPE_DISCOVERY) {
+ rsph->tsih = 0;
+
+ /* force target flags */
+ pthread_mutex_lock(&g_iscsi.mutex);
+ rc = iscsi_op_login_session_discovery_chap(conn);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ if (rc < 0) {
+ return rc;
+ }
+ } else {
+ SPDK_ERRLOG("unknown session type\n");
+ /* Missing parameter */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ rc = iscsi_op_login_set_conn_info(conn, rsp_pdu, initiator_port_name,
+ session_type, cid);
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* limit conns on discovery session */
+ if (session_type == SESSION_TYPE_DISCOVERY) {
+ conn->sess->MaxConnections = 1;
+ rc = iscsi_param_set_int(conn->sess->params,
+ "MaxConnections",
+ conn->sess->MaxConnections);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_set_int() failed\n");
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+ }
+
+ return iscsi_op_login_set_target_info(conn, rsp_pdu, session_type);
+}
+
+/*
+ * This function is used to handle the CSG (current stage) bit case in the response
+ * return:
+ * 0, success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_rsp_handle_csg_bit(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu,
+ struct iscsi_param *params)
+{
+ const char *auth_method;
+ int rc;
+ struct iscsi_bhs_login_rsp *rsph;
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+
+ switch (ISCSI_BHS_LOGIN_GET_CSG(rsph->flags)) {
+ case ISCSI_SECURITY_NEGOTIATION_PHASE:
+ /* SecurityNegotiation */
+ auth_method = iscsi_param_get_val(conn->params, "AuthMethod");
+ if (auth_method == NULL) {
+ SPDK_ERRLOG("AuthMethod is empty\n");
+ /* Missing parameter */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
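+		/* AuthMethod=None completes security negotiation immediately; any
+		 * other method is negotiated via iscsi_auth_params() (CHAP).
+		 */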
+ if (strcasecmp(auth_method, "None") == 0) {
+ conn->authenticated = true;
+ } else {
+ rc = iscsi_auth_params(conn, params, auth_method,
+ rsp_pdu->data, rsp_pdu->data_buf_len,
+ rsp_pdu->data_segment_len);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_auth_params() failed\n");
+ /* Authentication failure */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ rsp_pdu->data_segment_len = rc;
+ if (!conn->authenticated) {
+ /* not complete */
+ rsph->flags &= ~ISCSI_LOGIN_TRANSIT;
+ } else {
+ if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_END) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CHAP phase not complete");
+ }
+ }
+
+ SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Auth Params",
+ rsp_pdu->data, rsp_pdu->data_segment_len);
+ }
+ break;
+
+ case ISCSI_OPERATIONAL_NEGOTIATION_PHASE:
+ /* LoginOperationalNegotiation */
+ if (conn->state == ISCSI_CONN_STATE_INVALID) {
+ if (conn->require_chap) {
+ /* Authentication failure */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ } else {
+ /* AuthMethod=None */
+ conn->authenticated = true;
+ }
+ }
+ if (!conn->authenticated) {
+ SPDK_ERRLOG("authentication error\n");
+ /* Authentication failure */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+ break;
+
+ case ISCSI_FULL_FEATURE_PHASE:
+ /* FullFeaturePhase */
+ SPDK_ERRLOG("XXX Login in FullFeaturePhase\n");
+ /* Initiator error */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+
+ default:
+ SPDK_ERRLOG("unknown stage\n");
+ /* Initiator error */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ return 0;
+}
+
+/* This function is used to log the session info
+ * return
+ * 0: success
+ * otherwise: error
+ */
+static int
+iscsi_op_login_notify_session_info(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu)
+{
+ struct iscsi_bhs_login_rsp *rsph;
+
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ if (conn->sess->session_type == SESSION_TYPE_NORMAL) {
+ /* normal session */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Login from %s (%s) on %s tgt_node%d"
+ " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u,"
+ " CID=%u, HeaderDigest=%s, DataDigest=%s\n",
+ conn->initiator_name, conn->initiator_addr,
+ conn->target->name, conn->target->num,
+ conn->portal_host, conn->portal_port, conn->pg_tag,
+ conn->sess->isid, conn->sess->tsih, conn->cid,
+ (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C")
+ ? "on" : "off"),
+ (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C")
+ ? "on" : "off"));
+ } else if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) {
+ /* discovery session */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Login(discovery) from %s (%s) on"
+ " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u,"
+ " CID=%u, HeaderDigest=%s, DataDigest=%s\n",
+ conn->initiator_name, conn->initiator_addr,
+ conn->portal_host, conn->portal_port, conn->pg_tag,
+ conn->sess->isid, conn->sess->tsih, conn->cid,
+ (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C")
+ ? "on" : "off"),
+ (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C")
+ ? "on" : "off"));
+ } else {
+ SPDK_ERRLOG("unknown session type\n");
+ /* Initiator error */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ return 0;
+}
+
+/*
+ * This function is used to handle the T (transit) bit cases
+ * return
+ * 0: success
+ * otherwise error
+ */
+static int
+iscsi_op_login_rsp_handle_t_bit(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu)
+{
+ int rc;
+ struct iscsi_bhs_login_rsp *rsph;
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+
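+	/* The NSG (next stage) field selects the phase the connection moves to
+	 * once the transit bit is set.
+	 */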
+ switch (ISCSI_BHS_LOGIN_GET_NSG(rsph->flags)) {
+ case ISCSI_SECURITY_NEGOTIATION_PHASE:
+ /* SecurityNegotiation */
+ conn->login_phase = ISCSI_SECURITY_NEGOTIATION_PHASE;
+ break;
+
+ case ISCSI_OPERATIONAL_NEGOTIATION_PHASE:
+ /* LoginOperationalNegotiation */
+ conn->login_phase = ISCSI_OPERATIONAL_NEGOTIATION_PHASE;
+ break;
+
+ case ISCSI_FULL_FEATURE_PHASE:
+ /* FullFeaturePhase */
+ conn->login_phase = ISCSI_FULL_FEATURE_PHASE;
+ to_be16(&rsph->tsih, conn->sess->tsih);
+
+ rc = iscsi_op_login_notify_session_info(conn, rsp_pdu);
+ if (rc < 0) {
+ return rc;
+ }
+
+ conn->full_feature = 1;
+ break;
+
+ default:
+ SPDK_ERRLOG("unknown stage\n");
+ /* Initiator error */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ return 0;
+}
+
+/*
+ * This function is used to set the values of the internal data structure used
+ * by spdk_iscsi_op_login function
+ * return:
+ * 0, used to indicate a successful login
+ * SPDK_ISCSI_LOGIN_ERROR_RESPONSE, used to indicate a failed login.
+ */
+static int
+iscsi_op_login_rsp_handle(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *rsp_pdu, struct iscsi_param **params)
+{
+ int rc;
+ struct iscsi_bhs_login_rsp *rsph;
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+
+ /* negotiate parameters */
+ rc = iscsi_negotiate_params(conn, params, rsp_pdu->data,
+ rsp_pdu->data_buf_len,
+ rsp_pdu->data_segment_len);
+ if (rc < 0) {
+ /*
+ * iscsi_negotiate_params just returns -1 on failure,
+ * so translate this into meaningful response codes and
+ * return values.
+ */
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR;
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ }
+
+ rsp_pdu->data_segment_len = rc;
+ SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Params", rsp_pdu->data, rc);
+
+ /* handle the CSG bit case */
+ rc = iscsi_op_login_rsp_handle_csg_bit(conn, rsp_pdu, *params);
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* handle the T bit case */
+ if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) {
+ rc = iscsi_op_login_rsp_handle_t_bit(conn, rsp_pdu);
+ }
+
+ return rc;
+}
+
+static int
+iscsi_pdu_hdr_op_login(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ int rc;
+ struct iscsi_bhs_login_req *reqh;
+ struct spdk_iscsi_pdu *rsp_pdu;
+
+ if (conn->full_feature && conn->sess != NULL &&
+ conn->sess->session_type == SESSION_TYPE_DISCOVERY) {
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ reqh = (struct iscsi_bhs_login_req *)&pdu->bhs;
+ pdu->cmd_sn = from_be32(&reqh->cmd_sn);
+
+ /* During login processing, use the 8KB default FirstBurstLength as
+ * our maximum data segment length value.
+ */
+ if (pdu->data_segment_len > SPDK_ISCSI_FIRST_BURST_LENGTH) {
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ rsp_pdu = iscsi_get_pdu(conn);
+ if (rsp_pdu == NULL) {
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ rc = iscsi_op_login_rsp_init(conn, pdu, rsp_pdu);
+ if (rc < 0) {
+ iscsi_op_login_response(conn, rsp_pdu, NULL, iscsi_conn_login_pdu_err_complete);
+ return 0;
+ }
+
+ conn->login_rsp_pdu = rsp_pdu;
+ return 0;
+}
+
+static int
+iscsi_pdu_payload_op_login(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ int rc;
+ struct iscsi_bhs_login_req *reqh;
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_param *params = NULL;
+ int cid;
+
+ if (conn->login_rsp_pdu == NULL) {
+ return 0;
+ }
+
+ rsp_pdu = conn->login_rsp_pdu;
+
+ reqh = (struct iscsi_bhs_login_req *)&pdu->bhs;
+ cid = from_be16(&reqh->cid);
+
+ rc = iscsi_op_login_store_incoming_params(conn, pdu, rsp_pdu, &params);
+ if (rc < 0) {
+ iscsi_op_login_response(conn, rsp_pdu, NULL, iscsi_conn_login_pdu_err_complete);
+ return 0;
+ }
+
+ if (conn->state == ISCSI_CONN_STATE_INVALID) {
+ rc = iscsi_op_login_phase_none(conn, rsp_pdu, params, cid);
+ if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE || rc == SPDK_ISCSI_LOGIN_ERROR_PARAMETER) {
+ iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_err_complete);
+ return 0;
+ }
+ }
+
+ rc = iscsi_op_login_rsp_handle(conn, rsp_pdu, &params);
+ if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE) {
+ iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_err_complete);
+ return 0;
+ }
+
+ iscsi_op_login_response(conn, rsp_pdu, params, iscsi_conn_login_pdu_success_complete);
+ return 0;
+}
+
+static int
+iscsi_pdu_hdr_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ uint32_t task_tag;
+ uint32_t ExpStatSN;
+ int F_bit, C_bit;
+ struct iscsi_bhs_text_req *reqh;
+
+ if (pdu->data_segment_len > iscsi_get_max_immediate_data_size()) {
+ SPDK_ERRLOG("data segment len(=%zu) > immediate data len(=%"PRIu32")\n",
+ pdu->data_segment_len, iscsi_get_max_immediate_data_size());
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ reqh = (struct iscsi_bhs_text_req *)&pdu->bhs;
+
+ F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL);
+ C_bit = !!(reqh->flags & ISCSI_TEXT_CONTINUE);
+ task_tag = from_be32(&reqh->itt);
+ ExpStatSN = from_be32(&reqh->exp_stat_sn);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, F=%d, C=%d, ITT=%x, TTT=%x\n",
+ reqh->immediate, F_bit, C_bit, task_tag, from_be32(&reqh->ttt));
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n",
+ pdu->cmd_sn, ExpStatSN, conn->StatSN, conn->sess->ExpCmdSN,
+ conn->sess->MaxCmdSN);
+
+ if (ExpStatSN != conn->StatSN) {
+#if 0
+ SPDK_ERRLOG("StatSN(%u) error\n", ExpStatSN);
+ return -1;
+#else
+ /* StarPort has a bug */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u) rewound\n", ExpStatSN);
+ conn->StatSN = ExpStatSN;
+#endif
+ }
+
+ if (F_bit && C_bit) {
+ SPDK_ERRLOG("final and continue\n");
+ return -1;
+ }
+
+ /*
+ * If this is the first text op in a sequence, save the ITT so we can
+ * compare it against the ITT for subsequent ops in the same sequence.
+ * If a subsequent text op in the same sequence has a different ITT, reject
+ * that PDU.
+ */
+ if (conn->sess->current_text_itt == 0xffffffffU) {
+ conn->sess->current_text_itt = task_tag;
+ } else if (conn->sess->current_text_itt != task_tag) {
+ SPDK_ERRLOG("The correct itt is %u, and the current itt is %u...\n",
+ conn->sess->current_text_itt, task_tag);
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ return 0;
+}
+
+static void
+iscsi_conn_text_pdu_complete(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ iscsi_conn_params_update(conn);
+}
+
+static int
+iscsi_pdu_payload_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct iscsi_param *params = NULL;
+ struct spdk_iscsi_pdu *rsp_pdu;
+ uint8_t *data;
+ uint64_t lun;
+ uint32_t task_tag;
+ const char *val;
+ int F_bit, C_bit;
+ int data_len;
+ int alloc_len;
+ int rc;
+ struct iscsi_bhs_text_req *reqh;
+ struct iscsi_bhs_text_resp *rsph;
+
+ data_len = 0;
+ alloc_len = conn->MaxRecvDataSegmentLength;
+
+ reqh = (struct iscsi_bhs_text_req *)&pdu->bhs;
+
+ F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL);
+ C_bit = !!(reqh->flags & ISCSI_TEXT_CONTINUE);
+ lun = from_be64(&reqh->lun);
+ task_tag = from_be32(&reqh->itt);
+
+ /* store incoming parameters */
+ rc = iscsi_parse_params(&params, pdu->data, pdu->data_segment_len,
+ C_bit, &conn->partial_text_parameter);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_parse_params() failed\n");
+ iscsi_param_free(params);
+ return -1;
+ }
+
+ data = calloc(1, alloc_len);
+ if (!data) {
+ SPDK_ERRLOG("calloc() failed for data segment\n");
+ iscsi_param_free(params);
+ return -ENOMEM;
+ }
+
+ /* negotiate parameters */
+ data_len = iscsi_negotiate_params(conn, &params,
+ data, alloc_len, data_len);
+ if (data_len < 0) {
+ SPDK_ERRLOG("iscsi_negotiate_params() failed\n");
+ iscsi_param_free(params);
+ free(data);
+ return -1;
+ }
+
+ /* sendtargets is special case */
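+	/* An empty SendTargets value on a discovery session is treated as "ALL".
+	 * On a normal session, "ALL" is rejected and an empty value refers to the
+	 * connected target only.
+	 */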
+ val = iscsi_param_get_val(params, "SendTargets");
+ if (val != NULL) {
+ if (iscsi_param_eq_val(conn->sess->params,
+ "SessionType", "Discovery")) {
+ if (strcasecmp(val, "") == 0) {
+ val = "ALL";
+ }
+
+ data_len = iscsi_send_tgts(conn,
+ conn->initiator_name,
+ conn->initiator_addr,
+ val, data, alloc_len,
+ data_len);
+ } else {
+ if (strcasecmp(val, "") == 0) {
+ val = conn->target->name;
+ }
+
+ if (strcasecmp(val, "ALL") == 0) {
+ /* not in discovery session */
+ data_len = iscsi_append_text(conn,
+ "SendTargets",
+ "Reject", data,
+ alloc_len, data_len);
+ } else {
+ data_len = iscsi_send_tgts(conn,
+ conn->initiator_name,
+ conn->initiator_addr,
+ val, data, alloc_len,
+ data_len);
+ }
+ }
+ } else {
+ if (iscsi_param_eq_val(conn->sess->params, "SessionType", "Discovery")) {
+ iscsi_param_free(params);
+ free(data);
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ }
+
+ iscsi_param_free(params);
+ SPDK_LOGDUMP(SPDK_LOG_ISCSI, "Negotiated Params", data, data_len);
+
+ /* response PDU */
+ rsp_pdu = iscsi_get_pdu(conn);
+ if (rsp_pdu == NULL) {
+ free(data);
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ rsph = (struct iscsi_bhs_text_resp *)&rsp_pdu->bhs;
+
+ rsp_pdu->data = data;
+ rsph->opcode = ISCSI_OP_TEXT_RSP;
+
+ if (F_bit) {
+ rsph->flags |= ISCSI_FLAG_FINAL;
+ }
+
+ if (C_bit) {
+ rsph->flags |= ISCSI_TEXT_CONTINUE;
+ }
+
+ DSET24(rsph->data_segment_len, data_len);
+ to_be64(&rsph->lun, lun);
+ to_be32(&rsph->itt, task_tag);
+
+ if (F_bit) {
+ rsph->ttt = 0xffffffffU;
+ conn->sess->current_text_itt = 0xffffffffU;
+ } else {
+ to_be32(&rsph->ttt, 1 + conn->id);
+ }
+
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+
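+	/* Only non-immediate commands consume a slot in the command window, so
+	 * advance MaxCmdSN for them.
+	 */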
+ if (reqh->immediate == 0) {
+ conn->sess->MaxCmdSN++;
+ }
+
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_text_pdu_complete, conn);
+ return 0;
+}
+
+static void iscsi_conn_logout_pdu_complete(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ if (conn->sess == NULL) {
+ /*
+ * login failed but initiator still sent a logout rather than
+ * just closing the TCP connection.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout(login failed) from %s (%s) on"
+ " (%s:%s,%d)\n",
+ conn->initiator_name, conn->initiator_addr,
+ conn->portal_host, conn->portal_port, conn->pg_tag);
+ } else if (iscsi_param_eq_val(conn->sess->params, "SessionType", "Normal")) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout from %s (%s) on %s tgt_node%d"
+ " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u,"
+ " CID=%u, HeaderDigest=%s, DataDigest=%s\n",
+ conn->initiator_name, conn->initiator_addr,
+ conn->target->name, conn->target->num,
+ conn->portal_host, conn->portal_port, conn->pg_tag,
+ conn->sess->isid, conn->sess->tsih, conn->cid,
+ (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C")
+ ? "on" : "off"),
+ (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C")
+ ? "on" : "off"));
+ } else {
+ /* discovery session */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Logout(discovery) from %s (%s) on"
+ " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u,"
+ " CID=%u, HeaderDigest=%s, DataDigest=%s\n",
+ conn->initiator_name, conn->initiator_addr,
+ conn->portal_host, conn->portal_port, conn->pg_tag,
+ conn->sess->isid, conn->sess->tsih, conn->cid,
+ (iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C")
+ ? "on" : "off"),
+ (iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C")
+ ? "on" : "off"));
+ }
+}
+
+static int
+iscsi_pdu_hdr_op_logout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ uint32_t task_tag;
+ uint32_t ExpStatSN;
+ int response;
+ struct iscsi_bhs_logout_req *reqh;
+ struct iscsi_bhs_logout_resp *rsph;
+ uint16_t cid;
+
+ reqh = (struct iscsi_bhs_logout_req *)&pdu->bhs;
+
+ cid = from_be16(&reqh->cid);
+ task_tag = from_be32(&reqh->itt);
+ ExpStatSN = from_be32(&reqh->exp_stat_sn);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "reason=%d, ITT=%x, cid=%d\n",
+ reqh->reason, task_tag, cid);
+
+ if (conn->sess != NULL) {
+ if (conn->sess->session_type == SESSION_TYPE_DISCOVERY &&
+ reqh->reason != ISCSI_LOGOUT_REASON_CLOSE_SESSION) {
+ SPDK_ERRLOG("Target can accept logout only with reason \"close the session\" "
+ "on discovery session. %d is not acceptable reason.\n",
+ reqh->reason);
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n",
+ pdu->cmd_sn, ExpStatSN, conn->StatSN,
+ conn->sess->ExpCmdSN, conn->sess->MaxCmdSN);
+
+ if (pdu->cmd_sn != conn->sess->ExpCmdSN) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN(%u) might have dropped\n", pdu->cmd_sn);
+ /* ignore error */
+ }
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN=%u, ExpStatSN=%u, StatSN=%u\n",
+ pdu->cmd_sn, ExpStatSN, conn->StatSN);
+ }
+
+ if (ExpStatSN != conn->StatSN) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u/%u) might have dropped\n",
+ ExpStatSN, conn->StatSN);
+ /* ignore error */
+ }
+
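+	/* Response 0: the connection (or session) was closed successfully.
+	 * A CID that does not match this connection yields response 1 (CID not found).
+	 */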
+ if (conn->id == cid) {
+ /* connection or session closed successfully */
+ response = 0;
+ iscsi_conn_logout(conn);
+ } else {
+ response = 1;
+ }
+
+ /* response PDU */
+ rsp_pdu = iscsi_get_pdu(conn);
+ if (rsp_pdu == NULL) {
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ rsph = (struct iscsi_bhs_logout_resp *)&rsp_pdu->bhs;
+ rsp_pdu->data = NULL;
+ rsph->opcode = ISCSI_OP_LOGOUT_RSP;
+ rsph->flags |= 0x80; /* bit 0 must be 1 */
+ rsph->response = response;
+ DSET24(rsph->data_segment_len, 0);
+ to_be32(&rsph->itt, task_tag);
+
+ if (conn->sess != NULL) {
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+
+ if (conn->sess->connections == 1) {
+ conn->sess->MaxCmdSN++;
+ }
+
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+ } else {
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+ to_be32(&rsph->exp_cmd_sn, pdu->cmd_sn);
+ to_be32(&rsph->max_cmd_sn, pdu->cmd_sn);
+ }
+
+ rsph->time_2_wait = 0;
+ rsph->time_2_retain = 0;
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_logout_pdu_complete, conn);
+
+ return 0;
+}
+
+static int
+iscsi_send_r2t(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task, int offset,
+ int len, uint32_t transfer_tag, uint32_t *R2TSN)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_bhs_r2t *rsph;
+ uint64_t fmt_lun;
+
+ /* R2T PDU */
+ rsp_pdu = iscsi_get_pdu(conn);
+ if (rsp_pdu == NULL) {
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ rsph = (struct iscsi_bhs_r2t *)&rsp_pdu->bhs;
+ rsp_pdu->data = NULL;
+ rsph->opcode = ISCSI_OP_R2T;
+ rsph->flags |= 0x80; /* bit 0 must be 1 */
+ fmt_lun = spdk_scsi_lun_id_int_to_fmt(task->lun_id);
+ to_be64(&rsph->lun, fmt_lun);
+ to_be32(&rsph->itt, task->tag);
+ to_be32(&rsph->ttt, transfer_tag);
+
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+
+ to_be32(&rsph->r2t_sn, *R2TSN);
+ *R2TSN += 1;
+
+ task->r2t_datasn = 0; /* next expected datasn to ack */
+
+ to_be32(&rsph->buffer_offset, (uint32_t)offset);
+ to_be32(&rsph->desired_xfer_len, (uint32_t)len);
+ task->desired_data_transfer_length = (size_t)len;
+
+ /* we need to hold onto this task/cmd until the PDU has been
+ * written out */
+ rsp_pdu->task = task;
+ task->scsi.ref++;
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL);
+
+ return 0;
+}
+
+/* This function is used to remove the R2T PDU from snack_pdu_list by <task, r2t_sn> info */
+static struct spdk_iscsi_pdu *
+iscsi_remove_r2t_pdu_from_snack_list(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task,
+ uint32_t r2t_sn)
+{
+ struct spdk_iscsi_pdu *pdu;
+ struct iscsi_bhs_r2t *r2t_header;
+
+ TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) {
+ if (pdu->bhs.opcode == ISCSI_OP_R2T) {
+ r2t_header = (struct iscsi_bhs_r2t *)&pdu->bhs;
+ if (pdu->task == task &&
+ from_be32(&r2t_header->r2t_sn) == r2t_sn) {
+ TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq);
+ return pdu;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/* This function is used to re-send the R2T PDU */
+static int
+iscsi_send_r2t_recovery(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task, uint32_t r2t_sn,
+ bool send_new_r2tsn)
+{
+ struct spdk_iscsi_pdu *pdu;
+ struct iscsi_bhs_r2t *rsph;
+ uint32_t transfer_len;
+ uint32_t len;
+ int rc;
+
+ /* remove the r2t pdu from the snack_list */
+ pdu = iscsi_remove_r2t_pdu_from_snack_list(conn, task, r2t_sn);
+ if (!pdu) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "No pdu is found\n");
+ return -1;
+ }
+
+ /* send_new_r2tsn flag
+ * false: only re-send the old R2T with an updated StatSN
+ * true: send a new R2T with a new R2TSN
+ */
+ if (!send_new_r2tsn) {
+ to_be32(&pdu->bhs.stat_sn, conn->StatSN);
+ iscsi_conn_write_pdu(conn, pdu, iscsi_conn_pdu_generic_complete, NULL);
+ } else {
+ rsph = (struct iscsi_bhs_r2t *)&pdu->bhs;
+ transfer_len = from_be32(&rsph->desired_xfer_len);
+
+ /* still need to increase the acked r2tsn */
+ task->acked_r2tsn++;
+ len = spdk_min(conn->sess->MaxBurstLength,
+ (transfer_len - task->next_expected_r2t_offset));
+
+ /* remove the old_r2t_pdu */
+ iscsi_conn_free_pdu(conn, pdu);
+
+ /* re-send a new r2t pdu */
+ rc = iscsi_send_r2t(conn, task, task->next_expected_r2t_offset,
+ len, task->ttt, &task->R2TSN);
+ if (rc < 0) {
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ }
+
+ return 0;
+}
+
+static int
+add_transfer_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task)
+{
+ uint32_t transfer_len;
+ size_t max_burst_len;
+ size_t segment_len;
+ size_t data_len;
+ int len;
+ int rc;
+ int data_out_req;
+
+ transfer_len = task->scsi.transfer_len;
+ data_len = iscsi_task_get_pdu(task)->data_segment_len;
+ max_burst_len = conn->sess->MaxBurstLength;
+ segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH;
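+	/* Number of Data-OUT PDUs expected for this task: ceiling of the data
+	 * still to be received (transfer_len - data_len) divided by the segment
+	 * length.
+	 */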
+ data_out_req = 1 + (transfer_len - data_len - 1) / segment_len;
+ task->data_out_cnt = data_out_req;
+
+ /*
+ * If we already have too many tasks using R2T, then queue this task
+ * and start sending R2T for it after some of the tasks using R2T/data
+ * out buffers complete.
+ */
+ if (conn->pending_r2t >= DEFAULT_MAXR2T) {
+ TAILQ_INSERT_TAIL(&conn->queued_r2t_tasks, task, link);
+ return 0;
+ }
+
+ conn->data_out_cnt += data_out_req;
+ conn->pending_r2t++;
+
+ task->next_expected_r2t_offset = data_len;
+ task->current_r2t_length = 0;
+ task->R2TSN = 0;
+ /* According to RFC3720 10.8.5, 0xffffffff is
+ * reserved for TTT in R2T.
+ */
+ if (++conn->ttt == 0xffffffffu) {
+ conn->ttt = 0;
+ }
+ task->ttt = conn->ttt;
+
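+	/* Issue R2Ts of up to MaxBurstLength each, keeping at most
+	 * MaxOutstandingR2T outstanding; the remaining R2Ts are issued later as
+	 * outstanding ones complete.
+	 */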
+ while (data_len != transfer_len) {
+ len = spdk_min(max_burst_len, (transfer_len - data_len));
+ rc = iscsi_send_r2t(conn, task, data_len, len,
+ task->ttt, &task->R2TSN);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_send_r2t() failed\n");
+ return rc;
+ }
+ data_len += len;
+ task->next_r2t_offset = data_len;
+ task->outstanding_r2t++;
+ if (conn->sess->MaxOutstandingR2T == task->outstanding_r2t) {
+ break;
+ }
+ }
+
+ TAILQ_INSERT_TAIL(&conn->active_r2t_tasks, task, link);
+ task->is_r2t_active = true;
+ return 0;
+}
+
+/* If there are additional large writes queued for R2Ts, start them now.
+ * This is called when a large write is just completed or when multiple LUNs
+ * are attached and large write tasks for the specific LUN are cleared.
+ */
+static void
+start_queued_transfer_tasks(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_task *task, *tmp;
+
+ TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, tmp) {
+ if (conn->pending_r2t < DEFAULT_MAXR2T) {
+ TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link);
+ add_transfer_task(conn, task);
+ } else {
+ break;
+ }
+ }
+}
+
+bool
+iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t task_tag)
+{
+ struct spdk_iscsi_task *task, *tmp;
+
+ TAILQ_FOREACH_SAFE(task, &conn->active_r2t_tasks, link, tmp) {
+ if (task->tag == task_tag) {
+ assert(conn->data_out_cnt >= task->data_out_cnt);
+ conn->data_out_cnt -= task->data_out_cnt;
+
+ conn->pending_r2t--;
+
+ assert(task->is_r2t_active == true);
+ TAILQ_REMOVE(&conn->active_r2t_tasks, task, link);
+ task->is_r2t_active = false;
+ iscsi_task_put(task);
+
+ start_queued_transfer_tasks(conn);
+ return true;
+ }
+ }
+ return false;
+}
+
+void iscsi_clear_all_transfer_task(struct spdk_iscsi_conn *conn,
+ struct spdk_scsi_lun *lun,
+ struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_task *task, *task_tmp;
+ struct spdk_iscsi_pdu *pdu_tmp;
+
+ TAILQ_FOREACH_SAFE(task, &conn->active_r2t_tasks, link, task_tmp) {
+ pdu_tmp = iscsi_task_get_pdu(task);
+ if ((lun == NULL || lun == task->scsi.lun) &&
+ (pdu == NULL || spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn))) {
+ task->outstanding_r2t = 0;
+ task->next_r2t_offset = 0;
+ task->next_expected_r2t_offset = 0;
+ assert(conn->data_out_cnt >= task->data_out_cnt);
+ conn->data_out_cnt -= task->data_out_cnt;
+ conn->pending_r2t--;
+
+ TAILQ_REMOVE(&conn->active_r2t_tasks, task, link);
+ task->is_r2t_active = false;
+ if (lun != NULL && spdk_scsi_lun_is_removing(lun)) {
+ spdk_scsi_task_process_null_lun(&task->scsi);
+ iscsi_task_response(conn, task);
+ }
+ iscsi_task_put(task);
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, task_tmp) {
+ pdu_tmp = iscsi_task_get_pdu(task);
+ if ((lun == NULL || lun == task->scsi.lun) &&
+ (pdu == NULL || spdk_sn32_lt(pdu_tmp->cmd_sn, pdu->cmd_sn))) {
+ TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link);
+ task->is_r2t_active = false;
+ if (lun != NULL && spdk_scsi_lun_is_removing(lun)) {
+ spdk_scsi_task_process_null_lun(&task->scsi);
+ iscsi_task_response(conn, task);
+ }
+ iscsi_task_put(task);
+ }
+ }
+
+ start_queued_transfer_tasks(conn);
+}
+
+static struct spdk_iscsi_task *
+get_transfer_task(struct spdk_iscsi_conn *conn, uint32_t transfer_tag)
+{
+ struct spdk_iscsi_task *task;
+
+ TAILQ_FOREACH(task, &conn->active_r2t_tasks, link) {
+ if (task->ttt == transfer_tag) {
+ return task;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+iscsi_conn_datain_pdu_complete(void *arg)
+{
+ struct spdk_iscsi_conn *conn = arg;
+
+ iscsi_conn_handle_queued_datain_tasks(conn);
+}
+
+static int
+iscsi_send_datain(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task, int datain_flag,
+ int residual_len, int offset, int DataSN, int len)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_bhs_data_in *rsph;
+ uint32_t task_tag;
+ uint32_t transfer_tag;
+ int F_bit, U_bit, O_bit, S_bit;
+ struct spdk_iscsi_task *primary;
+ struct spdk_scsi_lun *lun_dev;
+
+ primary = iscsi_task_get_primary(task);
+
+ /* DATA PDU */
+ rsp_pdu = iscsi_get_pdu(conn);
+ rsph = (struct iscsi_bhs_data_in *)&rsp_pdu->bhs;
+ rsp_pdu->data = task->scsi.iovs[0].iov_base + offset;
+ rsp_pdu->data_buf_len = task->scsi.iovs[0].iov_len - offset;
+ rsp_pdu->data_from_mempool = true;
+
+ task_tag = task->tag;
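+	/* Data-In PDUs carry the reserved Target Transfer Tag 0xffffffff because
+	 * the A (acknowledge) bit is left clear.
+	 */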
+ transfer_tag = 0xffffffffU;
+
+ F_bit = datain_flag & ISCSI_FLAG_FINAL;
+ O_bit = datain_flag & ISCSI_DATAIN_OVERFLOW;
+ U_bit = datain_flag & ISCSI_DATAIN_UNDERFLOW;
+ S_bit = datain_flag & ISCSI_DATAIN_STATUS;
+
+ /*
+ * we need to hold onto this task/cmd until the
+ * PDU has been written out
+ */
+ rsp_pdu->task = task;
+ task->scsi.ref++;
+
+ rsph->opcode = ISCSI_OP_SCSI_DATAIN;
+
+ if (F_bit) {
+ rsph->flags |= ISCSI_FLAG_FINAL;
+ }
+
+ /* we leave the A_bit clear */
+
+ if (F_bit && S_bit) {
+ if (O_bit) {
+ rsph->flags |= ISCSI_DATAIN_OVERFLOW;
+ }
+
+ if (U_bit) {
+ rsph->flags |= ISCSI_DATAIN_UNDERFLOW;
+ }
+ }
+
+ if (S_bit) {
+ rsph->flags |= ISCSI_DATAIN_STATUS;
+ rsph->status = task->scsi.status;
+ }
+
+ DSET24(rsph->data_segment_len, len);
+
+ to_be32(&rsph->itt, task_tag);
+ to_be32(&rsph->ttt, transfer_tag);
+
+ if (S_bit) {
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+ }
+
+ if (F_bit && S_bit && !iscsi_task_is_immediate(primary)) {
+ conn->sess->MaxCmdSN++;
+ }
+
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+
+ to_be32(&rsph->data_sn, DataSN);
+
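+	/* At ErrorRecoveryLevel >= 1, remember the last DataSN sent so
+	 * SNACK-based recovery can bound retransmission requests.
+	 */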
+ if (conn->sess->ErrorRecoveryLevel >= 1) {
+ primary->datain_datasn = DataSN;
+ }
+ DataSN++;
+
+ if (task->parent) {
+ offset += primary->scsi.data_transferred;
+ }
+ to_be32(&rsph->buffer_offset, (uint32_t)offset);
+ task->scsi.offset = offset;
+
+ if (F_bit && S_bit) {
+ to_be32(&rsph->res_cnt, residual_len);
+ }
+
+ lun_dev = spdk_scsi_dev_get_lun(conn->dev, task->lun_id);
+ if (spdk_likely(lun_dev != NULL)) {
+ if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(lun_dev, &task->scsi,
+ &rsp_pdu->dif_ctx))) {
+ rsp_pdu->dif_insert_or_strip = true;
+ }
+ }
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_datain_pdu_complete, conn);
+
+ return DataSN;
+}
+
+static int
+iscsi_transfer_in(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task)
+{
+ uint32_t DataSN;
+ uint32_t transfer_len;
+ uint32_t data_len;
+ uint32_t segment_len;
+ uint32_t offset;
+ uint32_t residual_len = 0;
+ int sent_status;
+ uint32_t len;
+ int datain_flag = 0;
+ int datain_seq_cnt;
+ int i;
+ uint32_t sequence_end;
+ struct spdk_iscsi_task *primary;
+
+ primary = iscsi_task_get_primary(task);
+ segment_len = conn->MaxRecvDataSegmentLength;
+ data_len = task->scsi.data_transferred;
+ transfer_len = task->scsi.length;
+
+ if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) {
+ return 0;
+ }
+
+ if (data_len < transfer_len) {
+ /* underflow */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Underflow %u/%u\n", data_len, transfer_len);
+ residual_len = transfer_len - data_len;
+ transfer_len = data_len;
+ datain_flag |= ISCSI_DATAIN_UNDERFLOW;
+ } else if (data_len > transfer_len) {
+ /* overflow */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Overflow %u/%u\n", data_len, transfer_len);
+ residual_len = data_len - transfer_len;
+ datain_flag |= ISCSI_DATAIN_OVERFLOW;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer %u\n", transfer_len);
+ residual_len = 0;
+ }
+
+ DataSN = primary->datain_datasn;
+ sent_status = 0;
+
+ /* calculate the number of sequences for all data-in pdus */
+ datain_seq_cnt = 1 + ((transfer_len - 1) / (int)conn->sess->MaxBurstLength);
+ for (i = 0; i < datain_seq_cnt; i++) {
+ offset = i * conn->sess->MaxBurstLength;
+ sequence_end = spdk_min(((i + 1) * conn->sess->MaxBurstLength),
+ transfer_len);
+
+ /* send data split into chunks of segment_len */
+ for (; offset < sequence_end; offset += segment_len) {
+ len = spdk_min(segment_len, (sequence_end - offset));
+
+ datain_flag &= ~ISCSI_FLAG_FINAL;
+ datain_flag &= ~ISCSI_DATAIN_STATUS;
+
+ if (offset + len == sequence_end) {
+ /* last PDU in a sequence */
+ datain_flag |= ISCSI_FLAG_FINAL;
+ if (task->scsi.sense_data_len == 0) {
+ /* The last pdu in all data-in pdus */
+ if ((offset + len) == transfer_len &&
+ (primary->bytes_completed == primary->scsi.transfer_len)) {
+ datain_flag |= ISCSI_DATAIN_STATUS;
+ sent_status = 1;
+ }
+ }
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer=%d, Offset=%d, Len=%d\n",
+ sequence_end, offset, len);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, DataSN=%u, Offset=%u, Len=%d\n",
+ conn->StatSN, DataSN, offset, len);
+
+ DataSN = iscsi_send_datain(conn, task, datain_flag, residual_len,
+ offset, DataSN, len);
+ }
+ }
+
+ if (task != primary) {
+ primary->scsi.data_transferred += task->scsi.data_transferred;
+ }
+ primary->datain_datasn = DataSN;
+
+ return sent_status;
+}
+
+void iscsi_task_response(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_bhs_scsi_resp *rsph;
+ uint32_t task_tag;
+ uint32_t transfer_len;
+ size_t residual_len;
+ size_t data_len;
+ int O_bit, U_bit;
+ int rc;
+ struct spdk_iscsi_task *primary;
+
+ primary = iscsi_task_get_primary(task);
+
+ transfer_len = primary->scsi.transfer_len;
+ task_tag = task->tag;
+
+ /* transfer data from logical unit */
+ /* (direction is view of initiator side) */
+ if (iscsi_task_is_read(primary)) {
+ rc = iscsi_transfer_in(conn, task);
+ if (rc > 0) {
+ /* sent status by last DATAIN PDU */
+ return;
+ }
+
+ if (primary->bytes_completed != primary->scsi.transfer_len) {
+ return;
+ }
+ }
+
+ O_bit = U_bit = 0;
+ residual_len = 0;
+ data_len = primary->scsi.data_transferred;
+
+ if ((transfer_len != 0) &&
+ (task->scsi.status == SPDK_SCSI_STATUS_GOOD)) {
+ if (data_len < transfer_len) {
+ /* underflow */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Underflow %zu/%u\n", data_len, transfer_len);
+ residual_len = transfer_len - data_len;
+ U_bit = 1;
+ } else if (data_len > transfer_len) {
+ /* overflow */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Overflow %zu/%u\n", data_len, transfer_len);
+ residual_len = data_len - transfer_len;
+ O_bit = 1;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer %u\n", transfer_len);
+ }
+ }
+
+ /* response PDU */
+ rsp_pdu = iscsi_get_pdu(conn);
+ assert(rsp_pdu != NULL);
+ rsph = (struct iscsi_bhs_scsi_resp *)&rsp_pdu->bhs;
+ assert(task->scsi.sense_data_len <= sizeof(rsp_pdu->sense.data));
+ memcpy(rsp_pdu->sense.data, task->scsi.sense_data, task->scsi.sense_data_len);
+ to_be16(&rsp_pdu->sense.length, task->scsi.sense_data_len);
+ rsp_pdu->data = (uint8_t *)&rsp_pdu->sense;
+ rsp_pdu->data_from_mempool = true;
+
+ /*
+ * we need to hold onto this task/cmd until the
+ * PDU has been written out
+ */
+ rsp_pdu->task = task;
+ task->scsi.ref++;
+
+ rsph->opcode = ISCSI_OP_SCSI_RSP;
+ rsph->flags |= 0x80; /* bit 0 must be 1 */
+
+ if (O_bit) {
+ rsph->flags |= ISCSI_SCSI_OVERFLOW;
+ }
+
+ if (U_bit) {
+ rsph->flags |= ISCSI_SCSI_UNDERFLOW;
+ }
+
+ rsph->status = task->scsi.status;
+ if (task->scsi.sense_data_len) {
+ /* SenseLength (2 bytes) + SenseData */
+ DSET24(rsph->data_segment_len, 2 + task->scsi.sense_data_len);
+ }
+ to_be32(&rsph->itt, task_tag);
+
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+
+ if (!iscsi_task_is_immediate(primary)) {
+ conn->sess->MaxCmdSN++;
+ }
+
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+
+ to_be32(&rsph->bi_read_res_cnt, 0);
+ to_be32(&rsph->res_cnt, residual_len);
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL);
+}
+
+/*
+ * This function compares the input PDU's BHS with the BHS of the PDUs associated
+ * with the active_r2t_tasks and queued_r2t_tasks lists of a connection
+ */
+static bool
+iscsi_compare_pdu_bhs_within_existed_r2t_tasks(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_task *task;
+
+ TAILQ_FOREACH(task, &conn->active_r2t_tasks, link) {
+ if (!memcmp(&pdu->bhs, iscsi_task_get_bhs(task), ISCSI_BHS_LEN)) {
+ return true;
+ }
+ }
+
+ TAILQ_FOREACH(task, &conn->queued_r2t_tasks, link) {
+ if (!memcmp(&pdu->bhs, iscsi_task_get_bhs(task), ISCSI_BHS_LEN)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+iscsi_queue_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task)
+{
+ spdk_trace_record(TRACE_ISCSI_TASK_QUEUE, conn->id, task->scsi.length,
+ (uintptr_t)task, (uintptr_t)task->pdu);
+ task->is_queued = true;
+ spdk_scsi_dev_queue_task(conn->dev, &task->scsi);
+}
+
+static int
+iscsi_pdu_payload_op_scsi_read(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task)
+{
+ if (task->scsi.transfer_len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
+ task->parent = NULL;
+ task->scsi.offset = 0;
+ task->scsi.length = task->scsi.transfer_len;
+ spdk_scsi_task_set_data(&task->scsi, NULL, 0);
+
+ iscsi_queue_task(conn, task);
+ return 0;
+ } else {
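+		/* Reads larger than one large bdev buffer are queued and later
+		 * split into sequential subtasks by
+		 * iscsi_conn_handle_queued_datain_tasks().
+		 */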
+ TAILQ_INIT(&task->subtask_list);
+ task->current_datain_offset = 0;
+ TAILQ_INSERT_TAIL(&conn->queued_datain_tasks, task, link);
+
+ return iscsi_conn_handle_queued_datain_tasks(conn);
+ }
+}
+
+static int
+iscsi_pdu_payload_op_scsi_write(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task)
+{
+ struct spdk_iscsi_pdu *pdu;
+ struct iscsi_bhs_scsi_req *reqh;
+ uint32_t transfer_len;
+ uint32_t scsi_data_len;
+ int rc;
+
+ pdu = iscsi_task_get_pdu(task);
+ reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs;
+
+ transfer_len = task->scsi.transfer_len;
+
+ if (spdk_likely(!pdu->dif_insert_or_strip)) {
+ scsi_data_len = pdu->data_segment_len;
+ } else {
+ scsi_data_len = pdu->data_buf_len;
+ }
+
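+	/* If the initiator sent less data than the expected transfer length,
+	 * solicit the remainder with R2Ts.
+	 */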
+ if (reqh->final_bit &&
+ pdu->data_segment_len < transfer_len) {
+ /* needs R2T */
+ rc = add_transfer_task(conn, task);
+ if (rc < 0) {
+ SPDK_ERRLOG("add_transfer_task() failed\n");
+ iscsi_task_put(task);
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ /* Non-immediate writes */
+ if (pdu->data_segment_len == 0) {
+ return 0;
+ } else {
+ /* we are doing the first partial write task */
+ task->scsi.ref++;
+ spdk_scsi_task_set_data(&task->scsi, pdu->data, scsi_data_len);
+ task->scsi.length = pdu->data_segment_len;
+ }
+ }
+
+ if (pdu->data_segment_len == transfer_len) {
+ /* we are doing small writes with no R2T */
+ spdk_scsi_task_set_data(&task->scsi, pdu->data, scsi_data_len);
+ task->scsi.length = transfer_len;
+ }
+
+ iscsi_queue_task(conn, task);
+ return 0;
+}
+
+static int
+iscsi_pdu_hdr_op_scsi(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_task *task;
+ struct spdk_scsi_dev *dev;
+ uint8_t *cdb;
+ uint64_t lun;
+ uint32_t task_tag;
+ uint32_t transfer_len;
+ int R_bit, W_bit;
+ int lun_i;
+ struct iscsi_bhs_scsi_req *reqh;
+
+ if (conn->sess->session_type != SESSION_TYPE_NORMAL) {
+ SPDK_ERRLOG("ISCSI_OP_SCSI not allowed in discovery and invalid session\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs;
+
+ R_bit = reqh->read_bit;
+ W_bit = reqh->write_bit;
+ lun = from_be64(&reqh->lun);
+ task_tag = from_be32(&reqh->itt);
+ transfer_len = from_be32(&reqh->expected_data_xfer_len);
+ cdb = reqh->cdb;
+
+ SPDK_LOGDUMP(SPDK_LOG_ISCSI, "CDB", cdb, 16);
+
+ task = iscsi_task_get(conn, NULL, iscsi_task_cpl);
+ if (!task) {
+ SPDK_ERRLOG("Unable to acquire task\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ iscsi_task_associate_pdu(task, pdu);
+ lun_i = spdk_scsi_lun_id_fmt_to_int(lun);
+ task->lun_id = lun_i;
+ dev = conn->dev;
+ task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_i);
+
+ if ((R_bit != 0) && (W_bit != 0)) {
+ SPDK_ERRLOG("Bidirectional CDB is not supported\n");
+ iscsi_task_put(task);
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ task->scsi.cdb = cdb;
+ task->tag = task_tag;
+ task->scsi.transfer_len = transfer_len;
+ task->scsi.target_port = conn->target_port;
+ task->scsi.initiator_port = conn->initiator_port;
+ task->parent = NULL;
+ task->rsp_scsi_status = SPDK_SCSI_STATUS_GOOD;
+
+ if (task->scsi.lun == NULL) {
+ spdk_scsi_task_process_null_lun(&task->scsi);
+ iscsi_task_cpl(&task->scsi);
+ return 0;
+ }
+
+ /* no bi-directional support */
+ if (R_bit) {
+ task->scsi.dxfer_dir = SPDK_SCSI_DIR_FROM_DEV;
+ } else if (W_bit) {
+ task->scsi.dxfer_dir = SPDK_SCSI_DIR_TO_DEV;
+
+ if ((conn->sess->ErrorRecoveryLevel >= 1) &&
+ (iscsi_compare_pdu_bhs_within_existed_r2t_tasks(conn, pdu))) {
+ iscsi_task_response(conn, task);
+ iscsi_task_put(task);
+ return 0;
+ }
+
+ if (pdu->data_segment_len > iscsi_get_max_immediate_data_size()) {
+ SPDK_ERRLOG("data segment len(=%zu) > immediate data len(=%"PRIu32")\n",
+ pdu->data_segment_len, iscsi_get_max_immediate_data_size());
+ iscsi_task_put(task);
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ if (pdu->data_segment_len > transfer_len) {
+ SPDK_ERRLOG("data segment len(=%zu) > task transfer len(=%d)\n",
+ pdu->data_segment_len, transfer_len);
+ iscsi_task_put(task);
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ /* check the ImmediateData and also pdu->data_segment_len */
+ if ((!conn->sess->ImmediateData && (pdu->data_segment_len > 0)) ||
+ (pdu->data_segment_len > conn->sess->FirstBurstLength)) {
+ iscsi_task_put(task);
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(task->scsi.lun, &task->scsi, &pdu->dif_ctx))) {
+ pdu->dif_insert_or_strip = true;
+ }
+ } else {
+ /* neither R nor W bit set */
+ task->scsi.dxfer_dir = SPDK_SCSI_DIR_NONE;
+ if (transfer_len > 0) {
+ iscsi_task_put(task);
+ SPDK_ERRLOG("Reject scsi cmd with EDTL > 0 but (R | W) == 0\n");
+ return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD);
+ }
+ }
+
+ pdu->task = task;
+ return 0;
+}
+
+static int
+iscsi_pdu_payload_op_scsi(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_task *task;
+
+ if (pdu->task == NULL) {
+ return 0;
+ }
+
+ task = pdu->task;
+
+ if (spdk_scsi_dev_get_lun(conn->dev, task->lun_id) == NULL) {
+ spdk_scsi_task_process_null_lun(&task->scsi);
+ iscsi_task_cpl(&task->scsi);
+ return 0;
+ }
+
+ switch (task->scsi.dxfer_dir) {
+ case SPDK_SCSI_DIR_FROM_DEV:
+ return iscsi_pdu_payload_op_scsi_read(conn, task);
+ case SPDK_SCSI_DIR_TO_DEV:
+ return iscsi_pdu_payload_op_scsi_write(conn, task);
+ case SPDK_SCSI_DIR_NONE:
+ iscsi_queue_task(conn, task);
+ return 0;
+ default:
+ assert(false);
+ iscsi_task_put(task);
+ break;
+ }
+
+ return SPDK_ISCSI_CONNECTION_FATAL;
+}
+
+static void
+abort_transfer_task_in_task_mgmt_resp(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task)
+{
+ struct spdk_iscsi_pdu *pdu;
+
+ pdu = iscsi_task_get_pdu(task);
+
+ switch (task->scsi.function) {
+ /* abort task identified by Reference Task Tag field */
+ case ISCSI_TASK_FUNC_ABORT_TASK:
+ iscsi_del_transfer_task(conn, task->scsi.abort_id);
+ break;
+
+ /* abort all tasks issued via this session on the LUN */
+ case ISCSI_TASK_FUNC_ABORT_TASK_SET:
+ iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu);
+ break;
+
+ case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET:
+ iscsi_clear_all_transfer_task(conn, task->scsi.lun, pdu);
+ break;
+ }
+}
+
+void
+iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_bhs_task_req *reqh;
+ struct iscsi_bhs_task_resp *rsph;
+
+ if (task->pdu == NULL) {
+ /*
+ * This was an internally generated task management command,
+ * usually from LUN cleanup when a connection closes.
+ */
+ return;
+ }
+
+ reqh = (struct iscsi_bhs_task_req *)&task->pdu->bhs;
+ /* response PDU */
+ rsp_pdu = iscsi_get_pdu(conn);
+ rsph = (struct iscsi_bhs_task_resp *)&rsp_pdu->bhs;
+ rsph->opcode = ISCSI_OP_TASK_RSP;
+ rsph->flags |= 0x80; /* bit 0 must be 1 */
+ switch (task->scsi.response) {
+ case SPDK_SCSI_TASK_MGMT_RESP_COMPLETE:
+ abort_transfer_task_in_task_mgmt_resp(conn, task);
+ rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE;
+ break;
+ case SPDK_SCSI_TASK_MGMT_RESP_SUCCESS:
+ abort_transfer_task_in_task_mgmt_resp(conn, task);
+ rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE;
+ break;
+ case SPDK_SCSI_TASK_MGMT_RESP_REJECT:
+ rsph->response = ISCSI_TASK_FUNC_REJECTED;
+ break;
+ case SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN:
+ rsph->response = ISCSI_TASK_FUNC_RESP_LUN_NOT_EXIST;
+ break;
+ case SPDK_SCSI_TASK_MGMT_RESP_TARGET_FAILURE:
+ rsph->response = ISCSI_TASK_FUNC_REJECTED;
+ break;
+ case SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED:
+ rsph->response = ISCSI_TASK_FUNC_RESP_FUNC_NOT_SUPPORTED;
+ break;
+ }
+ rsph->itt = reqh->itt;
+
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+
+ if (reqh->immediate == 0) {
+ conn->sess->MaxCmdSN++;
+ }
+
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL);
+}
+
+static void
+iscsi_queue_mgmt_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task)
+{
+ struct spdk_scsi_lun *lun;
+
+ lun = spdk_scsi_dev_get_lun(conn->dev, task->lun_id);
+ if (lun == NULL) {
+ task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN;
+ iscsi_task_mgmt_response(conn, task);
+ iscsi_task_put(task);
+ return;
+ }
+
+ spdk_scsi_dev_queue_mgmt_task(conn->dev, &task->scsi);
+}
+
+static int
+_iscsi_op_abort_task(void *arg)
+{
+ struct spdk_iscsi_task *task = arg;
+ int rc;
+
+ rc = iscsi_conn_abort_queued_datain_task(task->conn, task->scsi.abort_id);
+ if (rc != 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ spdk_poller_unregister(&task->mgmt_poller);
+ iscsi_queue_mgmt_task(task->conn, task);
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+iscsi_op_abort_task(struct spdk_iscsi_task *task, uint32_t ref_task_tag)
+{
+ task->scsi.abort_id = ref_task_tag;
+ task->scsi.function = SPDK_SCSI_TASK_FUNC_ABORT_TASK;
+ task->mgmt_poller = SPDK_POLLER_REGISTER(_iscsi_op_abort_task, task, 10);
+}
+
+static int
+_iscsi_op_abort_task_set(void *arg)
+{
+ struct spdk_iscsi_task *task = arg;
+ int rc;
+
+ rc = iscsi_conn_abort_queued_datain_tasks(task->conn, task->scsi.lun,
+ task->pdu);
+ if (rc != 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ spdk_poller_unregister(&task->mgmt_poller);
+ iscsi_queue_mgmt_task(task->conn, task);
+ return SPDK_POLLER_BUSY;
+}
+
+void
+iscsi_op_abort_task_set(struct spdk_iscsi_task *task, uint8_t function)
+{
+ task->scsi.function = function;
+ task->mgmt_poller = SPDK_POLLER_REGISTER(_iscsi_op_abort_task_set, task, 10);
+}
+
+static int
+iscsi_pdu_hdr_op_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct iscsi_bhs_task_req *reqh;
+ uint64_t lun;
+ uint32_t task_tag;
+ uint32_t ref_task_tag;
+ uint8_t function;
+ int lun_i;
+ struct spdk_iscsi_task *task;
+ struct spdk_scsi_dev *dev;
+
+ if (conn->sess->session_type != SESSION_TYPE_NORMAL) {
+ SPDK_ERRLOG("ISCSI_OP_TASK not allowed in discovery and invalid session\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ reqh = (struct iscsi_bhs_task_req *)&pdu->bhs;
+ function = reqh->flags & ISCSI_TASK_FUNCTION_MASK;
+ lun = from_be64(&reqh->lun);
+ task_tag = from_be32(&reqh->itt);
+ ref_task_tag = from_be32(&reqh->ref_task_tag);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, func=%d, ITT=%x, ref TT=%x, LUN=0x%16.16"PRIx64"\n",
+ reqh->immediate, function, task_tag, ref_task_tag, lun);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n",
+ conn->StatSN, conn->sess->ExpCmdSN, conn->sess->MaxCmdSN);
+
+ lun_i = spdk_scsi_lun_id_fmt_to_int(lun);
+ dev = conn->dev;
+
+ task = iscsi_task_get(conn, NULL, iscsi_task_mgmt_cpl);
+ if (!task) {
+ SPDK_ERRLOG("Unable to acquire task\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ iscsi_task_associate_pdu(task, pdu);
+ task->scsi.target_port = conn->target_port;
+ task->scsi.initiator_port = conn->initiator_port;
+ task->tag = task_tag;
+ task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_i);
+ task->lun_id = lun_i;
+
+ if (task->scsi.lun == NULL) {
+ task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN;
+ iscsi_task_mgmt_response(conn, task);
+ iscsi_task_put(task);
+ return 0;
+ }
+
+ switch (function) {
+ /* abort task identified by Referenced Task Tag field */
+ case ISCSI_TASK_FUNC_ABORT_TASK:
+ SPDK_NOTICELOG("ABORT_TASK\n");
+
+ iscsi_op_abort_task(task, ref_task_tag);
+ return 0;
+
+ /* abort all tasks issued via this session on the LUN */
+ case ISCSI_TASK_FUNC_ABORT_TASK_SET:
+ SPDK_NOTICELOG("ABORT_TASK_SET\n");
+
+ iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET);
+ return 0;
+
+ case ISCSI_TASK_FUNC_CLEAR_TASK_SET:
+ task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED;
+ SPDK_NOTICELOG("CLEAR_TASK_SET (Unsupported)\n");
+ break;
+
+ case ISCSI_TASK_FUNC_CLEAR_ACA:
+ task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED;
+ SPDK_NOTICELOG("CLEAR_ACA (Unsupported)\n");
+ break;
+
+ case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET:
+ SPDK_NOTICELOG("LOGICAL_UNIT_RESET\n");
+
+ iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_LUN_RESET);
+ return 0;
+
+ case ISCSI_TASK_FUNC_TARGET_WARM_RESET:
+ SPDK_NOTICELOG("TARGET_WARM_RESET (Unsupported)\n");
+ task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED;
+ break;
+
+ case ISCSI_TASK_FUNC_TARGET_COLD_RESET:
+ SPDK_NOTICELOG("TARGET_COLD_RESET (Unsupported)\n");
+ task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED;
+ break;
+
+ case ISCSI_TASK_FUNC_TASK_REASSIGN:
+ SPDK_NOTICELOG("TASK_REASSIGN (Unsupported)\n");
+ task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED;
+ break;
+
+ default:
+ SPDK_ERRLOG("unsupported function %d\n", function);
+ task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT;
+ break;
+ }
+
+ iscsi_task_mgmt_response(conn, task);
+ iscsi_task_put(task);
+ return 0;
+}
+
+static int
+iscsi_pdu_hdr_op_nopout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct iscsi_bhs_nop_out *reqh;
+ uint32_t task_tag;
+ uint32_t transfer_tag;
+ int I_bit;
+
+ if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) {
+ SPDK_ERRLOG("ISCSI_OP_NOPOUT not allowed in discovery session\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ reqh = (struct iscsi_bhs_nop_out *)&pdu->bhs;
+ I_bit = reqh->immediate;
+
+ if (pdu->data_segment_len > SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) {
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ task_tag = from_be32(&reqh->itt);
+ transfer_tag = from_be32(&reqh->ttt);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, ITT=%x, TTT=%x\n",
+ I_bit, task_tag, transfer_tag);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n",
+ pdu->cmd_sn, conn->StatSN, conn->sess->ExpCmdSN,
+ conn->sess->MaxCmdSN);
+
+ if (transfer_tag != 0xFFFFFFFF && transfer_tag != (uint32_t)conn->id) {
+ SPDK_ERRLOG("invalid transfer tag 0x%x\n", transfer_tag);
+ /*
+ * Technically we should probably fail the connection here, but for now
+ * just print the error message and continue.
+ */
+ }
+
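+	/* An ITT of 0xffffffff marks a reply to a target-initiated NOP-In and
+	 * must be sent as an immediate PDU.
+	 */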
+ if (task_tag == 0xffffffffU && I_bit == 0) {
+ SPDK_ERRLOG("got NOPOUT ITT=0xffffffff, I=0\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ return 0;
+}
+
+static int
+iscsi_pdu_payload_op_nopout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_pdu *rsp_pdu;
+ struct iscsi_bhs_nop_out *reqh;
+ struct iscsi_bhs_nop_in *rsph;
+ uint8_t *data;
+ uint64_t lun;
+ uint32_t task_tag;
+ int I_bit;
+ int data_len;
+
+ reqh = (struct iscsi_bhs_nop_out *)&pdu->bhs;
+ I_bit = reqh->immediate;
+
+ data_len = pdu->data_segment_len;
+ if (data_len > conn->MaxRecvDataSegmentLength) {
+ data_len = conn->MaxRecvDataSegmentLength;
+ }
+
+ lun = from_be64(&reqh->lun);
+ task_tag = from_be32(&reqh->itt);
+
+ /*
+ * We don't actually check to see if this is a response to the NOP-In
+ * that we sent. Our goal is to just verify that the initiator is
+ * alive and responding to commands, not to verify that it tags
+ * NOP-Outs correctly.
+ */
+ conn->nop_outstanding = false;
+
+ if (task_tag == 0xffffffffU) {
+ assert(I_bit == 1);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got NOPOUT ITT=0xffffffff\n");
+ return 0;
+ }
+
+ data = calloc(1, data_len);
+ if (!data) {
+ SPDK_ERRLOG("calloc() failed for ping data\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ /* response of NOPOUT */
+ if (data_len > 0) {
+ /* copy ping data */
+ memcpy(data, pdu->data, data_len);
+ }
+
+ /* response PDU */
+ rsp_pdu = iscsi_get_pdu(conn);
+ assert(rsp_pdu != NULL);
+
+ rsph = (struct iscsi_bhs_nop_in *)&rsp_pdu->bhs;
+ rsp_pdu->data = data;
+ rsph->opcode = ISCSI_OP_NOPIN;
+ rsph->flags |= 0x80; /* bit 0 must be 1 */
+ DSET24(rsph->data_segment_len, data_len);
+ to_be64(&rsph->lun, lun);
+ to_be32(&rsph->itt, task_tag);
+ to_be32(&rsph->ttt, 0xffffffffU);
+
+ to_be32(&rsph->stat_sn, conn->StatSN);
+ conn->StatSN++;
+
+ if (I_bit == 0) {
+ conn->sess->MaxCmdSN++;
+ }
+
+ to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN);
+ to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN);
+
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL);
+ conn->last_nopin = spdk_get_ticks();
+
+ return 0;
+}
+
+/* This function returns the spdk_iscsi_task by searching the snack list via
+ * task transfer tag and the PDU's opcode
+ */
+static struct spdk_iscsi_task *
+get_scsi_task_from_ttt(struct spdk_iscsi_conn *conn, uint32_t transfer_tag)
+{
+ struct spdk_iscsi_pdu *pdu;
+ struct iscsi_bhs_data_in *datain_bhs;
+
+ TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) {
+ if (pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) {
+ datain_bhs = (struct iscsi_bhs_data_in *)&pdu->bhs;
+ if (from_be32(&datain_bhs->ttt) == transfer_tag) {
+ return pdu->task;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/* This function returns the spdk_iscsi_task by searching the snack list via
+ * initiator task tag and the PDU's opcode
+ */
+static struct spdk_iscsi_task *
+get_scsi_task_from_itt(struct spdk_iscsi_conn *conn,
+ uint32_t task_tag, enum iscsi_op opcode)
+{
+ struct spdk_iscsi_pdu *pdu;
+
+ TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) {
+ if (pdu->bhs.opcode == opcode &&
+ pdu->task != NULL &&
+ pdu->task->tag == task_tag) {
+ return pdu->task;
+ }
+ }
+
+ return NULL;
+}
+
+/* This function is used to handle the r2t snack */
+static int
+iscsi_handle_r2t_snack(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task,
+ struct spdk_iscsi_pdu *pdu, uint32_t beg_run,
+ uint32_t run_length, int32_t task_tag)
+{
+ int32_t last_r2tsn;
+ int i;
+
+ if (beg_run < task->acked_r2tsn) {
+ SPDK_ERRLOG("ITT: 0x%08x, R2T SNACK requests retransmission of"
+ "R2TSN: from 0x%08x to 0x%08x. But it has already"
+ "ack to R2TSN:0x%08x, protocol error.\n",
+ task_tag, beg_run, (beg_run + run_length),
+ (task->acked_r2tsn - 1));
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ if (run_length) {
+ if ((beg_run + run_length) > task->R2TSN) {
+ SPDK_ERRLOG("ITT: 0x%08x, received R2T SNACK with"
+ "BegRun: 0x%08x, RunLength: 0x%08x, exceeds"
+ "current R2TSN: 0x%08x, protocol error.\n",
+ task_tag, beg_run, run_length,
+ task->R2TSN);
+
+ return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD);
+ }
+ last_r2tsn = (beg_run + run_length);
+ } else {
+ last_r2tsn = task->R2TSN;
+ }
+
+ for (i = beg_run; i < last_r2tsn; i++) {
+ if (iscsi_send_r2t_recovery(conn, task, i, false) < 0) {
+ SPDK_ERRLOG("The r2t_sn=%d of r2t_task=%p is not sent\n", i, task);
+ }
+ }
+ return 0;
+}
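+
+/* Worked example (editorial sketch, not part of the original code): if the
+ * task has issued R2Ts with R2TSN 0 through 4 (task->R2TSN == 5) and the
+ * initiator sends an R2T SNACK with BegRun = 2 and RunLength = 2, the loop
+ * above resends R2TSN 2 and 3. With RunLength = 0 the target resends every
+ * R2T from BegRun up to, but not including, task->R2TSN.
+ */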
+
+/* This function retransmits Data-In PDUs requested by a Data-In SNACK */
+static int
+iscsi_handle_recovery_datain(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task,
+ struct spdk_iscsi_pdu *pdu, uint32_t beg_run,
+ uint32_t run_length, uint32_t task_tag)
+{
+ struct spdk_iscsi_pdu *old_pdu, *pdu_temp;
+ uint32_t i;
+ struct iscsi_bhs_data_in *datain_header;
+ uint32_t last_statsn;
+
+ task = iscsi_task_get_primary(task);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_handle_recovery_datain\n");
+
+ if (beg_run < task->acked_data_sn) {
+ SPDK_ERRLOG("ITT: 0x%08x, DATA IN SNACK requests retransmission of"
+ "DATASN: from 0x%08x to 0x%08x but already acked to "
+ "DATASN: 0x%08x protocol error\n",
+ task_tag, beg_run,
+ (beg_run + run_length), (task->acked_data_sn - 1));
+
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ if (run_length == 0) {
+		/* A RunLength of 0 means all Data-In PDUs; DataSN begins at 0 */
+ run_length = task->datain_datasn + 1;
+ }
+
+ if ((beg_run + run_length - 1) > task->datain_datasn) {
+ SPDK_ERRLOG("Initiator requests BegRun: 0x%08x, RunLength:"
+ "0x%08x greater than maximum DataSN: 0x%08x.\n",
+ beg_run, run_length, task->datain_datasn);
+
+ return -1;
+ } else {
+ last_statsn = beg_run + run_length - 1;
+ }
+
+ for (i = beg_run; i <= last_statsn; i++) {
+ TAILQ_FOREACH_SAFE(old_pdu, &conn->snack_pdu_list, tailq, pdu_temp) {
+ if (old_pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) {
+ datain_header = (struct iscsi_bhs_data_in *)&old_pdu->bhs;
+ if (from_be32(&datain_header->itt) == task_tag &&
+ from_be32(&datain_header->data_sn) == i) {
+ TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq);
+ iscsi_conn_write_pdu(conn, old_pdu, old_pdu->cb_fn, old_pdu->cb_arg);
+ break;
+ }
+ }
+ }
+ }
+ return 0;
+}
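+
+/* Worked example (editorial sketch, not part of the original code): if a read
+ * produced Data-In PDUs with DataSN 0 through 7 (task->datain_datasn == 7) and
+ * the initiator sends a Data-In SNACK with BegRun = 3 and RunLength = 2, the
+ * loop above re-queues the PDUs with DataSN 3 and 4 from the SNACK list. With
+ * BegRun = 0 and RunLength = 0 all eight Data-In PDUs are retransmitted.
+ */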
+
+/* This function handles a Status SNACK request */
+static int
+iscsi_handle_status_snack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ uint32_t beg_run;
+ uint32_t run_length;
+ struct iscsi_bhs_snack_req *reqh;
+ uint32_t i;
+ uint32_t last_statsn;
+ bool found_pdu;
+ struct spdk_iscsi_pdu *old_pdu;
+
+ reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs;
+ beg_run = from_be32(&reqh->beg_run);
+ run_length = from_be32(&reqh->run_len);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d, run_length=%d, conn->StatSN="
+ "%d, conn->exp_statsn=%d\n", beg_run, run_length,
+ conn->StatSN, conn->exp_statsn);
+
+ if (!beg_run) {
+ beg_run = conn->exp_statsn;
+ } else if (beg_run < conn->exp_statsn) {
+ SPDK_ERRLOG("Got Status SNACK Begrun: 0x%08x, RunLength: 0x%08x "
+ "but already got ExpStatSN: 0x%08x on CID:%hu.\n",
+ beg_run, run_length, conn->StatSN, conn->cid);
+
+ return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD);
+ }
+
+ last_statsn = (!run_length) ? conn->StatSN : (beg_run + run_length);
+
+ for (i = beg_run; i < last_statsn; i++) {
+ found_pdu = false;
+ TAILQ_FOREACH(old_pdu, &conn->snack_pdu_list, tailq) {
+ if (from_be32(&old_pdu->bhs.stat_sn) == i) {
+ found_pdu = true;
+ break;
+ }
+ }
+
+ if (!found_pdu) {
+ SPDK_ERRLOG("Unable to find StatSN: 0x%08x. For a Status"
+ "SNACK, assuming this is a proactive SNACK "
+ "for an untransmitted StatSN, ignoring.\n",
+ beg_run);
+ } else {
+ TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq);
+ iscsi_conn_write_pdu(conn, old_pdu, old_pdu->cb_fn, old_pdu->cb_arg);
+ }
+ }
+
+ return 0;
+}
+
+/* This function handles a DataACK SNACK request */
+static int
+iscsi_handle_data_ack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ uint32_t transfer_tag;
+ uint32_t beg_run;
+ uint32_t run_length;
+ struct spdk_iscsi_pdu *old_pdu;
+ uint32_t old_datasn;
+ struct iscsi_bhs_snack_req *reqh;
+ struct spdk_iscsi_task *task;
+ struct iscsi_bhs_data_in *datain_header;
+ struct spdk_iscsi_task *primary;
+
+ reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs;
+ transfer_tag = from_be32(&reqh->ttt);
+ beg_run = from_be32(&reqh->beg_run);
+ run_length = from_be32(&reqh->run_len);
+ task = NULL;
+ datain_header = NULL;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d,transfer_tag=%d,run_len=%d\n",
+ beg_run, transfer_tag, run_length);
+
+ task = get_scsi_task_from_ttt(conn, transfer_tag);
+ if (!task) {
+ SPDK_ERRLOG("Data ACK SNACK for TTT: 0x%08x is invalid.\n",
+ transfer_tag);
+ goto reject_return;
+ }
+
+ primary = iscsi_task_get_primary(task);
+ if ((run_length != 0) || (beg_run < primary->acked_data_sn)) {
+ SPDK_ERRLOG("TTT: 0x%08x Data ACK SNACK BegRUN: %d is less than "
+ "the next expected acked DataSN: %d\n",
+ transfer_tag, beg_run, primary->acked_data_sn);
+ goto reject_return;
+ }
+
+ primary->acked_data_sn = beg_run;
+
+	/* Free the PDU that this DataACK acknowledges */
+ TAILQ_FOREACH(old_pdu, &conn->snack_pdu_list, tailq) {
+ if (old_pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) {
+ datain_header = (struct iscsi_bhs_data_in *) &old_pdu->bhs;
+ old_datasn = from_be32(&datain_header->data_sn);
+ if ((from_be32(&datain_header->ttt) == transfer_tag) &&
+ (old_datasn == beg_run - 1)) {
+ TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq);
+ iscsi_conn_free_pdu(conn, old_pdu);
+ break;
+ }
+ }
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Received Data ACK SNACK for TTT: 0x%08x,"
+ " updated acked DataSN to 0x%08x.\n", transfer_tag,
+ (task->acked_data_sn - 1));
+
+ return 0;
+
+reject_return:
+ return iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_SNACK);
+}
+
+/* This function handles a SNACK request from the initiator */
+static int
+iscsi_pdu_hdr_op_snack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct iscsi_bhs_snack_req *reqh;
+ struct spdk_iscsi_task *task;
+ int type;
+ uint32_t task_tag;
+ uint32_t beg_run;
+ uint32_t run_length;
+ int rc;
+
+ if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) {
+ SPDK_ERRLOG("ISCSI_OP_SNACK not allowed in discovery session\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs;
+ if (!conn->sess->ErrorRecoveryLevel) {
+ SPDK_ERRLOG("Got a SNACK request in ErrorRecoveryLevel=0\n");
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ type = reqh->flags & ISCSI_FLAG_SNACK_TYPE_MASK;
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "The value of type is %d\n", type);
+
+ switch (type) {
+ case 0:
+ reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs;
+ task_tag = from_be32(&reqh->itt);
+ beg_run = from_be32(&reqh->beg_run);
+ run_length = from_be32(&reqh->run_len);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d, run_length=%d, "
+ "task_tag=%x, transfer_tag=%u\n", beg_run,
+ run_length, task_tag, from_be32(&reqh->ttt));
+
+ task = get_scsi_task_from_itt(conn, task_tag,
+ ISCSI_OP_SCSI_DATAIN);
+ if (task) {
+ return iscsi_handle_recovery_datain(conn, task, pdu,
+ beg_run, run_length, task_tag);
+ }
+ task = get_scsi_task_from_itt(conn, task_tag, ISCSI_OP_R2T);
+ if (task) {
+ return iscsi_handle_r2t_snack(conn, task, pdu, beg_run,
+ run_length, task_tag);
+ }
+ SPDK_ERRLOG("It is Neither datain nor r2t recovery request\n");
+ rc = -1;
+ break;
+ case ISCSI_FLAG_SNACK_TYPE_STATUS:
+ rc = iscsi_handle_status_snack(conn, pdu);
+ break;
+ case ISCSI_FLAG_SNACK_TYPE_DATA_ACK:
+ rc = iscsi_handle_data_ack(conn, pdu);
+ break;
+ case ISCSI_FLAG_SNACK_TYPE_RDATA:
+ SPDK_ERRLOG("R-Data SNACK is Not Supported int spdk\n");
+ rc = iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ break;
+ default:
+ SPDK_ERRLOG("Unknown SNACK type %d, protocol error\n", type);
+ rc = iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ break;
+ }
+
+ return rc;
+}
+
+static int
+iscsi_pdu_hdr_op_data(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_task *task, *subtask;
+ struct iscsi_bhs_data_out *reqh;
+ struct spdk_scsi_lun *lun_dev;
+ uint32_t transfer_tag;
+ uint32_t task_tag;
+ uint32_t transfer_len;
+ uint32_t DataSN;
+ uint32_t buffer_offset;
+ uint32_t len;
+ int F_bit;
+ int rc;
+ int reject_reason = ISCSI_REASON_INVALID_PDU_FIELD;
+
+ if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) {
+ SPDK_ERRLOG("ISCSI_OP_SCSI_DATAOUT not allowed in discovery session\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ reqh = (struct iscsi_bhs_data_out *)&pdu->bhs;
+ F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL);
+ transfer_tag = from_be32(&reqh->ttt);
+ task_tag = from_be32(&reqh->itt);
+ DataSN = from_be32(&reqh->data_sn);
+ buffer_offset = from_be32(&reqh->buffer_offset);
+
+ if (pdu->data_segment_len > SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) {
+ reject_reason = ISCSI_REASON_PROTOCOL_ERROR;
+ goto reject_return;
+ }
+
+ task = get_transfer_task(conn, transfer_tag);
+ if (task == NULL) {
+ SPDK_ERRLOG("Not found task for transfer_tag=%x\n", transfer_tag);
+ goto reject_return;
+ }
+
+ lun_dev = spdk_scsi_dev_get_lun(conn->dev, task->lun_id);
+
+ if (pdu->data_segment_len > task->desired_data_transfer_length) {
+ SPDK_ERRLOG("the dataout pdu data length is larger than the value sent by R2T PDU\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ if (task->tag != task_tag) {
+ SPDK_ERRLOG("The r2t task tag is %u, and the dataout task tag is %u\n",
+ task->tag, task_tag);
+ goto reject_return;
+ }
+
+ if (DataSN != task->r2t_datasn) {
+ SPDK_ERRLOG("DataSN(%u) exp=%d error\n", DataSN, task->r2t_datasn);
+ if (conn->sess->ErrorRecoveryLevel >= 1) {
+ goto send_r2t_recovery_return;
+ } else {
+ reject_reason = ISCSI_REASON_PROTOCOL_ERROR;
+ goto reject_return;
+ }
+ }
+
+ if (buffer_offset != task->next_expected_r2t_offset) {
+ SPDK_ERRLOG("offset(%u) error\n", buffer_offset);
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ transfer_len = task->scsi.transfer_len;
+ task->current_r2t_length += pdu->data_segment_len;
+ task->next_expected_r2t_offset += pdu->data_segment_len;
+ task->r2t_datasn++;
+
+ if (task->current_r2t_length > conn->sess->MaxBurstLength) {
+ SPDK_ERRLOG("R2T burst(%u) > MaxBurstLength(%u)\n",
+ task->current_r2t_length,
+ conn->sess->MaxBurstLength);
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ if (F_bit) {
+ /*
+ * This R2T burst is done. Clear the length before we
+		 * receive a PDU for the next R2T burst.
+ */
+ task->current_r2t_length = 0;
+ }
+
+ subtask = iscsi_task_get(conn, task, iscsi_task_cpl);
+ if (subtask == NULL) {
+ SPDK_ERRLOG("Unable to acquire subtask\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ subtask->scsi.offset = buffer_offset;
+ subtask->scsi.length = pdu->data_segment_len;
+ iscsi_task_associate_pdu(subtask, pdu);
+
+ if (task->next_expected_r2t_offset == transfer_len) {
+ task->acked_r2tsn++;
+ } else if (F_bit && (task->next_r2t_offset < transfer_len)) {
+ task->acked_r2tsn++;
+ len = spdk_min(conn->sess->MaxBurstLength,
+ (transfer_len - task->next_r2t_offset));
+ rc = iscsi_send_r2t(conn, task, task->next_r2t_offset, len,
+ task->ttt, &task->R2TSN);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_send_r2t() failed\n");
+ }
+ task->next_r2t_offset += len;
+ }
+
+ if (lun_dev == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "LUN %d is removed, complete the task immediately\n",
+ task->lun_id);
+ subtask->scsi.transfer_len = subtask->scsi.length;
+ spdk_scsi_task_process_null_lun(&subtask->scsi);
+ iscsi_task_cpl(&subtask->scsi);
+ return 0;
+ }
+
+ if (spdk_unlikely(spdk_scsi_lun_get_dif_ctx(lun_dev, &subtask->scsi, &pdu->dif_ctx))) {
+ pdu->dif_insert_or_strip = true;
+ }
+
+ pdu->task = subtask;
+ return 0;
+
+send_r2t_recovery_return:
+ rc = iscsi_send_r2t_recovery(conn, task, task->acked_r2tsn, true);
+ if (rc == 0) {
+ return 0;
+ }
+
+reject_return:
+ return iscsi_reject(conn, pdu, reject_reason);
+}
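+
+/* Worked example (editorial sketch, not part of the original code): with
+ * MaxBurstLength = 65536 and a 256 KiB solicited write, R2Ts are issued for
+ * offsets 0, 64 KiB, 128 KiB and 192 KiB. Each DataOUT PDU advances
+ * next_expected_r2t_offset and current_r2t_length; when the F bit closes a
+ * burst before the full transfer length is reached, the handler above acks
+ * that R2T and immediately issues the next one via iscsi_send_r2t().
+ */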
+
+static int
+iscsi_pdu_payload_op_data(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ struct spdk_iscsi_task *subtask;
+ struct iscsi_bhs_data_out *reqh;
+ uint32_t transfer_tag;
+
+ if (pdu->task == NULL) {
+ return 0;
+ }
+
+ subtask = pdu->task;
+
+ reqh = (struct iscsi_bhs_data_out *)&pdu->bhs;
+ transfer_tag = from_be32(&reqh->ttt);
+
+ if (get_transfer_task(conn, transfer_tag) == NULL) {
+ SPDK_ERRLOG("Not found for transfer_tag=%x\n", transfer_tag);
+ subtask->scsi.transfer_len = subtask->scsi.length;
+ spdk_scsi_task_process_abort(&subtask->scsi);
+ iscsi_task_cpl(&subtask->scsi);
+ return 0;
+ }
+
+ if (spdk_likely(!pdu->dif_insert_or_strip)) {
+ spdk_scsi_task_set_data(&subtask->scsi, pdu->data, pdu->data_segment_len);
+ } else {
+ spdk_scsi_task_set_data(&subtask->scsi, pdu->data, pdu->data_buf_len);
+ }
+
+ if (spdk_scsi_dev_get_lun(conn->dev, subtask->lun_id) == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "LUN %d is removed, complete the task immediately\n",
+ subtask->lun_id);
+ subtask->scsi.transfer_len = subtask->scsi.length;
+ spdk_scsi_task_process_null_lun(&subtask->scsi);
+ iscsi_task_cpl(&subtask->scsi);
+ return 0;
+ }
+
+ iscsi_queue_task(conn, subtask);
+ return 0;
+}
+
+static void
+init_login_reject_response(struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu)
+{
+ struct iscsi_bhs_login_rsp *rsph;
+
+ memset(rsp_pdu, 0, sizeof(struct spdk_iscsi_pdu));
+ rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs;
+ rsph->version_max = ISCSI_VERSION;
+ rsph->version_act = ISCSI_VERSION;
+ rsph->opcode = ISCSI_OP_LOGIN_RSP;
+ rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR;
+ rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST;
+ rsph->itt = pdu->bhs.itt;
+}
+
+static void
+iscsi_pdu_dump(struct spdk_iscsi_pdu *pdu)
+{
+ SPDK_ERRLOGDUMP("PDU", (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN);
+}
+
+/* This function frees PDUs that have been acknowledged by ExpStatSN */
+static void
+remove_acked_pdu(struct spdk_iscsi_conn *conn, uint32_t ExpStatSN)
+{
+ struct spdk_iscsi_pdu *pdu, *pdu_temp;
+ uint32_t stat_sn;
+
+ conn->exp_statsn = spdk_min(ExpStatSN, conn->StatSN);
+ TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, pdu_temp) {
+ stat_sn = from_be32(&pdu->bhs.stat_sn);
+ if (spdk_sn32_lt(stat_sn, conn->exp_statsn)) {
+ TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq);
+ iscsi_conn_free_pdu(conn, pdu);
+ }
+ }
+}
+
+static int
+iscsi_update_cmdsn(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ int opcode;
+ uint32_t ExpStatSN;
+ int I_bit;
+ struct spdk_iscsi_sess *sess;
+ struct iscsi_bhs_scsi_req *reqh;
+
+ sess = conn->sess;
+ if (!sess) {
+ SPDK_ERRLOG("Connection has no associated session!\n");
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ opcode = pdu->bhs.opcode;
+ reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs;
+
+ pdu->cmd_sn = from_be32(&reqh->cmd_sn);
+
+ I_bit = reqh->immediate;
+ if (I_bit == 0) {
+ if (spdk_sn32_lt(pdu->cmd_sn, sess->ExpCmdSN) ||
+ spdk_sn32_gt(pdu->cmd_sn, sess->MaxCmdSN)) {
+ if (sess->session_type == SESSION_TYPE_NORMAL &&
+ opcode != ISCSI_OP_SCSI_DATAOUT) {
+ SPDK_ERRLOG("CmdSN(%u) ignore (ExpCmdSN=%u, MaxCmdSN=%u)\n",
+ pdu->cmd_sn, sess->ExpCmdSN, sess->MaxCmdSN);
+
+ if (sess->ErrorRecoveryLevel >= 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Skip the error in ERL 1 and 2\n");
+ } else {
+ return SPDK_PDU_FATAL;
+ }
+ }
+ }
+ } else if (pdu->cmd_sn != sess->ExpCmdSN) {
+ SPDK_ERRLOG("CmdSN(%u) error ExpCmdSN=%u\n", pdu->cmd_sn, sess->ExpCmdSN);
+
+ if (sess->ErrorRecoveryLevel >= 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Skip the error in ERL 1 and 2\n");
+ } else if (opcode != ISCSI_OP_NOPOUT) {
+ /*
+ * The Linux initiator does not send valid CmdSNs for
+ * nopout under heavy load, so do not close the
+ * connection in that case.
+ */
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ }
+
+ ExpStatSN = from_be32(&reqh->exp_stat_sn);
+ if (spdk_sn32_gt(ExpStatSN, conn->StatSN)) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u) advanced\n", ExpStatSN);
+ ExpStatSN = conn->StatSN;
+ }
+
+ if (sess->ErrorRecoveryLevel >= 1) {
+ remove_acked_pdu(conn, ExpStatSN);
+ }
+
+ if (!I_bit && opcode != ISCSI_OP_SCSI_DATAOUT) {
+ sess->ExpCmdSN++;
+ }
+
+ return 0;
+}
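+
+/* Note on the spdk_sn32_lt()/spdk_sn32_gt() checks above (editorial sketch):
+ * CmdSN and StatSN use 32-bit serial number arithmetic, so the comparisons are
+ * taken modulo 2^32. For example, with ExpCmdSN = 0xfffffffe a CmdSN of
+ * 0x00000001 still compares as greater, which keeps the command window logic
+ * correct across wraparound.
+ */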
+
+static int
+iscsi_pdu_hdr_handle(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ int opcode;
+ int rc;
+ struct spdk_iscsi_pdu *rsp_pdu = NULL;
+
+ if (pdu == NULL) {
+ return -1;
+ }
+
+ opcode = pdu->bhs.opcode;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "opcode %x\n", opcode);
+
+ if (opcode == ISCSI_OP_LOGIN) {
+ return iscsi_pdu_hdr_op_login(conn, pdu);
+ }
+
+	/* If the connection is in the login phase but a non-login opcode is
+	 * received, return response code 0x020b to the initiator.
+	 */
+ if (!conn->full_feature && conn->state == ISCSI_CONN_STATE_RUNNING) {
+ rsp_pdu = iscsi_get_pdu(conn);
+ if (rsp_pdu == NULL) {
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ init_login_reject_response(pdu, rsp_pdu);
+ iscsi_conn_write_pdu(conn, rsp_pdu, iscsi_conn_pdu_generic_complete, NULL);
+ SPDK_ERRLOG("Received opcode %d in login phase\n", opcode);
+ return SPDK_ISCSI_LOGIN_ERROR_RESPONSE;
+ } else if (conn->state == ISCSI_CONN_STATE_INVALID) {
+ SPDK_ERRLOG("before Full Feature\n");
+ iscsi_pdu_dump(pdu);
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+
+ rc = iscsi_update_cmdsn(conn, pdu);
+ if (rc != 0) {
+ return rc;
+ }
+
+ switch (opcode) {
+ case ISCSI_OP_NOPOUT:
+ rc = iscsi_pdu_hdr_op_nopout(conn, pdu);
+ break;
+
+ case ISCSI_OP_SCSI:
+ rc = iscsi_pdu_hdr_op_scsi(conn, pdu);
+ break;
+ case ISCSI_OP_TASK:
+ rc = iscsi_pdu_hdr_op_task(conn, pdu);
+ break;
+
+ case ISCSI_OP_TEXT:
+ rc = iscsi_pdu_hdr_op_text(conn, pdu);
+ break;
+
+ case ISCSI_OP_LOGOUT:
+ rc = iscsi_pdu_hdr_op_logout(conn, pdu);
+ break;
+
+ case ISCSI_OP_SCSI_DATAOUT:
+ rc = iscsi_pdu_hdr_op_data(conn, pdu);
+ break;
+
+ case ISCSI_OP_SNACK:
+ rc = iscsi_pdu_hdr_op_snack(conn, pdu);
+ break;
+
+ default:
+ SPDK_ERRLOG("unsupported opcode %x\n", opcode);
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ if (rc < 0) {
+ SPDK_ERRLOG("processing PDU header (opcode=%x) failed on %s(%s)\n",
+ opcode,
+ conn->target_port != NULL ? spdk_scsi_port_get_name(conn->target_port) : "NULL",
+ conn->initiator_port != NULL ? spdk_scsi_port_get_name(conn->initiator_port) : "NULL");
+ }
+
+ return rc;
+}
+
+static int
+iscsi_pdu_payload_handle(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu)
+{
+ int opcode;
+ int rc = 0;
+
+ opcode = pdu->bhs.opcode;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "opcode %x\n", opcode);
+
+ switch (opcode) {
+ case ISCSI_OP_LOGIN:
+ rc = iscsi_pdu_payload_op_login(conn, pdu);
+ break;
+ case ISCSI_OP_NOPOUT:
+ rc = iscsi_pdu_payload_op_nopout(conn, pdu);
+ break;
+ case ISCSI_OP_SCSI:
+ rc = iscsi_pdu_payload_op_scsi(conn, pdu);
+ break;
+ case ISCSI_OP_TASK:
+ break;
+ case ISCSI_OP_TEXT:
+ rc = iscsi_pdu_payload_op_text(conn, pdu);
+ break;
+ case ISCSI_OP_LOGOUT:
+ break;
+ case ISCSI_OP_SCSI_DATAOUT:
+ rc = iscsi_pdu_payload_op_data(conn, pdu);
+ break;
+ case ISCSI_OP_SNACK:
+ break;
+ default:
+ SPDK_ERRLOG("unsupported opcode %x\n", opcode);
+ return iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR);
+ }
+
+ if (rc < 0) {
+ SPDK_ERRLOG("processing PDU payload (opcode=%x) failed on %s(%s)\n",
+ opcode,
+ conn->target_port != NULL ? spdk_scsi_port_get_name(conn->target_port) : "NULL",
+ conn->initiator_port != NULL ? spdk_scsi_port_get_name(conn->initiator_port) : "NULL");
+ }
+
+ return rc;
+}
+
+static int
+iscsi_read_pdu(struct spdk_iscsi_conn *conn)
+{
+ enum iscsi_pdu_recv_state prev_state;
+ struct spdk_iscsi_pdu *pdu;
+ struct spdk_mempool *pool;
+ uint32_t crc32c;
+ int ahs_len;
+ uint32_t data_len;
+ int rc;
+
+ do {
+ prev_state = conn->pdu_recv_state;
+ pdu = conn->pdu_in_progress;
+
+ switch (conn->pdu_recv_state) {
+ case ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY:
+ assert(conn->pdu_in_progress == NULL);
+
+ conn->pdu_in_progress = iscsi_get_pdu(conn);
+ if (conn->pdu_in_progress == NULL) {
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ }
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR;
+ break;
+ case ISCSI_PDU_RECV_STATE_AWAIT_PDU_HDR:
+ if (pdu->bhs_valid_bytes < ISCSI_BHS_LEN) {
+ rc = iscsi_conn_read_data(conn,
+ ISCSI_BHS_LEN - pdu->bhs_valid_bytes,
+ (uint8_t *)&pdu->bhs + pdu->bhs_valid_bytes);
+ if (rc < 0) {
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+ pdu->bhs_valid_bytes += rc;
+ if (pdu->bhs_valid_bytes < ISCSI_BHS_LEN) {
+ return 0;
+ }
+ }
+
+ pdu->data_segment_len = ISCSI_ALIGN(DGET24(pdu->bhs.data_segment_len));
+
+ /* AHS */
+ ahs_len = pdu->bhs.total_ahs_len * 4;
+ assert(ahs_len <= ISCSI_AHS_LEN);
+ if (pdu->ahs_valid_bytes < ahs_len) {
+ rc = iscsi_conn_read_data(conn,
+ ahs_len - pdu->ahs_valid_bytes,
+ pdu->ahs + pdu->ahs_valid_bytes);
+ if (rc < 0) {
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+
+ pdu->ahs_valid_bytes += rc;
+ if (pdu->ahs_valid_bytes < ahs_len) {
+ return 0;
+ }
+ }
+
+ /* Header Digest */
+ if (conn->header_digest &&
+ pdu->hdigest_valid_bytes < ISCSI_DIGEST_LEN) {
+ rc = iscsi_conn_read_data(conn,
+ ISCSI_DIGEST_LEN - pdu->hdigest_valid_bytes,
+ pdu->header_digest + pdu->hdigest_valid_bytes);
+ if (rc < 0) {
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+
+ pdu->hdigest_valid_bytes += rc;
+ if (pdu->hdigest_valid_bytes < ISCSI_DIGEST_LEN) {
+ return 0;
+ }
+ }
+
+ if (conn->header_digest) {
+ crc32c = iscsi_pdu_calc_header_digest(pdu);
+ rc = MATCH_DIGEST_WORD(pdu->header_digest, crc32c);
+ if (rc == 0) {
+ SPDK_ERRLOG("header digest error (%s)\n", conn->initiator_name);
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+ }
+
+ rc = iscsi_pdu_hdr_handle(conn, pdu);
+ if (rc < 0) {
+ SPDK_ERRLOG("Critical error is detected. Close the connection\n");
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD;
+ break;
+ case ISCSI_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
+ data_len = pdu->data_segment_len;
+
+ if (data_len != 0 && pdu->data_buf == NULL) {
+ if (data_len <= iscsi_get_max_immediate_data_size()) {
+ pool = g_iscsi.pdu_immediate_data_pool;
+ pdu->data_buf_len = SPDK_BDEV_BUF_SIZE_WITH_MD(iscsi_get_max_immediate_data_size());
+ } else if (data_len <= SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) {
+ pool = g_iscsi.pdu_data_out_pool;
+ pdu->data_buf_len = SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH);
+ } else {
+ SPDK_ERRLOG("Data(%d) > MaxSegment(%d)\n",
+ data_len, SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH);
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+ pdu->mobj = spdk_mempool_get(pool);
+ if (pdu->mobj == NULL) {
+ return 0;
+ }
+ pdu->data_buf = pdu->mobj->buf;
+ pdu->data = pdu->mobj->buf;
+ pdu->data_from_mempool = true;
+ }
+
+			/* Read the data segment into the local buffer */
+ if (pdu->data_valid_bytes < data_len) {
+ rc = iscsi_conn_read_data_segment(conn, pdu, data_len);
+ if (rc < 0) {
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+
+ pdu->data_valid_bytes += rc;
+ if (pdu->data_valid_bytes < data_len) {
+ return 0;
+ }
+ }
+
+			/* Read the data digest */
+ if (conn->data_digest && data_len != 0 &&
+ pdu->ddigest_valid_bytes < ISCSI_DIGEST_LEN) {
+ rc = iscsi_conn_read_data(conn,
+ ISCSI_DIGEST_LEN - pdu->ddigest_valid_bytes,
+ pdu->data_digest + pdu->ddigest_valid_bytes);
+ if (rc < 0) {
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+
+ pdu->ddigest_valid_bytes += rc;
+ if (pdu->ddigest_valid_bytes < ISCSI_DIGEST_LEN) {
+ return 0;
+ }
+ }
+
+ /* All data for this PDU has now been read from the socket. */
+ spdk_trace_record(TRACE_ISCSI_READ_PDU, conn->id, pdu->data_valid_bytes,
+ (uintptr_t)pdu, pdu->bhs.opcode);
+
+ /* check data digest */
+ if (conn->data_digest && data_len != 0) {
+ crc32c = iscsi_pdu_calc_data_digest(pdu);
+ rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c);
+ if (rc == 0) {
+ SPDK_ERRLOG("data digest error (%s)\n", conn->initiator_name);
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+ }
+
+ if (conn->is_logged_out) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pdu received after logout\n");
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ break;
+ }
+
+ if (!pdu->is_rejected) {
+ rc = iscsi_pdu_payload_handle(conn, pdu);
+ } else {
+ rc = 0;
+ }
+ if (rc == 0) {
+ spdk_trace_record(TRACE_ISCSI_TASK_EXECUTED, 0, 0, (uintptr_t)pdu, 0);
+ iscsi_put_pdu(pdu);
+ conn->pdu_in_progress = NULL;
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_AWAIT_PDU_READY;
+ return 1;
+ } else {
+ conn->pdu_recv_state = ISCSI_PDU_RECV_STATE_ERROR;
+ }
+ break;
+ case ISCSI_PDU_RECV_STATE_ERROR:
+ return SPDK_ISCSI_CONNECTION_FATAL;
+ default:
+ assert(false);
+ SPDK_ERRLOG("code should not come here\n");
+ break;
+ }
+ } while (prev_state != conn->pdu_recv_state);
+
+ return 0;
+}
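+
+/* Receive state machine summary (editorial note): a PDU moves through
+ * AWAIT_PDU_READY -> AWAIT_PDU_HDR -> AWAIT_PDU_PAYLOAD and back to
+ * AWAIT_PDU_READY once the payload handler succeeds. A partial socket read
+ * returns 0 and resumes in the same state on the next poll, while a digest or
+ * handler error moves the connection into the ERROR state, which is reported
+ * to the caller as SPDK_ISCSI_CONNECTION_FATAL.
+ */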
+
+#define GET_PDU_LOOP_COUNT 16
+
+int
+iscsi_handle_incoming_pdus(struct spdk_iscsi_conn *conn)
+{
+ int i, rc;
+
+ /* Read new PDUs from network */
+ for (i = 0; i < GET_PDU_LOOP_COUNT; i++) {
+ rc = iscsi_read_pdu(conn);
+ if (rc == 0) {
+ break;
+ } else if (rc < 0) {
+ return rc;
+ }
+
+ if (conn->is_stopped) {
+ break;
+ }
+ }
+
+ return i;
+}
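+
+/* Usage sketch (hypothetical caller, not part of this file): a connection read
+ * poller would typically invoke this once per poll and treat a negative return
+ * value as fatal, for example:
+ *
+ *	rc = iscsi_handle_incoming_pdus(conn);
+ *	if (rc < 0) {
+ *		// tear down the connection (hypothetical cleanup path)
+ *	}
+ */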
diff --git a/src/spdk/lib/iscsi/iscsi.h b/src/spdk/lib/iscsi/iscsi.h
new file mode 100644
index 000000000..b1747e4ab
--- /dev/null
+++ b/src/spdk/lib/iscsi/iscsi.h
@@ -0,0 +1,465 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ISCSI_H
+#define SPDK_ISCSI_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/iscsi_spec.h"
+#include "spdk/thread.h"
+#include "spdk/sock.h"
+
+#include "spdk/scsi.h"
+#include "iscsi/param.h"
+
+#include "spdk/assert.h"
+#include "spdk/dif.h"
+#include "spdk/util.h"
+
+#define SPDK_ISCSI_DEFAULT_NODEBASE "iqn.2016-06.io.spdk"
+
+#define DEFAULT_MAXR2T 4
+#define MAX_INITIATOR_PORT_NAME 256
+#define MAX_INITIATOR_NAME 223
+#define MAX_TARGET_NAME 223
+
+#define MAX_PORTAL 1024
+#define MAX_INITIATOR 256
+#define MAX_NETMASK 256
+#define MAX_ISCSI_CONNECTIONS 1024
+#define MAX_PORTAL_ADDR 256
+#define MAX_PORTAL_PORT 32
+
+#define DEFAULT_PORT 3260
+#define DEFAULT_MAX_SESSIONS 128
+#define DEFAULT_MAX_CONNECTIONS_PER_SESSION 2
+#define DEFAULT_MAXOUTSTANDINGR2T 1
+#define DEFAULT_DEFAULTTIME2WAIT 2
+#define DEFAULT_DEFAULTTIME2RETAIN 20
+#define DEFAULT_INITIALR2T true
+#define DEFAULT_IMMEDIATEDATA true
+#define DEFAULT_DATAPDUINORDER true
+#define DEFAULT_DATASEQUENCEINORDER true
+#define DEFAULT_ERRORRECOVERYLEVEL 0
+#define DEFAULT_TIMEOUT 60
+#define MAX_NOPININTERVAL 60
+#define DEFAULT_NOPININTERVAL 30
+
+/*
+ * SPDK iSCSI target currently only supports 64KB as the maximum data segment length
+ * it can receive from initiators. Other values may work, but no guarantees.
+ */
+#define SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH 65536
+
+/*
+ * Defines maximum number of data out buffers each connection can have in
+ * use at any given time.
+ */
+#define MAX_DATA_OUT_PER_CONNECTION 16
+
+/*
+ * Defines the maximum number of large data-in buffers each connection can
+ * have in use at any given time. Only data-in transfers larger than
+ * SPDK_BDEV_SMALL_BUF_MAX_SIZE consume these buffers, so this limit does not
+ * affect smaller I/O.
+ */
+#define MAX_LARGE_DATAIN_PER_CONNECTION 64
+
+#define SPDK_ISCSI_MAX_BURST_LENGTH \
+ (SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH * MAX_DATA_OUT_PER_CONNECTION)
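+
+/* Editorial note: with the defaults above this evaluates to 65536 * 16 = 1 MiB;
+ * the value scales with either constant if it is changed.
+ */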
+
+/*
+ * Defines default maximum amount in bytes of unsolicited data the iSCSI
+ * initiator may send to the SPDK iSCSI target during the execution of
+ * a single SCSI command; it must not exceed MaxBurstLength.
+ */
+#define SPDK_ISCSI_FIRST_BURST_LENGTH 8192
+
+/*
+ * Defines minimum amount in bytes of unsolicited data the iSCSI initiator
+ * may send to the SPDK iSCSI target during the execution of a single
+ * SCSI command.
+ */
+#define SPDK_ISCSI_MIN_FIRST_BURST_LENGTH 512
+
+#define SPDK_ISCSI_MAX_FIRST_BURST_LENGTH 16777215
+
+/*
+ * Defines the default maximum queue depth per connection; this can be
+ * changed via the configuration file.
+ */
+#define DEFAULT_MAX_QUEUE_DEPTH 64
+
+/** Defines how long we should wait for a logout request when the target
+ * requests logout to the initiator asynchronously.
+ */
+#define ISCSI_LOGOUT_REQUEST_TIMEOUT 30 /* in seconds */
+
+/** Defines how long we should wait for a TCP close after responding to a
+ * logout request, before terminating the connection ourselves.
+ */
+#define ISCSI_LOGOUT_TIMEOUT 5 /* in seconds */
+
+/* Error codes for the spdk_iscsi_login_in related functions. The values are
+ * chosen to avoid conflicts with other error codes.
+ */
+#define SPDK_ISCSI_LOGIN_ERROR_RESPONSE -1000
+#define SPDK_ISCSI_LOGIN_ERROR_PARAMETER -1001
+#define SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE -1002
+
+#define ISCSI_AHS_LEN 60
+
+struct spdk_mobj {
+ struct spdk_mempool *mp;
+ void *buf;
+};
+
+/*
+ * Maximum number of SGL elements, i.e.,
+ * BHS, AHS, Header Digest, Data Segment and Data Digest.
+ */
+#define SPDK_ISCSI_MAX_SGL_DESCRIPTORS (5)
+
+typedef void (*iscsi_conn_xfer_complete_cb)(void *cb_arg);
+
+struct spdk_iscsi_pdu {
+ struct iscsi_bhs bhs;
+ struct spdk_mobj *mobj;
+ bool is_rejected;
+ uint8_t *data_buf;
+ uint8_t *data;
+ uint8_t header_digest[ISCSI_DIGEST_LEN];
+ uint8_t data_digest[ISCSI_DIGEST_LEN];
+ size_t data_segment_len;
+ int bhs_valid_bytes;
+ int ahs_valid_bytes;
+ uint32_t data_valid_bytes;
+ int hdigest_valid_bytes;
+ int ddigest_valid_bytes;
+ int ref;
+ bool data_from_mempool; /* indicate whether the data buffer is allocated from mempool */
+ struct spdk_iscsi_task *task; /* data tied to a task buffer */
+ uint32_t cmd_sn;
+ uint32_t writev_offset;
+ uint32_t data_buf_len;
+ bool dif_insert_or_strip;
+ struct spdk_dif_ctx dif_ctx;
+ struct spdk_iscsi_conn *conn;
+
+ iscsi_conn_xfer_complete_cb cb_fn;
+ void *cb_arg;
+
+ /* The sock request ends with a 0 length iovec. Place the actual iovec immediately
+ * after it. There is a static assert below to check if the compiler inserted
+ * any unwanted padding */
+ int32_t mapped_length;
+ struct spdk_sock_request sock_req;
+ struct iovec iov[SPDK_ISCSI_MAX_SGL_DESCRIPTORS];
+ TAILQ_ENTRY(spdk_iscsi_pdu) tailq;
+
+
+ /*
+ * 60 bytes of AHS should suffice for now.
+	 * This should always be at the end of the PDU data structure.
+	 * We need to avoid zeroing this out when clearing the PDU.
+ */
+ uint8_t ahs[ISCSI_AHS_LEN];
+
+ struct {
+ uint16_t length; /* iSCSI SenseLength (big-endian) */
+ uint8_t data[32];
+ } sense;
+};
+SPDK_STATIC_ASSERT(offsetof(struct spdk_iscsi_pdu,
+ sock_req) + sizeof(struct spdk_sock_request) == offsetof(struct spdk_iscsi_pdu, iov),
+ "Compiler inserted padding between iov and sock_req");
+
+enum iscsi_connection_state {
+ ISCSI_CONN_STATE_INVALID = 0,
+ ISCSI_CONN_STATE_RUNNING = 1,
+ ISCSI_CONN_STATE_EXITING = 2,
+ ISCSI_CONN_STATE_EXITED = 3,
+};
+
+enum iscsi_chap_phase {
+ ISCSI_CHAP_PHASE_NONE = 0,
+ ISCSI_CHAP_PHASE_WAIT_A = 1,
+ ISCSI_CHAP_PHASE_WAIT_NR = 2,
+ ISCSI_CHAP_PHASE_END = 3,
+};
+
+enum session_type {
+ SESSION_TYPE_INVALID = 0,
+ SESSION_TYPE_NORMAL = 1,
+ SESSION_TYPE_DISCOVERY = 2,
+};
+
+#define ISCSI_CHAP_CHALLENGE_LEN 1024
+#define ISCSI_CHAP_MAX_USER_LEN 255
+#define ISCSI_CHAP_MAX_SECRET_LEN 255
+
+struct iscsi_chap_auth {
+ enum iscsi_chap_phase chap_phase;
+
+ char user[ISCSI_CHAP_MAX_USER_LEN + 1];
+ char secret[ISCSI_CHAP_MAX_SECRET_LEN + 1];
+ char muser[ISCSI_CHAP_MAX_USER_LEN + 1];
+ char msecret[ISCSI_CHAP_MAX_SECRET_LEN + 1];
+
+ uint8_t chap_id[1];
+ uint8_t chap_mid[1];
+ int chap_challenge_len;
+ uint8_t chap_challenge[ISCSI_CHAP_CHALLENGE_LEN];
+ int chap_mchallenge_len;
+ uint8_t chap_mchallenge[ISCSI_CHAP_CHALLENGE_LEN];
+};
+
+struct spdk_iscsi_auth_secret {
+ char user[ISCSI_CHAP_MAX_USER_LEN + 1];
+ char secret[ISCSI_CHAP_MAX_SECRET_LEN + 1];
+ char muser[ISCSI_CHAP_MAX_USER_LEN + 1];
+ char msecret[ISCSI_CHAP_MAX_SECRET_LEN + 1];
+ TAILQ_ENTRY(spdk_iscsi_auth_secret) tailq;
+};
+
+struct spdk_iscsi_auth_group {
+ int32_t tag;
+ TAILQ_HEAD(, spdk_iscsi_auth_secret) secret_head;
+ TAILQ_ENTRY(spdk_iscsi_auth_group) tailq;
+};
+
+struct spdk_iscsi_sess {
+ uint32_t connections;
+ struct spdk_iscsi_conn **conns;
+
+ struct spdk_scsi_port *initiator_port;
+ int tag;
+
+ uint64_t isid;
+ uint16_t tsih;
+ struct spdk_iscsi_tgt_node *target;
+ int queue_depth;
+
+ struct iscsi_param *params;
+
+ enum session_type session_type;
+ uint32_t MaxConnections;
+ uint32_t MaxOutstandingR2T;
+ uint32_t DefaultTime2Wait;
+ uint32_t DefaultTime2Retain;
+ uint32_t FirstBurstLength;
+ uint32_t MaxBurstLength;
+ bool InitialR2T;
+ bool ImmediateData;
+ bool DataPDUInOrder;
+ bool DataSequenceInOrder;
+ uint32_t ErrorRecoveryLevel;
+
+ uint32_t ExpCmdSN;
+ uint32_t MaxCmdSN;
+
+ uint32_t current_text_itt;
+};
+
+struct spdk_iscsi_poll_group {
+ struct spdk_poller *poller;
+ struct spdk_poller *nop_poller;
+ STAILQ_HEAD(connections, spdk_iscsi_conn) connections;
+ struct spdk_sock_group *sock_group;
+ TAILQ_ENTRY(spdk_iscsi_poll_group) link;
+};
+
+struct spdk_iscsi_opts {
+ char *authfile;
+ char *nodebase;
+ int32_t timeout;
+ int32_t nopininterval;
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int32_t chap_group;
+ uint32_t MaxSessions;
+ uint32_t MaxConnectionsPerSession;
+ uint32_t MaxConnections;
+ uint32_t MaxQueueDepth;
+ uint32_t DefaultTime2Wait;
+ uint32_t DefaultTime2Retain;
+ uint32_t FirstBurstLength;
+ bool ImmediateData;
+ uint32_t ErrorRecoveryLevel;
+ bool AllowDuplicateIsid;
+};
+
+struct spdk_iscsi_globals {
+ char *authfile;
+ char *nodebase;
+ pthread_mutex_t mutex;
+ uint32_t refcnt;
+ TAILQ_HEAD(, spdk_iscsi_portal) portal_head;
+ TAILQ_HEAD(, spdk_iscsi_portal_grp) pg_head;
+ TAILQ_HEAD(, spdk_iscsi_init_grp) ig_head;
+ TAILQ_HEAD(, spdk_iscsi_tgt_node) target_head;
+ TAILQ_HEAD(, spdk_iscsi_auth_group) auth_group_head;
+ TAILQ_HEAD(, spdk_iscsi_poll_group) poll_group_head;
+
+ int32_t timeout;
+ int32_t nopininterval;
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int32_t chap_group;
+
+ uint32_t MaxSessions;
+ uint32_t MaxConnectionsPerSession;
+ uint32_t MaxConnections;
+ uint32_t MaxQueueDepth;
+ uint32_t DefaultTime2Wait;
+ uint32_t DefaultTime2Retain;
+ uint32_t FirstBurstLength;
+ bool ImmediateData;
+ uint32_t ErrorRecoveryLevel;
+ bool AllowDuplicateIsid;
+
+ struct spdk_mempool *pdu_pool;
+ struct spdk_mempool *pdu_immediate_data_pool;
+ struct spdk_mempool *pdu_data_out_pool;
+ struct spdk_mempool *session_pool;
+ struct spdk_mempool *task_pool;
+
+ struct spdk_iscsi_sess **session;
+};
+
+#define ISCSI_SECURITY_NEGOTIATION_PHASE 0
+#define ISCSI_OPERATIONAL_NEGOTIATION_PHASE 1
+#define ISCSI_NSG_RESERVED_CODE 2
+#define ISCSI_FULL_FEATURE_PHASE 3
+
+/* logout reason */
+#define ISCSI_LOGOUT_REASON_CLOSE_SESSION 0
+#define ISCSI_LOGOUT_REASON_CLOSE_CONNECTION 1
+#define ISCSI_LOGOUT_REASON_REMOVE_CONN_FOR_RECOVERY 2
+
+enum spdk_error_codes {
+ SPDK_ISCSI_CONNECTION_FATAL = -1,
+ SPDK_PDU_FATAL = -2,
+};
+
+#define DGET24(B) \
+ ((( (uint32_t) *((uint8_t *)(B)+0)) << 16) \
+ | (((uint32_t) *((uint8_t *)(B)+1)) << 8) \
+ | (((uint32_t) *((uint8_t *)(B)+2)) << 0))
+
+#define DSET24(B,D) \
+ (((*((uint8_t *)(B)+0)) = (uint8_t)((uint32_t)(D) >> 16)), \
+ ((*((uint8_t *)(B)+1)) = (uint8_t)((uint32_t)(D) >> 8)), \
+ ((*((uint8_t *)(B)+2)) = (uint8_t)((uint32_t)(D) >> 0)))
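+
+/* Example (editorial sketch): the 24-bit DataSegmentLength field in the BHS is
+ * stored big-endian, so with a local uint8_t buf[3]:
+ *
+ *	DSET24(buf, 8192);		// buf = { 0x00, 0x20, 0x00 }
+ *	assert(DGET24(buf) == 8192);	// reads the same value back
+ */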
+
+#define xstrdup(s) (s ? strdup(s) : (char *)NULL)
+
+extern struct spdk_iscsi_globals g_iscsi;
+extern struct spdk_iscsi_opts *g_spdk_iscsi_opts;
+
+struct spdk_iscsi_task;
+struct spdk_json_write_ctx;
+
+typedef void (*spdk_iscsi_init_cb)(void *cb_arg, int rc);
+
+void spdk_iscsi_init(spdk_iscsi_init_cb cb_fn, void *cb_arg);
+typedef void (*spdk_iscsi_fini_cb)(void *arg);
+void spdk_iscsi_fini(spdk_iscsi_fini_cb cb_fn, void *cb_arg);
+void shutdown_iscsi_conns_done(void);
+void spdk_iscsi_config_text(FILE *fp);
+void spdk_iscsi_config_json(struct spdk_json_write_ctx *w);
+
+struct spdk_iscsi_opts *iscsi_opts_alloc(void);
+void iscsi_opts_free(struct spdk_iscsi_opts *opts);
+struct spdk_iscsi_opts *iscsi_opts_copy(struct spdk_iscsi_opts *src);
+void iscsi_opts_info_json(struct spdk_json_write_ctx *w);
+int iscsi_set_discovery_auth(bool disable_chap, bool require_chap,
+ bool mutual_chap, int32_t chap_group);
+int iscsi_chap_get_authinfo(struct iscsi_chap_auth *auth, const char *authuser,
+ int ag_tag);
+int iscsi_add_auth_group(int32_t tag, struct spdk_iscsi_auth_group **_group);
+struct spdk_iscsi_auth_group *iscsi_find_auth_group_by_tag(int32_t tag);
+void iscsi_delete_auth_group(struct spdk_iscsi_auth_group *group);
+int iscsi_auth_group_add_secret(struct spdk_iscsi_auth_group *group,
+ const char *user, const char *secret,
+ const char *muser, const char *msecret);
+int iscsi_auth_group_delete_secret(struct spdk_iscsi_auth_group *group,
+ const char *user);
+void iscsi_auth_groups_info_json(struct spdk_json_write_ctx *w);
+
+void iscsi_task_response(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task);
+int iscsi_build_iovs(struct spdk_iscsi_conn *conn, struct iovec *iovs, int iovcnt,
+ struct spdk_iscsi_pdu *pdu, uint32_t *mapped_length);
+int iscsi_handle_incoming_pdus(struct spdk_iscsi_conn *conn);
+void iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *task);
+
+void iscsi_free_sess(struct spdk_iscsi_sess *sess);
+void iscsi_clear_all_transfer_task(struct spdk_iscsi_conn *conn,
+ struct spdk_scsi_lun *lun,
+ struct spdk_iscsi_pdu *pdu);
+bool iscsi_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t CmdSN);
+
+uint32_t iscsi_pdu_calc_header_digest(struct spdk_iscsi_pdu *pdu);
+uint32_t iscsi_pdu_calc_data_digest(struct spdk_iscsi_pdu *pdu);
+
+/* Memory management */
+void iscsi_put_pdu(struct spdk_iscsi_pdu *pdu);
+struct spdk_iscsi_pdu *iscsi_get_pdu(struct spdk_iscsi_conn *conn);
+void iscsi_op_abort_task_set(struct spdk_iscsi_task *task,
+ uint8_t function);
+void iscsi_queue_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *task);
+
+static inline uint32_t
+iscsi_get_max_immediate_data_size(void)
+{
+ /*
+ * Specify enough extra space in addition to FirstBurstLength to
+ * account for a header digest, data digest and additional header
+ * segments (AHS). These are not normally used but they do not
+ * take up much space and we need to make sure the worst-case scenario
+	 * can be satisfied by the size returned here.
+ */
+ return g_iscsi.FirstBurstLength +
+ ISCSI_DIGEST_LEN + /* data digest */
+ ISCSI_DIGEST_LEN + /* header digest */
+ 8 + /* bidirectional AHS */
+ 52; /* extended CDB AHS (for a 64-byte CDB) */
+}
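+
+/* Editorial note: with the default SPDK_ISCSI_FIRST_BURST_LENGTH of 8192 and
+ * ISCSI_DIGEST_LEN of 4, this returns 8192 + 4 + 4 + 8 + 52 = 8260 bytes; the
+ * actual value follows whatever FirstBurstLength was configured to.
+ */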
+
+#endif /* SPDK_ISCSI_H */
diff --git a/src/spdk/lib/iscsi/iscsi_rpc.c b/src/spdk/lib/iscsi/iscsi_rpc.c
new file mode 100644
index 000000000..8ab43d31d
--- /dev/null
+++ b/src/spdk/lib/iscsi/iscsi_rpc.c
@@ -0,0 +1,1639 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "iscsi/iscsi.h"
+#include "iscsi/conn.h"
+#include "iscsi/tgt_node.h"
+#include "iscsi/portal_grp.h"
+#include "iscsi/init_grp.h"
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk_internal/log.h"
+
+static void
+rpc_iscsi_get_initiator_groups(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "iscsi_get_initiator_groups requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ iscsi_init_grps_info_json(w);
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_get_initiator_groups", rpc_iscsi_get_initiator_groups,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_initiator_groups, get_initiator_groups)
+
+struct rpc_initiator_list {
+ size_t num_initiators;
+ char *initiators[MAX_INITIATOR];
+};
+
+static int
+decode_rpc_initiator_list(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_initiator_list *list = out;
+
+ return spdk_json_decode_array(val, spdk_json_decode_string, list->initiators, MAX_INITIATOR,
+ &list->num_initiators, sizeof(char *));
+}
+
+static void
+free_rpc_initiator_list(struct rpc_initiator_list *list)
+{
+ size_t i;
+
+ for (i = 0; i < list->num_initiators; i++) {
+ free(list->initiators[i]);
+ }
+}
+
+struct rpc_netmask_list {
+ size_t num_netmasks;
+ char *netmasks[MAX_NETMASK];
+};
+
+static int
+decode_rpc_netmask_list(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_netmask_list *list = out;
+
+ return spdk_json_decode_array(val, spdk_json_decode_string, list->netmasks, MAX_NETMASK,
+ &list->num_netmasks, sizeof(char *));
+}
+
+static void
+free_rpc_netmask_list(struct rpc_netmask_list *list)
+{
+ size_t i;
+
+ for (i = 0; i < list->num_netmasks; i++) {
+ free(list->netmasks[i]);
+ }
+}
+
+struct rpc_initiator_group {
+ int32_t tag;
+ struct rpc_initiator_list initiator_list;
+ struct rpc_netmask_list netmask_list;
+};
+
+static void
+free_rpc_initiator_group(struct rpc_initiator_group *ig)
+{
+ free_rpc_initiator_list(&ig->initiator_list);
+ free_rpc_netmask_list(&ig->netmask_list);
+}
+
+static const struct spdk_json_object_decoder rpc_initiator_group_decoders[] = {
+ {"tag", offsetof(struct rpc_initiator_group, tag), spdk_json_decode_int32},
+ {"initiators", offsetof(struct rpc_initiator_group, initiator_list), decode_rpc_initiator_list},
+ {"netmasks", offsetof(struct rpc_initiator_group, netmask_list), decode_rpc_netmask_list},
+};
+
+static void
+rpc_iscsi_create_initiator_group(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_initiator_group req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_initiator_group_decoders,
+ SPDK_COUNTOF(rpc_initiator_group_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ if (req.initiator_list.num_initiators == 0 ||
+ req.netmask_list.num_netmasks == 0) {
+ goto invalid;
+ }
+
+ if (iscsi_init_grp_create_from_initiator_list(req.tag,
+ req.initiator_list.num_initiators,
+ req.initiator_list.initiators,
+ req.netmask_list.num_netmasks,
+ req.netmask_list.netmasks)) {
+ SPDK_ERRLOG("create_from_initiator_list failed\n");
+ goto invalid;
+ }
+
+ free_rpc_initiator_group(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_rpc_initiator_group(&req);
+}
+SPDK_RPC_REGISTER("iscsi_create_initiator_group", rpc_iscsi_create_initiator_group,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_initiator_group, add_initiator_group)
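+
+/* Example request (editorial sketch; the parameter names follow
+ * rpc_initiator_group_decoders above, the concrete values are illustrative):
+ *
+ *	{
+ *	  "method": "iscsi_create_initiator_group",
+ *	  "params": {
+ *	    "tag": 1,
+ *	    "initiators": ["iqn.2016-06.io.spdk:host1"],
+ *	    "netmasks": ["192.168.1.0/24"]
+ *	  }
+ *	}
+ */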
+
+static const struct spdk_json_object_decoder rpc_add_or_delete_initiators_decoders[] = {
+ {"tag", offsetof(struct rpc_initiator_group, tag), spdk_json_decode_int32},
+ {"initiators", offsetof(struct rpc_initiator_group, initiator_list), decode_rpc_initiator_list, true},
+ {"netmasks", offsetof(struct rpc_initiator_group, netmask_list), decode_rpc_netmask_list, true},
+};
+
+static void
+rpc_iscsi_initiator_group_add_initiators(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_initiator_group req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_add_or_delete_initiators_decoders,
+ SPDK_COUNTOF(rpc_add_or_delete_initiators_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ if (iscsi_init_grp_add_initiators_from_initiator_list(req.tag,
+ req.initiator_list.num_initiators,
+ req.initiator_list.initiators,
+ req.netmask_list.num_netmasks,
+ req.netmask_list.netmasks)) {
+ SPDK_ERRLOG("add_initiators_from_initiator_list failed\n");
+ goto invalid;
+ }
+
+ free_rpc_initiator_group(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_rpc_initiator_group(&req);
+}
+SPDK_RPC_REGISTER("iscsi_initiator_group_add_initiators",
+ rpc_iscsi_initiator_group_add_initiators, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_initiator_group_add_initiators,
+ add_initiators_to_initiator_group)
+
+static void
+rpc_iscsi_initiator_group_remove_initiators(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_initiator_group req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_add_or_delete_initiators_decoders,
+ SPDK_COUNTOF(rpc_add_or_delete_initiators_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ if (iscsi_init_grp_delete_initiators_from_initiator_list(req.tag,
+ req.initiator_list.num_initiators,
+ req.initiator_list.initiators,
+ req.netmask_list.num_netmasks,
+ req.netmask_list.netmasks)) {
+ SPDK_ERRLOG("delete_initiators_from_initiator_list failed\n");
+ goto invalid;
+ }
+
+ free_rpc_initiator_group(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_rpc_initiator_group(&req);
+}
+SPDK_RPC_REGISTER("iscsi_initiator_group_remove_initiators",
+ rpc_iscsi_initiator_group_remove_initiators, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_initiator_group_remove_initiators,
+ delete_initiators_from_initiator_group)
+
+struct rpc_iscsi_delete_initiator_group {
+ int32_t tag;
+};
+
+static const struct spdk_json_object_decoder rpc_iscsi_delete_initiator_group_decoders[] = {
+ {"tag", offsetof(struct rpc_iscsi_delete_initiator_group, tag), spdk_json_decode_int32},
+};
+
+static void
+rpc_iscsi_delete_initiator_group(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_iscsi_delete_initiator_group req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_init_grp *ig;
+
+ if (spdk_json_decode_object(params, rpc_iscsi_delete_initiator_group_decoders,
+ SPDK_COUNTOF(rpc_iscsi_delete_initiator_group_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ ig = iscsi_init_grp_unregister(req.tag);
+ if (!ig) {
+ goto invalid;
+ }
+ iscsi_tgt_node_delete_map(NULL, ig);
+ iscsi_init_grp_destroy(ig);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+}
+SPDK_RPC_REGISTER("iscsi_delete_initiator_group", rpc_iscsi_delete_initiator_group,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_initiator_group, delete_initiator_group)
+
+static void
+rpc_iscsi_get_target_nodes(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "iscsi_get_target_nodes requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ iscsi_tgt_nodes_info_json(w);
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_get_target_nodes", rpc_iscsi_get_target_nodes, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_target_nodes, get_target_nodes)
+
+struct rpc_pg_ig_map {
+ int32_t pg_tag;
+ int32_t ig_tag;
+};
+
+static const struct spdk_json_object_decoder rpc_pg_ig_map_decoders[] = {
+ {"pg_tag", offsetof(struct rpc_pg_ig_map, pg_tag), spdk_json_decode_int32},
+ {"ig_tag", offsetof(struct rpc_pg_ig_map, ig_tag), spdk_json_decode_int32},
+};
+
+static int
+decode_rpc_pg_ig_map(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_pg_ig_map *pg_ig_map = out;
+
+ return spdk_json_decode_object(val, rpc_pg_ig_map_decoders,
+ SPDK_COUNTOF(rpc_pg_ig_map_decoders),
+ pg_ig_map);
+}
+
+struct rpc_pg_ig_maps {
+ size_t num_maps;
+ struct rpc_pg_ig_map maps[MAX_TARGET_MAP];
+};
+
+static int
+decode_rpc_pg_ig_maps(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_pg_ig_maps *pg_ig_maps = out;
+
+ return spdk_json_decode_array(val, decode_rpc_pg_ig_map, pg_ig_maps->maps,
+ MAX_TARGET_MAP, &pg_ig_maps->num_maps,
+ sizeof(struct rpc_pg_ig_map));
+}
+
+#define RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN 64
+
+struct rpc_lun {
+ char *bdev_name;
+ int32_t lun_id;
+};
+
+static const struct spdk_json_object_decoder rpc_lun_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_lun, bdev_name), spdk_json_decode_string},
+ {"lun_id", offsetof(struct rpc_lun, lun_id), spdk_json_decode_int32},
+};
+
+static int
+decode_rpc_lun(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_lun *lun = out;
+
+ return spdk_json_decode_object(val, rpc_lun_decoders,
+ SPDK_COUNTOF(rpc_lun_decoders), lun);
+}
+
+struct rpc_luns {
+ size_t num_luns;
+ struct rpc_lun luns[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN];
+};
+
+static int
+decode_rpc_luns(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_luns *luns = out;
+
+ return spdk_json_decode_array(val, decode_rpc_lun, luns->luns,
+ RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN,
+ &luns->num_luns, sizeof(struct rpc_lun));
+}
+
+static void
+free_rpc_luns(struct rpc_luns *p)
+{
+ size_t i;
+
+ for (i = 0; i < p->num_luns; i++) {
+ free(p->luns[i].bdev_name);
+ }
+}
+
+struct rpc_target_node {
+ char *name;
+ char *alias_name;
+
+ struct rpc_pg_ig_maps pg_ig_maps;
+ struct rpc_luns luns;
+
+ int32_t queue_depth;
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int32_t chap_group;
+
+ bool header_digest;
+ bool data_digest;
+};
+
+static void
+free_rpc_target_node(struct rpc_target_node *req)
+{
+ free(req->name);
+ free(req->alias_name);
+ free_rpc_luns(&req->luns);
+}
+
+static const struct spdk_json_object_decoder rpc_target_node_decoders[] = {
+ {"name", offsetof(struct rpc_target_node, name), spdk_json_decode_string},
+ {"alias_name", offsetof(struct rpc_target_node, alias_name), spdk_json_decode_string},
+ {"pg_ig_maps", offsetof(struct rpc_target_node, pg_ig_maps), decode_rpc_pg_ig_maps},
+ {"luns", offsetof(struct rpc_target_node, luns), decode_rpc_luns},
+ {"queue_depth", offsetof(struct rpc_target_node, queue_depth), spdk_json_decode_int32},
+ {"disable_chap", offsetof(struct rpc_target_node, disable_chap), spdk_json_decode_bool, true},
+ {"require_chap", offsetof(struct rpc_target_node, require_chap), spdk_json_decode_bool, true},
+ {"mutual_chap", offsetof(struct rpc_target_node, mutual_chap), spdk_json_decode_bool, true},
+ {"chap_group", offsetof(struct rpc_target_node, chap_group), spdk_json_decode_int32, true},
+ {"header_digest", offsetof(struct rpc_target_node, header_digest), spdk_json_decode_bool, true},
+ {"data_digest", offsetof(struct rpc_target_node, data_digest), spdk_json_decode_bool, true},
+};
+
+static void
+rpc_iscsi_create_target_node(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_target_node req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_tgt_node *target;
+ int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0};
+ char *bdev_names[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN] = {0};
+ int32_t lun_ids[RPC_ISCSI_CREATE_TARGET_NODE_MAX_LUN] = {0};
+ size_t i;
+
+ if (spdk_json_decode_object(params, rpc_target_node_decoders,
+ SPDK_COUNTOF(rpc_target_node_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ for (i = 0; i < req.pg_ig_maps.num_maps; i++) {
+ pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag;
+ ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag;
+ }
+
+ for (i = 0; i < req.luns.num_luns; i++) {
+ bdev_names[i] = req.luns.luns[i].bdev_name;
+ lun_ids[i] = req.luns.luns[i].lun_id;
+ }
+
+ /*
+ * Use default parameters in a few places:
+ * index = -1 : automatically pick an index for the new target node
+ * alias = NULL
+ */
+ target = iscsi_tgt_node_construct(-1, req.name, req.alias_name,
+ pg_tags,
+ ig_tags,
+ req.pg_ig_maps.num_maps,
+ (const char **)bdev_names,
+ lun_ids,
+ req.luns.num_luns,
+ req.queue_depth,
+ req.disable_chap,
+ req.require_chap,
+ req.mutual_chap,
+ req.chap_group,
+ req.header_digest,
+ req.data_digest);
+
+ if (target == NULL) {
+ goto invalid;
+ }
+
+ free_rpc_target_node(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_rpc_target_node(&req);
+}
+SPDK_RPC_REGISTER("iscsi_create_target_node", rpc_iscsi_create_target_node, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_target_node, construct_target_node)
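+
+/* Example request (editorial sketch; the parameter names follow
+ * rpc_target_node_decoders above, the concrete values are illustrative):
+ *
+ *	{
+ *	  "method": "iscsi_create_target_node",
+ *	  "params": {
+ *	    "name": "Target3",
+ *	    "alias_name": "Data Disk1",
+ *	    "pg_ig_maps": [{"pg_tag": 1, "ig_tag": 1}],
+ *	    "luns": [{"bdev_name": "Malloc0", "lun_id": 0}],
+ *	    "queue_depth": 64
+ *	  }
+ *	}
+ */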
+
+struct rpc_tgt_node_pg_ig_maps {
+ char *name;
+ struct rpc_pg_ig_maps pg_ig_maps;
+};
+
+static const struct spdk_json_object_decoder rpc_tgt_node_pg_ig_maps_decoders[] = {
+ {"name", offsetof(struct rpc_tgt_node_pg_ig_maps, name), spdk_json_decode_string},
+ {"pg_ig_maps", offsetof(struct rpc_tgt_node_pg_ig_maps, pg_ig_maps), decode_rpc_pg_ig_maps},
+};
+
+static void
+rpc_iscsi_target_node_add_pg_ig_maps(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_tgt_node_pg_ig_maps req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_tgt_node *target;
+ int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0};
+ size_t i;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_tgt_node_pg_ig_maps_decoders,
+ SPDK_COUNTOF(rpc_tgt_node_pg_ig_maps_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ target = iscsi_find_tgt_node(req.name);
+ if (target == NULL) {
+		SPDK_ERRLOG("target was not found\n");
+ goto invalid;
+ }
+
+ for (i = 0; i < req.pg_ig_maps.num_maps; i++) {
+ pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag;
+ ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag;
+ }
+
+ rc = iscsi_target_node_add_pg_ig_maps(target, pg_tags, ig_tags,
+ req.pg_ig_maps.num_maps);
+ if (rc < 0) {
+ SPDK_ERRLOG("add pg-ig maps failed\n");
+ goto invalid;
+ }
+
+ free(req.name);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ free(req.name);
+}
+SPDK_RPC_REGISTER("iscsi_target_node_add_pg_ig_maps",
+ rpc_iscsi_target_node_add_pg_ig_maps, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_add_pg_ig_maps, add_pg_ig_maps)
+
+static void
+rpc_iscsi_target_node_remove_pg_ig_maps(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_tgt_node_pg_ig_maps req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_tgt_node *target;
+ int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0};
+ size_t i;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_tgt_node_pg_ig_maps_decoders,
+ SPDK_COUNTOF(rpc_tgt_node_pg_ig_maps_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ target = iscsi_find_tgt_node(req.name);
+ if (target == NULL) {
+		SPDK_ERRLOG("target was not found\n");
+ goto invalid;
+ }
+
+ for (i = 0; i < req.pg_ig_maps.num_maps; i++) {
+ pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag;
+ ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag;
+ }
+
+ rc = iscsi_target_node_remove_pg_ig_maps(target, pg_tags, ig_tags,
+ req.pg_ig_maps.num_maps);
+ if (rc < 0) {
+ SPDK_ERRLOG("remove pg-ig maps failed\n");
+ goto invalid;
+ }
+
+ free(req.name);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ free(req.name);
+}
+SPDK_RPC_REGISTER("iscsi_target_node_remove_pg_ig_maps",
+ rpc_iscsi_target_node_remove_pg_ig_maps, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_remove_pg_ig_maps,
+ delete_pg_ig_maps)
+
+struct rpc_iscsi_delete_target_node {
+ char *name;
+};
+
+static void
+free_rpc_iscsi_delete_target_node(struct rpc_iscsi_delete_target_node *r)
+{
+ free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_iscsi_delete_target_node_decoders[] = {
+ {"name", offsetof(struct rpc_iscsi_delete_target_node, name), spdk_json_decode_string},
+};
+
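+/*
+ * Deletion is asynchronous: the handler decodes the request, shuts the target
+ * node down via iscsi_shutdown_tgt_node_by_name() and sends the JSON-RPC
+ * response from the completion callback. Illustrative request (the value is
+ * hypothetical):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_delete_target_node",
+ *  "params": {"name": "iqn.2016-06.io.spdk:Target0"}}
+ */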
+struct rpc_iscsi_delete_target_node_ctx {
+ struct rpc_iscsi_delete_target_node req;
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+rpc_iscsi_delete_target_node_done(void *cb_arg, int rc)
+{
+ struct rpc_iscsi_delete_target_node_ctx *ctx = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ free_rpc_iscsi_delete_target_node(&ctx->req);
+
+ w = spdk_jsonrpc_begin_result(ctx->request);
+ spdk_json_write_bool(w, rc == 0);
+ spdk_jsonrpc_end_result(ctx->request, w);
+
+ free(ctx);
+}
+
+static void
+rpc_iscsi_delete_target_node(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_iscsi_delete_target_node_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(ENOMEM));
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_iscsi_delete_target_node_decoders,
+ SPDK_COUNTOF(rpc_iscsi_delete_target_node_decoders),
+ &ctx->req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ if (ctx->req.name == NULL) {
+ SPDK_ERRLOG("missing name param\n");
+ goto invalid;
+ }
+
+ ctx->request = request;
+
+ iscsi_shutdown_tgt_node_by_name(ctx->req.name,
+ rpc_iscsi_delete_target_node_done, ctx);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_rpc_iscsi_delete_target_node(&ctx->req);
+ free(ctx);
+}
+SPDK_RPC_REGISTER("iscsi_delete_target_node", rpc_iscsi_delete_target_node, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_target_node, delete_target_node)
+
+static void
+rpc_iscsi_get_portal_groups(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "iscsi_get_portal_groups requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ iscsi_portal_grps_info_json(w);
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_get_portal_groups", rpc_iscsi_get_portal_groups, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_portal_groups, get_portal_groups)
+
+struct rpc_portal {
+ char *host;
+ char *port;
+};
+
+struct rpc_portal_list {
+ size_t num_portals;
+ struct rpc_portal portals[MAX_PORTAL];
+};
+
+struct rpc_portal_group {
+ int32_t tag;
+ struct rpc_portal_list portal_list;
+};
+
+static void
+free_rpc_portal(struct rpc_portal *portal)
+{
+ free(portal->host);
+ free(portal->port);
+}
+
+static void
+free_rpc_portal_list(struct rpc_portal_list *pl)
+{
+ size_t i;
+
+ for (i = 0; i < pl->num_portals; i++) {
+ free_rpc_portal(&pl->portals[i]);
+ }
+ pl->num_portals = 0;
+}
+
+static void
+free_rpc_portal_group(struct rpc_portal_group *pg)
+{
+ free_rpc_portal_list(&pg->portal_list);
+}
+
+static const struct spdk_json_object_decoder rpc_portal_decoders[] = {
+ {"host", offsetof(struct rpc_portal, host), spdk_json_decode_string},
+ {"port", offsetof(struct rpc_portal, port), spdk_json_decode_string},
+};
+
+static int
+decode_rpc_portal(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_portal *portal = out;
+
+ return spdk_json_decode_object(val, rpc_portal_decoders,
+ SPDK_COUNTOF(rpc_portal_decoders),
+ portal);
+}
+
+static int
+decode_rpc_portal_list(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_portal_list *list = out;
+
+ return spdk_json_decode_array(val, decode_rpc_portal, list->portals, MAX_PORTAL, &list->num_portals,
+ sizeof(struct rpc_portal));
+}
+
+static const struct spdk_json_object_decoder rpc_portal_group_decoders[] = {
+ {"tag", offsetof(struct rpc_portal_group, tag), spdk_json_decode_int32},
+ {"portals", offsetof(struct rpc_portal_group, portal_list), decode_rpc_portal_list},
+};
+
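+/*
+ * Illustrative request for iscsi_create_portal_group (values are
+ * hypothetical; key names follow the decoders above):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_create_portal_group",
+ *  "params": {"tag": 1, "portals": [{"host": "127.0.0.1", "port": "3260"}]}}
+ */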
+static void
+rpc_iscsi_create_portal_group(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_portal_group req = {};
+ struct spdk_iscsi_portal_grp *pg = NULL;
+ struct spdk_iscsi_portal *portal;
+ struct spdk_json_write_ctx *w;
+ size_t i = 0;
+ int rc = -1;
+
+ if (spdk_json_decode_object(params, rpc_portal_group_decoders,
+ SPDK_COUNTOF(rpc_portal_group_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto out;
+ }
+
+ pg = iscsi_portal_grp_create(req.tag);
+ if (pg == NULL) {
+ SPDK_ERRLOG("portal_grp_create failed\n");
+ goto out;
+ }
+ for (i = 0; i < req.portal_list.num_portals; i++) {
+ portal = iscsi_portal_create(req.portal_list.portals[i].host,
+ req.portal_list.portals[i].port);
+ if (portal == NULL) {
+ SPDK_ERRLOG("portal_create failed\n");
+ goto out;
+ }
+ iscsi_portal_grp_add_portal(pg, portal);
+ }
+
+ rc = iscsi_portal_grp_open(pg);
+ if (rc != 0) {
+ SPDK_ERRLOG("portal_grp_open failed\n");
+ goto out;
+ }
+
+ rc = iscsi_portal_grp_register(pg);
+ if (rc != 0) {
+ SPDK_ERRLOG("portal_grp_register failed\n");
+ }
+
+out:
+ if (rc == 0) {
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ } else {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+
+ if (pg != NULL) {
+ iscsi_portal_grp_release(pg);
+ }
+ }
+ free_rpc_portal_group(&req);
+}
+SPDK_RPC_REGISTER("iscsi_create_portal_group", rpc_iscsi_create_portal_group, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_portal_group, add_portal_group)
+
+struct rpc_iscsi_delete_portal_group {
+ int32_t tag;
+};
+
+static const struct spdk_json_object_decoder rpc_iscsi_delete_portal_group_decoders[] = {
+ {"tag", offsetof(struct rpc_iscsi_delete_portal_group, tag), spdk_json_decode_int32},
+};
+
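+/*
+ * Illustrative request (the tag value is hypothetical):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_delete_portal_group",
+ *  "params": {"tag": 1}}
+ */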
+static void
+rpc_iscsi_delete_portal_group(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_iscsi_delete_portal_group req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_portal_grp *pg;
+
+ if (spdk_json_decode_object(params, rpc_iscsi_delete_portal_group_decoders,
+ SPDK_COUNTOF(rpc_iscsi_delete_portal_group_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ pg = iscsi_portal_grp_unregister(req.tag);
+ if (!pg) {
+ goto invalid;
+ }
+
+ iscsi_tgt_node_delete_map(pg, NULL);
+ iscsi_portal_grp_release(pg);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+}
+SPDK_RPC_REGISTER("iscsi_delete_portal_group", rpc_iscsi_delete_portal_group, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_portal_group, delete_portal_group)
+
+struct rpc_portal_group_auth {
+ int32_t tag;
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int32_t chap_group;
+};
+
+static const struct spdk_json_object_decoder rpc_portal_group_auth_decoders[] = {
+ {"tag", offsetof(struct rpc_portal_group_auth, tag), spdk_json_decode_int32},
+ {"disable_chap", offsetof(struct rpc_portal_group_auth, disable_chap), spdk_json_decode_bool, true},
+ {"require_chap", offsetof(struct rpc_portal_group_auth, require_chap), spdk_json_decode_bool, true},
+ {"mutual_chap", offsetof(struct rpc_portal_group_auth, mutual_chap), spdk_json_decode_bool, true},
+ {"chap_group", offsetof(struct rpc_portal_group_auth, chap_group), spdk_json_decode_int32, true},
+};
+
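+/*
+ * Illustrative request; all CHAP fields are optional and the values shown
+ * are hypothetical:
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_portal_group_set_auth",
+ *  "params": {"tag": 1, "require_chap": true, "chap_group": 1}}
+ */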
+static void
+rpc_iscsi_portal_group_set_auth(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_portal_group_auth req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_portal_grp *pg;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_portal_group_auth_decoders,
+ SPDK_COUNTOF(rpc_portal_group_auth_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ pg = iscsi_portal_grp_find_by_tag(req.tag);
+ if (pg == NULL) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not find portal group %d", req.tag);
+ goto exit;
+ }
+
+ rc = iscsi_portal_grp_set_chap_params(pg, req.disable_chap, req.require_chap,
+ req.mutual_chap, req.chap_group);
+ if (rc < 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid combination of auth params");
+ goto exit;
+ }
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+ return;
+
+exit:
+ pthread_mutex_unlock(&g_iscsi.mutex);
+}
+SPDK_RPC_REGISTER("iscsi_portal_group_set_auth", rpc_iscsi_portal_group_set_auth,
+ SPDK_RPC_RUNTIME)
+
+struct rpc_iscsi_get_connections_ctx {
+ struct spdk_jsonrpc_request *request;
+ struct spdk_json_write_ctx *w;
+};
+
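+/*
+ * iscsi_get_connections walks every iSCSI poll group via
+ * spdk_for_each_channel(): the per-channel callback below appends that poll
+ * group's connections to a shared JSON array, and the completion callback
+ * closes the array and sends the response.
+ */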
+static void
+_rpc_iscsi_get_connections_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct rpc_iscsi_get_connections_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ spdk_json_write_array_end(ctx->w);
+ spdk_jsonrpc_end_result(ctx->request, ctx->w);
+
+ free(ctx);
+}
+
+static void
+_rpc_iscsi_get_connections(struct spdk_io_channel_iter *i)
+{
+ struct rpc_iscsi_get_connections_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_iscsi_poll_group *pg = spdk_io_channel_get_ctx(ch);
+ struct spdk_iscsi_conn *conn;
+
+ STAILQ_FOREACH(conn, &pg->connections, pg_link) {
+ iscsi_conn_info_json(ctx->w, conn);
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static void
+rpc_iscsi_get_connections(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_iscsi_get_connections_ctx *ctx;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "iscsi_get_connections requires no parameters");
+ return;
+ }
+
+ ctx = calloc(1, sizeof(struct rpc_iscsi_get_connections_ctx));
+ if (ctx == NULL) {
+		SPDK_ERRLOG("Failed to allocate rpc_iscsi_get_connections_ctx struct\n");
+ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM));
+ return;
+ }
+
+ ctx->request = request;
+ ctx->w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_array_begin(ctx->w);
+
+ spdk_for_each_channel(&g_iscsi,
+ _rpc_iscsi_get_connections,
+ ctx,
+ _rpc_iscsi_get_connections_done);
+}
+SPDK_RPC_REGISTER("iscsi_get_connections", rpc_iscsi_get_connections, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_connections, get_iscsi_connections)
+
+struct rpc_target_lun {
+ char *name;
+ char *bdev_name;
+ int32_t lun_id;
+};
+
+static void
+free_rpc_target_lun(struct rpc_target_lun *req)
+{
+ free(req->name);
+ free(req->bdev_name);
+}
+
+static const struct spdk_json_object_decoder rpc_target_lun_decoders[] = {
+ {"name", offsetof(struct rpc_target_lun, name), spdk_json_decode_string},
+ {"bdev_name", offsetof(struct rpc_target_lun, bdev_name), spdk_json_decode_string},
+ {"lun_id", offsetof(struct rpc_target_lun, lun_id), spdk_json_decode_int32, true},
+};
+
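+/*
+ * Illustrative request (values are hypothetical). "lun_id" is optional and
+ * defaults to -1 when omitted, as set up in the handler below:
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_target_node_add_lun",
+ *  "params": {"name": "iqn.2016-06.io.spdk:Target0",
+ *             "bdev_name": "Malloc1", "lun_id": 1}}
+ */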
+static void
+rpc_iscsi_target_node_add_lun(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_target_lun req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_tgt_node *target;
+ int rc;
+
+ req.lun_id = -1;
+
+ if (spdk_json_decode_object(params, rpc_target_lun_decoders,
+ SPDK_COUNTOF(rpc_target_lun_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ target = iscsi_find_tgt_node(req.name);
+ if (target == NULL) {
+		SPDK_ERRLOG("target was not found\n");
+ goto invalid;
+ }
+
+ rc = iscsi_tgt_node_add_lun(target, req.bdev_name, req.lun_id);
+ if (rc < 0) {
+ SPDK_ERRLOG("add lun failed\n");
+ goto invalid;
+ }
+
+ free_rpc_target_lun(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ free_rpc_target_lun(&req);
+}
+SPDK_RPC_REGISTER("iscsi_target_node_add_lun", rpc_iscsi_target_node_add_lun, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_add_lun, target_node_add_lun)
+
+struct rpc_target_auth {
+ char *name;
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int32_t chap_group;
+};
+
+static void
+free_rpc_target_auth(struct rpc_target_auth *req)
+{
+ free(req->name);
+}
+
+static const struct spdk_json_object_decoder rpc_target_auth_decoders[] = {
+ {"name", offsetof(struct rpc_target_auth, name), spdk_json_decode_string},
+ {"disable_chap", offsetof(struct rpc_target_auth, disable_chap), spdk_json_decode_bool, true},
+ {"require_chap", offsetof(struct rpc_target_auth, require_chap), spdk_json_decode_bool, true},
+ {"mutual_chap", offsetof(struct rpc_target_auth, mutual_chap), spdk_json_decode_bool, true},
+ {"chap_group", offsetof(struct rpc_target_auth, chap_group), spdk_json_decode_int32, true},
+};
+
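+/*
+ * Illustrative request (values are hypothetical; all CHAP fields are
+ * optional):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_target_node_set_auth",
+ *  "params": {"name": "iqn.2016-06.io.spdk:Target0",
+ *             "require_chap": true, "chap_group": 1}}
+ */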
+static void
+rpc_iscsi_target_node_set_auth(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_target_auth req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_tgt_node *target;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_target_auth_decoders,
+ SPDK_COUNTOF(rpc_target_auth_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ goto exit;
+ }
+
+ target = iscsi_find_tgt_node(req.name);
+ if (target == NULL) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not find target %s", req.name);
+ goto exit;
+ }
+
+ rc = iscsi_tgt_node_set_chap_params(target, req.disable_chap, req.require_chap,
+ req.mutual_chap, req.chap_group);
+ if (rc < 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid combination of auth params");
+ goto exit;
+ }
+
+ free_rpc_target_auth(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+exit:
+ free_rpc_target_auth(&req);
+}
+SPDK_RPC_REGISTER("iscsi_target_node_set_auth", rpc_iscsi_target_node_set_auth,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_target_node_set_auth, set_iscsi_target_node_auth)
+
+static void
+rpc_iscsi_get_options(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "iscsi_get_options requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ iscsi_opts_info_json(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_get_options", rpc_iscsi_get_options, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_options, get_iscsi_global_params)
+
+struct rpc_discovery_auth {
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int32_t chap_group;
+};
+
+static const struct spdk_json_object_decoder rpc_discovery_auth_decoders[] = {
+ {"disable_chap", offsetof(struct rpc_discovery_auth, disable_chap), spdk_json_decode_bool, true},
+ {"require_chap", offsetof(struct rpc_discovery_auth, require_chap), spdk_json_decode_bool, true},
+ {"mutual_chap", offsetof(struct rpc_discovery_auth, mutual_chap), spdk_json_decode_bool, true},
+ {"chap_group", offsetof(struct rpc_discovery_auth, chap_group), spdk_json_decode_int32, true},
+};
+
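+/*
+ * Illustrative request for iscsi_set_discovery_auth (values are hypothetical;
+ * every field is optional):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_set_discovery_auth",
+ *  "params": {"require_chap": true, "mutual_chap": false, "chap_group": 1}}
+ */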
+static void
+rpc_iscsi_set_discovery_auth(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_discovery_auth req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_discovery_auth_decoders,
+ SPDK_COUNTOF(rpc_discovery_auth_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ rc = iscsi_set_discovery_auth(req.disable_chap, req.require_chap,
+ req.mutual_chap, req.chap_group);
+ if (rc < 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid combination of CHAP params");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_set_discovery_auth", rpc_iscsi_set_discovery_auth, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_set_discovery_auth, set_iscsi_discovery_auth)
+
+#define MAX_AUTH_SECRETS 64
+
+struct rpc_auth_secret {
+ char *user;
+ char *secret;
+ char *muser;
+ char *msecret;
+};
+
+static void
+free_rpc_auth_secret(struct rpc_auth_secret *_secret)
+{
+ free(_secret->user);
+ free(_secret->secret);
+ free(_secret->muser);
+ free(_secret->msecret);
+}
+
+static const struct spdk_json_object_decoder rpc_auth_secret_decoders[] = {
+ {"user", offsetof(struct rpc_auth_secret, user), spdk_json_decode_string},
+ {"secret", offsetof(struct rpc_auth_secret, secret), spdk_json_decode_string},
+ {"muser", offsetof(struct rpc_auth_secret, muser), spdk_json_decode_string, true},
+ {"msecret", offsetof(struct rpc_auth_secret, msecret), spdk_json_decode_string, true},
+};
+
+static int
+decode_rpc_auth_secret(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_auth_secret *_secret = out;
+
+ return spdk_json_decode_object(val, rpc_auth_secret_decoders,
+ SPDK_COUNTOF(rpc_auth_secret_decoders), _secret);
+}
+
+struct rpc_auth_secrets {
+ size_t num_secret;
+ struct rpc_auth_secret secrets[MAX_AUTH_SECRETS];
+};
+
+static void
+free_rpc_auth_secrets(struct rpc_auth_secrets *secrets)
+{
+ size_t i;
+
+ for (i = 0; i < secrets->num_secret; i++) {
+ free_rpc_auth_secret(&secrets->secrets[i]);
+ }
+}
+
+static int
+decode_rpc_auth_secrets(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_auth_secrets *secrets = out;
+
+ return spdk_json_decode_array(val, decode_rpc_auth_secret, secrets->secrets,
+ MAX_AUTH_SECRETS, &secrets->num_secret,
+ sizeof(struct rpc_auth_secret));
+}
+
+struct rpc_auth_group {
+ int32_t tag;
+ struct rpc_auth_secrets secrets;
+};
+
+static void
+free_rpc_auth_group(struct rpc_auth_group *group)
+{
+ free_rpc_auth_secrets(&group->secrets);
+}
+
+static const struct spdk_json_object_decoder rpc_auth_group_decoders[] = {
+ {"tag", offsetof(struct rpc_auth_group, tag), spdk_json_decode_int32},
+ {"secrets", offsetof(struct rpc_auth_group, secrets), decode_rpc_auth_secrets, true},
+};
+
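+/*
+ * Illustrative request (values are hypothetical; "secrets" is optional, and
+ * "muser"/"msecret" within each secret are optional as well):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_create_auth_group",
+ *  "params": {"tag": 1, "secrets": [{"user": "user1", "secret": "secret1",
+ *                                    "muser": "muser1", "msecret": "msecret1"}]}}
+ */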
+static void
+rpc_iscsi_create_auth_group(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_auth_group req = {};
+ struct rpc_auth_secret *_secret;
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_auth_group *group = NULL;
+ int rc;
+ size_t i;
+
+ if (spdk_json_decode_object(params, rpc_auth_group_decoders,
+ SPDK_COUNTOF(rpc_auth_group_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ free_rpc_auth_group(&req);
+ return;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ rc = iscsi_add_auth_group(req.tag, &group);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not add auth group (%d), %s",
+ req.tag, spdk_strerror(-rc));
+ free_rpc_auth_group(&req);
+ return;
+ }
+
+ for (i = 0; i < req.secrets.num_secret; i++) {
+ _secret = &req.secrets.secrets[i];
+ rc = iscsi_auth_group_add_secret(group, _secret->user, _secret->secret,
+ _secret->muser, _secret->msecret);
+ if (rc != 0) {
+ iscsi_delete_auth_group(group);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not add secret to auth group (%d), %s",
+ req.tag, spdk_strerror(-rc));
+ free_rpc_auth_group(&req);
+ return;
+ }
+ }
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ free_rpc_auth_group(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_create_auth_group", rpc_iscsi_create_auth_group, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_create_auth_group, add_iscsi_auth_group)
+
+struct rpc_delete_auth_group {
+ int32_t tag;
+};
+
+static const struct spdk_json_object_decoder rpc_delete_auth_group_decoders[] = {
+ {"tag", offsetof(struct rpc_delete_auth_group, tag), spdk_json_decode_int32},
+};
+
+static void
+rpc_iscsi_delete_auth_group(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_auth_group req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_auth_group *group;
+
+ if (spdk_json_decode_object(params, rpc_delete_auth_group_decoders,
+ SPDK_COUNTOF(rpc_delete_auth_group_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ group = iscsi_find_auth_group_by_tag(req.tag);
+ if (group == NULL) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not find auth group (%d)", req.tag);
+ return;
+ }
+
+ iscsi_delete_auth_group(group);
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_delete_auth_group", rpc_iscsi_delete_auth_group, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_delete_auth_group, delete_iscsi_auth_group)
+
+struct rpc_add_auth_secret {
+ int32_t tag;
+ char *user;
+ char *secret;
+ char *muser;
+ char *msecret;
+};
+
+static void
+free_rpc_add_auth_secret(struct rpc_add_auth_secret *_secret)
+{
+ free(_secret->user);
+ free(_secret->secret);
+ free(_secret->muser);
+ free(_secret->msecret);
+}
+
+static const struct spdk_json_object_decoder rpc_add_auth_secret_decoders[] = {
+ {"tag", offsetof(struct rpc_add_auth_secret, tag), spdk_json_decode_int32},
+ {"user", offsetof(struct rpc_add_auth_secret, user), spdk_json_decode_string},
+ {"secret", offsetof(struct rpc_add_auth_secret, secret), spdk_json_decode_string},
+ {"muser", offsetof(struct rpc_add_auth_secret, muser), spdk_json_decode_string, true},
+ {"msecret", offsetof(struct rpc_add_auth_secret, msecret), spdk_json_decode_string, true},
+};
+
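+/*
+ * Illustrative request (values are hypothetical; "muser"/"msecret" are
+ * optional):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_auth_group_add_secret",
+ *  "params": {"tag": 1, "user": "user2", "secret": "secret2"}}
+ */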
+static void
+rpc_iscsi_auth_group_add_secret(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_add_auth_secret req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_auth_group *group;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_add_auth_secret_decoders,
+ SPDK_COUNTOF(rpc_add_auth_secret_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ free_rpc_add_auth_secret(&req);
+ return;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ group = iscsi_find_auth_group_by_tag(req.tag);
+ if (group == NULL) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not find auth group (%d)", req.tag);
+ free_rpc_add_auth_secret(&req);
+ return;
+ }
+
+ rc = iscsi_auth_group_add_secret(group, req.user, req.secret, req.muser, req.msecret);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not add secret to auth group (%d), %s",
+ req.tag, spdk_strerror(-rc));
+ free_rpc_add_auth_secret(&req);
+ return;
+ }
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ free_rpc_add_auth_secret(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_auth_group_add_secret", rpc_iscsi_auth_group_add_secret,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_auth_group_add_secret, add_secret_to_iscsi_auth_group)
+
+struct rpc_remove_auth_secret {
+ int32_t tag;
+ char *user;
+};
+
+static void
+free_rpc_remove_auth_secret(struct rpc_remove_auth_secret *_secret)
+{
+ free(_secret->user);
+}
+
+static const struct spdk_json_object_decoder rpc_remove_auth_secret_decoders[] = {
+ {"tag", offsetof(struct rpc_remove_auth_secret, tag), spdk_json_decode_int32},
+ {"user", offsetof(struct rpc_remove_auth_secret, user), spdk_json_decode_string},
+};
+
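+/*
+ * Illustrative request (values are hypothetical):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_auth_group_remove_secret",
+ *  "params": {"tag": 1, "user": "user2"}}
+ */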
+static void
+rpc_iscsi_auth_group_remove_secret(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_remove_auth_secret req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_iscsi_auth_group *group;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_remove_auth_secret_decoders,
+ SPDK_COUNTOF(rpc_remove_auth_secret_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ free_rpc_remove_auth_secret(&req);
+ return;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ group = iscsi_find_auth_group_by_tag(req.tag);
+ if (group == NULL) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not find auth group (%d)", req.tag);
+ free_rpc_remove_auth_secret(&req);
+ return;
+ }
+
+ rc = iscsi_auth_group_delete_secret(group, req.user);
+ if (rc != 0) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Could not delete secret from CHAP group (%d), %s",
+ req.tag, spdk_strerror(-rc));
+ free_rpc_remove_auth_secret(&req);
+ return;
+ }
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ free_rpc_remove_auth_secret(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_auth_group_remove_secret",
+ rpc_iscsi_auth_group_remove_secret, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_auth_group_remove_secret,
+ delete_secret_from_iscsi_auth_group)
+
+static void
+rpc_iscsi_get_auth_groups(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "iscsi_get_auth_groups requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ iscsi_auth_groups_info_json(w);
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_get_auth_groups", rpc_iscsi_get_auth_groups, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_get_auth_groups, get_iscsi_auth_groups)
+
+static const struct spdk_json_object_decoder rpc_set_iscsi_opts_decoders[] = {
+ {"auth_file", offsetof(struct spdk_iscsi_opts, authfile), spdk_json_decode_string, true},
+ {"node_base", offsetof(struct spdk_iscsi_opts, nodebase), spdk_json_decode_string, true},
+ {"nop_timeout", offsetof(struct spdk_iscsi_opts, timeout), spdk_json_decode_int32, true},
+ {"nop_in_interval", offsetof(struct spdk_iscsi_opts, nopininterval), spdk_json_decode_int32, true},
+ {"no_discovery_auth", offsetof(struct spdk_iscsi_opts, disable_chap), spdk_json_decode_bool, true},
+ {"req_discovery_auth", offsetof(struct spdk_iscsi_opts, require_chap), spdk_json_decode_bool, true},
+ {"req_discovery_auth_mutual", offsetof(struct spdk_iscsi_opts, mutual_chap), spdk_json_decode_bool, true},
+ {"discovery_auth_group", offsetof(struct spdk_iscsi_opts, chap_group), spdk_json_decode_int32, true},
+ {"disable_chap", offsetof(struct spdk_iscsi_opts, disable_chap), spdk_json_decode_bool, true},
+ {"require_chap", offsetof(struct spdk_iscsi_opts, require_chap), spdk_json_decode_bool, true},
+ {"mutual_chap", offsetof(struct spdk_iscsi_opts, mutual_chap), spdk_json_decode_bool, true},
+ {"chap_group", offsetof(struct spdk_iscsi_opts, chap_group), spdk_json_decode_int32, true},
+ {"max_sessions", offsetof(struct spdk_iscsi_opts, MaxSessions), spdk_json_decode_uint32, true},
+ {"max_queue_depth", offsetof(struct spdk_iscsi_opts, MaxQueueDepth), spdk_json_decode_uint32, true},
+ {"max_connections_per_session", offsetof(struct spdk_iscsi_opts, MaxConnectionsPerSession), spdk_json_decode_uint32, true},
+ {"default_time2wait", offsetof(struct spdk_iscsi_opts, DefaultTime2Wait), spdk_json_decode_uint32, true},
+ {"default_time2retain", offsetof(struct spdk_iscsi_opts, DefaultTime2Retain), spdk_json_decode_uint32, true},
+ {"first_burst_length", offsetof(struct spdk_iscsi_opts, FirstBurstLength), spdk_json_decode_uint32, true},
+ {"immediate_data", offsetof(struct spdk_iscsi_opts, ImmediateData), spdk_json_decode_bool, true},
+ {"error_recovery_level", offsetof(struct spdk_iscsi_opts, ErrorRecoveryLevel), spdk_json_decode_uint32, true},
+ {"allow_duplicated_isid", offsetof(struct spdk_iscsi_opts, AllowDuplicateIsid), spdk_json_decode_bool, true},
+};
+
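+/*
+ * iscsi_set_options is registered as a startup RPC (SPDK_RPC_STARTUP) and
+ * must not be called more than once. Illustrative request (all fields are
+ * optional; values are hypothetical):
+ *
+ * {"jsonrpc": "2.0", "id": 1, "method": "iscsi_set_options",
+ *  "params": {"node_base": "iqn.2016-06.io.spdk", "max_sessions": 128,
+ *             "max_queue_depth": 64, "immediate_data": true}}
+ */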
+static void
+rpc_iscsi_set_options(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_iscsi_opts *opts;
+ struct spdk_json_write_ctx *w;
+
+ if (g_spdk_iscsi_opts != NULL) {
+ SPDK_ERRLOG("this RPC must not be called more than once.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Must not call more than once");
+ return;
+ }
+
+ opts = iscsi_opts_alloc();
+ if (opts == NULL) {
+ SPDK_ERRLOG("iscsi_opts_alloc() failed.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Out of memory");
+ return;
+ }
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_set_iscsi_opts_decoders,
+ SPDK_COUNTOF(rpc_set_iscsi_opts_decoders), opts)) {
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ iscsi_opts_free(opts);
+ return;
+ }
+ }
+
+ g_spdk_iscsi_opts = iscsi_opts_copy(opts);
+ iscsi_opts_free(opts);
+
+ if (g_spdk_iscsi_opts == NULL) {
+ SPDK_ERRLOG("iscsi_opts_copy() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Out of memory");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("iscsi_set_options", rpc_iscsi_set_options, SPDK_RPC_STARTUP)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(iscsi_set_options, set_iscsi_options)
diff --git a/src/spdk/lib/iscsi/iscsi_subsystem.c b/src/spdk/lib/iscsi/iscsi_subsystem.c
new file mode 100644
index 000000000..1eb766233
--- /dev/null
+++ b/src/spdk/lib/iscsi/iscsi_subsystem.c
@@ -0,0 +1,1577 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/env.h"
+#include "spdk/string.h"
+#include "spdk/sock.h"
+#include "spdk/likely.h"
+
+#include "iscsi/iscsi.h"
+#include "iscsi/init_grp.h"
+#include "iscsi/portal_grp.h"
+#include "iscsi/conn.h"
+#include "iscsi/task.h"
+#include "iscsi/tgt_node.h"
+
+#include "spdk_internal/event.h"
+#include "spdk_internal/log.h"
+
+struct spdk_iscsi_opts *g_spdk_iscsi_opts = NULL;
+
+static struct spdk_thread *g_init_thread = NULL;
+static spdk_iscsi_init_cb g_init_cb_fn = NULL;
+static void *g_init_cb_arg = NULL;
+
+static spdk_iscsi_fini_cb g_fini_cb_fn;
+static void *g_fini_cb_arg;
+
+#define ISCSI_CONFIG_TMPL \
+"[iSCSI]\n" \
+"  # node name (does not include the optional part)\n" \
+" # Users can optionally change this to fit their environment.\n" \
+" NodeBase \"%s\"\n" \
+"\n" \
+" # files\n" \
+" %s %s\n" \
+"\n" \
+" # socket I/O timeout sec. (polling is infinity)\n" \
+" Timeout %d\n" \
+"\n" \
+" # authentication information for discovery session\n" \
+" DiscoveryAuthMethod %s\n" \
+" DiscoveryAuthGroup %s\n" \
+"\n" \
+" MaxSessions %d\n" \
+" MaxConnectionsPerSession %d\n" \
+" MaxConnections %d\n" \
+" MaxQueueDepth %d\n" \
+"\n" \
+"  # iSCSI initial parameters negotiated with initiators\n" \
+"  # NOTE: incorrect values might cause crashes\n" \
+" DefaultTime2Wait %d\n" \
+" DefaultTime2Retain %d\n" \
+"\n" \
+" FirstBurstLength %d\n" \
+" ImmediateData %s\n" \
+" ErrorRecoveryLevel %d\n" \
+"\n"
+
+static void
+iscsi_globals_config_text(FILE *fp)
+{
+ const char *authmethod = "None";
+ char authgroup[32] = "None";
+
+ if (NULL == fp) {
+ return;
+ }
+
+ if (g_iscsi.require_chap) {
+ authmethod = "CHAP";
+ } else if (g_iscsi.mutual_chap) {
+ authmethod = "CHAP Mutual";
+ } else if (!g_iscsi.disable_chap) {
+ authmethod = "Auto";
+ }
+
+ if (g_iscsi.chap_group) {
+ snprintf(authgroup, sizeof(authgroup), "AuthGroup%d", g_iscsi.chap_group);
+ }
+
+ fprintf(fp, ISCSI_CONFIG_TMPL,
+ g_iscsi.nodebase,
+ g_iscsi.authfile ? "AuthFile" : "",
+ g_iscsi.authfile ? g_iscsi.authfile : "",
+ g_iscsi.timeout, authmethod, authgroup,
+ g_iscsi.MaxSessions, g_iscsi.MaxConnectionsPerSession,
+ g_iscsi.MaxConnections,
+ g_iscsi.MaxQueueDepth,
+ g_iscsi.DefaultTime2Wait, g_iscsi.DefaultTime2Retain,
+ g_iscsi.FirstBurstLength,
+ (g_iscsi.ImmediateData) ? "Yes" : "No",
+ g_iscsi.ErrorRecoveryLevel);
+}
+
+#define ISCSI_DATA_BUFFER_ALIGNMENT (0x1000)
+#define ISCSI_DATA_BUFFER_MASK (ISCSI_DATA_BUFFER_ALIGNMENT - 1)
+
+static void
+mobj_ctor(struct spdk_mempool *mp, __attribute__((unused)) void *arg,
+ void *_m, __attribute__((unused)) unsigned i)
+{
+ struct spdk_mobj *m = _m;
+
+ m->mp = mp;
+ m->buf = (uint8_t *)m + sizeof(struct spdk_mobj);
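+	/*
+	 * Round the data buffer up to the next ISCSI_DATA_BUFFER_ALIGNMENT
+	 * (4 KiB) boundary; the pool element sizes used below reserve an
+	 * extra alignment's worth of space to make this possible.
+	 */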
+ m->buf = (void *)((unsigned long)((uint8_t *)m->buf + ISCSI_DATA_BUFFER_ALIGNMENT) &
+ ~ISCSI_DATA_BUFFER_MASK);
+}
+
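+/*
+ * Pool sizing, per the macros below: each connection gets
+ * 2 * (MaxQueueDepth + MAX_LARGE_DATAIN_PER_CONNECTION + 8) PDUs,
+ * 128 immediate-data buffers and MAX_DATA_OUT_PER_CONNECTION data-out
+ * buffers, each scaled by MaxConnections.
+ */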
+#define NUM_PDU_PER_CONNECTION(iscsi) (2 * (iscsi->MaxQueueDepth + MAX_LARGE_DATAIN_PER_CONNECTION + 8))
+#define PDU_POOL_SIZE(iscsi) (iscsi->MaxConnections * NUM_PDU_PER_CONNECTION(iscsi))
+#define IMMEDIATE_DATA_POOL_SIZE(iscsi) (iscsi->MaxConnections * 128)
+#define DATA_OUT_POOL_SIZE(iscsi) (iscsi->MaxConnections * MAX_DATA_OUT_PER_CONNECTION)
+
+static int
+iscsi_initialize_pdu_pool(void)
+{
+ struct spdk_iscsi_globals *iscsi = &g_iscsi;
+ int imm_mobj_size = SPDK_BDEV_BUF_SIZE_WITH_MD(iscsi_get_max_immediate_data_size()) +
+ sizeof(struct spdk_mobj) + ISCSI_DATA_BUFFER_ALIGNMENT;
+ int dout_mobj_size = SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH) +
+ sizeof(struct spdk_mobj) + ISCSI_DATA_BUFFER_ALIGNMENT;
+
+ /* create PDU pool */
+ iscsi->pdu_pool = spdk_mempool_create("PDU_Pool",
+ PDU_POOL_SIZE(iscsi),
+ sizeof(struct spdk_iscsi_pdu),
+ 256, SPDK_ENV_SOCKET_ID_ANY);
+ if (!iscsi->pdu_pool) {
+ SPDK_ERRLOG("create PDU pool failed\n");
+ return -1;
+ }
+
+ iscsi->pdu_immediate_data_pool = spdk_mempool_create_ctor("PDU_immediate_data_Pool",
+ IMMEDIATE_DATA_POOL_SIZE(iscsi),
+ imm_mobj_size, 256,
+ SPDK_ENV_SOCKET_ID_ANY,
+ mobj_ctor, NULL);
+ if (!iscsi->pdu_immediate_data_pool) {
+ SPDK_ERRLOG("create PDU immediate data pool failed\n");
+ return -1;
+ }
+
+ iscsi->pdu_data_out_pool = spdk_mempool_create_ctor("PDU_data_out_Pool",
+ DATA_OUT_POOL_SIZE(iscsi),
+ dout_mobj_size, 256,
+ SPDK_ENV_SOCKET_ID_ANY,
+ mobj_ctor, NULL);
+ if (!iscsi->pdu_data_out_pool) {
+ SPDK_ERRLOG("create PDU data out pool failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+iscsi_sess_ctor(struct spdk_mempool *pool, void *arg, void *session_buf,
+ unsigned index)
+{
+ struct spdk_iscsi_globals *iscsi = arg;
+ struct spdk_iscsi_sess *sess = session_buf;
+
+ iscsi->session[index] = sess;
+
+ /* tsih 0 is reserved, so start tsih values at 1. */
+ sess->tsih = index + 1;
+}
+
+#define DEFAULT_TASK_POOL_SIZE 32768
+
+static int
+iscsi_initialize_task_pool(void)
+{
+ struct spdk_iscsi_globals *iscsi = &g_iscsi;
+
+ /* create scsi_task pool */
+ iscsi->task_pool = spdk_mempool_create("SCSI_TASK_Pool",
+ DEFAULT_TASK_POOL_SIZE,
+ sizeof(struct spdk_iscsi_task),
+ 128, SPDK_ENV_SOCKET_ID_ANY);
+ if (!iscsi->task_pool) {
+ SPDK_ERRLOG("create task pool failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+#define SESSION_POOL_SIZE(iscsi) (iscsi->MaxSessions)
+static int
+iscsi_initialize_session_pool(void)
+{
+ struct spdk_iscsi_globals *iscsi = &g_iscsi;
+
+ iscsi->session_pool = spdk_mempool_create_ctor("Session_Pool",
+ SESSION_POOL_SIZE(iscsi),
+ sizeof(struct spdk_iscsi_sess), 0,
+ SPDK_ENV_SOCKET_ID_ANY,
+ iscsi_sess_ctor, iscsi);
+ if (!iscsi->session_pool) {
+ SPDK_ERRLOG("create session pool failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+iscsi_initialize_all_pools(void)
+{
+ if (iscsi_initialize_pdu_pool() != 0) {
+ return -1;
+ }
+
+ if (iscsi_initialize_session_pool() != 0) {
+ return -1;
+ }
+
+ if (iscsi_initialize_task_pool() != 0) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+iscsi_check_pool(struct spdk_mempool *pool, size_t count)
+{
+ if (pool && spdk_mempool_count(pool) != count) {
+ SPDK_ERRLOG("spdk_mempool_count(%s) == %zu, should be %zu\n",
+ spdk_mempool_get_name(pool), spdk_mempool_count(pool), count);
+ }
+}
+
+static void
+iscsi_check_pools(void)
+{
+ struct spdk_iscsi_globals *iscsi = &g_iscsi;
+
+ iscsi_check_pool(iscsi->pdu_pool, PDU_POOL_SIZE(iscsi));
+ iscsi_check_pool(iscsi->session_pool, SESSION_POOL_SIZE(iscsi));
+ iscsi_check_pool(iscsi->pdu_immediate_data_pool, IMMEDIATE_DATA_POOL_SIZE(iscsi));
+ iscsi_check_pool(iscsi->pdu_data_out_pool, DATA_OUT_POOL_SIZE(iscsi));
+ iscsi_check_pool(iscsi->task_pool, DEFAULT_TASK_POOL_SIZE);
+}
+
+static void
+iscsi_free_pools(void)
+{
+ struct spdk_iscsi_globals *iscsi = &g_iscsi;
+
+ spdk_mempool_free(iscsi->pdu_pool);
+ spdk_mempool_free(iscsi->session_pool);
+ spdk_mempool_free(iscsi->pdu_immediate_data_pool);
+ spdk_mempool_free(iscsi->pdu_data_out_pool);
+ spdk_mempool_free(iscsi->task_pool);
+}
+
+void iscsi_put_pdu(struct spdk_iscsi_pdu *pdu)
+{
+ if (!pdu) {
+ return;
+ }
+
+ assert(pdu->ref > 0);
+ pdu->ref--;
+
+ if (pdu->ref == 0) {
+ if (pdu->mobj) {
+ spdk_mempool_put(pdu->mobj->mp, (void *)pdu->mobj);
+ }
+
+ if (pdu->data && !pdu->data_from_mempool) {
+ free(pdu->data);
+ }
+
+ spdk_mempool_put(g_iscsi.pdu_pool, (void *)pdu);
+ }
+}
+
+struct spdk_iscsi_pdu *iscsi_get_pdu(struct spdk_iscsi_conn *conn)
+{
+ struct spdk_iscsi_pdu *pdu;
+
+ assert(conn != NULL);
+ pdu = spdk_mempool_get(g_iscsi.pdu_pool);
+ if (!pdu) {
+ SPDK_ERRLOG("Unable to get PDU\n");
+ abort();
+ }
+
+ /* we do not want to zero out the last part of the structure reserved for AHS and sense data */
+ memset(pdu, 0, offsetof(struct spdk_iscsi_pdu, ahs));
+ pdu->ref = 1;
+ pdu->conn = conn;
+
+ return pdu;
+}
+
+static void
+iscsi_log_globals(void)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthFile %s\n",
+ g_iscsi.authfile ? g_iscsi.authfile : "(none)");
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "NodeBase %s\n", g_iscsi.nodebase);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxSessions %d\n", g_iscsi.MaxSessions);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxConnectionsPerSession %d\n",
+ g_iscsi.MaxConnectionsPerSession);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxQueueDepth %d\n", g_iscsi.MaxQueueDepth);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "DefaultTime2Wait %d\n",
+ g_iscsi.DefaultTime2Wait);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "DefaultTime2Retain %d\n",
+ g_iscsi.DefaultTime2Retain);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "FirstBurstLength %d\n",
+ g_iscsi.FirstBurstLength);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ImmediateData %s\n",
+ g_iscsi.ImmediateData ? "Yes" : "No");
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AllowDuplicateIsid %s\n",
+ g_iscsi.AllowDuplicateIsid ? "Yes" : "No");
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ErrorRecoveryLevel %d\n",
+ g_iscsi.ErrorRecoveryLevel);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Timeout %d\n", g_iscsi.timeout);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "NopInInterval %d\n",
+ g_iscsi.nopininterval);
+ if (g_iscsi.disable_chap) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "DiscoveryAuthMethod None\n");
+ } else if (!g_iscsi.require_chap) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "DiscoveryAuthMethod Auto\n");
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "DiscoveryAuthMethod %s %s\n",
+ g_iscsi.require_chap ? "CHAP" : "",
+ g_iscsi.mutual_chap ? "Mutual" : "");
+ }
+
+ if (g_iscsi.chap_group == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "DiscoveryAuthGroup None\n");
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "DiscoveryAuthGroup AuthGroup%d\n",
+ g_iscsi.chap_group);
+ }
+}
+
+static void
+iscsi_opts_init(struct spdk_iscsi_opts *opts)
+{
+ opts->MaxSessions = DEFAULT_MAX_SESSIONS;
+ opts->MaxConnectionsPerSession = DEFAULT_MAX_CONNECTIONS_PER_SESSION;
+ opts->MaxQueueDepth = DEFAULT_MAX_QUEUE_DEPTH;
+ opts->DefaultTime2Wait = DEFAULT_DEFAULTTIME2WAIT;
+ opts->DefaultTime2Retain = DEFAULT_DEFAULTTIME2RETAIN;
+ opts->FirstBurstLength = SPDK_ISCSI_FIRST_BURST_LENGTH;
+ opts->ImmediateData = DEFAULT_IMMEDIATEDATA;
+ opts->AllowDuplicateIsid = false;
+ opts->ErrorRecoveryLevel = DEFAULT_ERRORRECOVERYLEVEL;
+ opts->timeout = DEFAULT_TIMEOUT;
+ opts->nopininterval = DEFAULT_NOPININTERVAL;
+ opts->disable_chap = false;
+ opts->require_chap = false;
+ opts->mutual_chap = false;
+ opts->chap_group = 0;
+ opts->authfile = NULL;
+ opts->nodebase = NULL;
+}
+
+struct spdk_iscsi_opts *
+iscsi_opts_alloc(void)
+{
+ struct spdk_iscsi_opts *opts;
+
+ opts = calloc(1, sizeof(*opts));
+ if (!opts) {
+ SPDK_ERRLOG("calloc() failed for iscsi options\n");
+ return NULL;
+ }
+
+ iscsi_opts_init(opts);
+
+ return opts;
+}
+
+void
+iscsi_opts_free(struct spdk_iscsi_opts *opts)
+{
+ free(opts->authfile);
+ free(opts->nodebase);
+ free(opts);
+}
+
+/* Deep copy of spdk_iscsi_opts */
+struct spdk_iscsi_opts *
+iscsi_opts_copy(struct spdk_iscsi_opts *src)
+{
+ struct spdk_iscsi_opts *dst;
+
+ dst = calloc(1, sizeof(*dst));
+ if (!dst) {
+ SPDK_ERRLOG("calloc() failed for iscsi options\n");
+ return NULL;
+ }
+
+ if (src->authfile) {
+ dst->authfile = strdup(src->authfile);
+ if (!dst->authfile) {
+ free(dst);
+ SPDK_ERRLOG("failed to strdup for auth file %s\n", src->authfile);
+ return NULL;
+ }
+ }
+
+ if (src->nodebase) {
+ dst->nodebase = strdup(src->nodebase);
+ if (!dst->nodebase) {
+ free(dst->authfile);
+ free(dst);
+ SPDK_ERRLOG("failed to strdup for nodebase %s\n", src->nodebase);
+ return NULL;
+ }
+ }
+
+ dst->MaxSessions = src->MaxSessions;
+ dst->MaxConnectionsPerSession = src->MaxConnectionsPerSession;
+ dst->MaxQueueDepth = src->MaxQueueDepth;
+ dst->DefaultTime2Wait = src->DefaultTime2Wait;
+ dst->DefaultTime2Retain = src->DefaultTime2Retain;
+ dst->FirstBurstLength = src->FirstBurstLength;
+ dst->ImmediateData = src->ImmediateData;
+ dst->AllowDuplicateIsid = src->AllowDuplicateIsid;
+ dst->ErrorRecoveryLevel = src->ErrorRecoveryLevel;
+ dst->timeout = src->timeout;
+ dst->nopininterval = src->nopininterval;
+ dst->disable_chap = src->disable_chap;
+ dst->require_chap = src->require_chap;
+ dst->mutual_chap = src->mutual_chap;
+ dst->chap_group = src->chap_group;
+
+ return dst;
+}
+
+static int
+iscsi_read_config_file_params(struct spdk_conf_section *sp,
+ struct spdk_iscsi_opts *opts)
+{
+ const char *val;
+ int MaxSessions;
+ int MaxConnectionsPerSession;
+ int MaxQueueDepth;
+ int DefaultTime2Wait;
+ int DefaultTime2Retain;
+ int FirstBurstLength;
+ int ErrorRecoveryLevel;
+ int timeout;
+ int nopininterval;
+ const char *ag_tag;
+ int ag_tag_i;
+ int i;
+
+ val = spdk_conf_section_get_val(sp, "Comment");
+ if (val != NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val);
+ }
+
+ val = spdk_conf_section_get_val(sp, "AuthFile");
+ if (val != NULL) {
+ opts->authfile = strdup(val);
+ if (!opts->authfile) {
+ SPDK_ERRLOG("strdup() failed for AuthFile\n");
+ return -ENOMEM;
+ }
+ }
+
+ val = spdk_conf_section_get_val(sp, "NodeBase");
+ if (val != NULL) {
+ opts->nodebase = strdup(val);
+ if (!opts->nodebase) {
+ free(opts->authfile);
+ SPDK_ERRLOG("strdup() failed for NodeBase\n");
+ return -ENOMEM;
+ }
+ }
+
+ MaxSessions = spdk_conf_section_get_intval(sp, "MaxSessions");
+ if (MaxSessions >= 0) {
+ opts->MaxSessions = MaxSessions;
+ }
+
+ MaxConnectionsPerSession = spdk_conf_section_get_intval(sp, "MaxConnectionsPerSession");
+ if (MaxConnectionsPerSession >= 0) {
+ opts->MaxConnectionsPerSession = MaxConnectionsPerSession;
+ }
+
+ MaxQueueDepth = spdk_conf_section_get_intval(sp, "MaxQueueDepth");
+ if (MaxQueueDepth >= 0) {
+ opts->MaxQueueDepth = MaxQueueDepth;
+ }
+
+ DefaultTime2Wait = spdk_conf_section_get_intval(sp, "DefaultTime2Wait");
+ if (DefaultTime2Wait >= 0) {
+ opts->DefaultTime2Wait = DefaultTime2Wait;
+ }
+
+ DefaultTime2Retain = spdk_conf_section_get_intval(sp, "DefaultTime2Retain");
+ if (DefaultTime2Retain >= 0) {
+ opts->DefaultTime2Retain = DefaultTime2Retain;
+ }
+
+ FirstBurstLength = spdk_conf_section_get_intval(sp, "FirstBurstLength");
+ if (FirstBurstLength >= 0) {
+ opts->FirstBurstLength = FirstBurstLength;
+ }
+
+ opts->ImmediateData = spdk_conf_section_get_boolval(sp, "ImmediateData",
+ opts->ImmediateData);
+
+	/* This option is for testing only.
+	 * If AllowDuplicateIsid is enabled, different connections carrying
+	 * TSIH=0 are allowed to log in to the target within the same session.
+	 */
+ opts->AllowDuplicateIsid = spdk_conf_section_get_boolval(sp, "AllowDuplicateIsid",
+ opts->AllowDuplicateIsid);
+
+ ErrorRecoveryLevel = spdk_conf_section_get_intval(sp, "ErrorRecoveryLevel");
+ if (ErrorRecoveryLevel >= 0) {
+ opts->ErrorRecoveryLevel = ErrorRecoveryLevel;
+ }
+ timeout = spdk_conf_section_get_intval(sp, "Timeout");
+ if (timeout >= 0) {
+ opts->timeout = timeout;
+ }
+ nopininterval = spdk_conf_section_get_intval(sp, "NopInInterval");
+ if (nopininterval >= 0) {
+ opts->nopininterval = nopininterval;
+ }
+ val = spdk_conf_section_get_val(sp, "DiscoveryAuthMethod");
+ if (val != NULL) {
+ for (i = 0; ; i++) {
+ val = spdk_conf_section_get_nmval(sp, "DiscoveryAuthMethod", 0, i);
+ if (val == NULL) {
+ break;
+ }
+ if (strcasecmp(val, "CHAP") == 0) {
+ opts->require_chap = true;
+ } else if (strcasecmp(val, "Mutual") == 0) {
+ opts->require_chap = true;
+ opts->mutual_chap = true;
+ } else if (strcasecmp(val, "Auto") == 0) {
+ opts->disable_chap = false;
+ opts->require_chap = false;
+ opts->mutual_chap = false;
+ } else if (strcasecmp(val, "None") == 0) {
+ opts->disable_chap = true;
+ opts->require_chap = false;
+ opts->mutual_chap = false;
+ } else {
+ SPDK_ERRLOG("unknown CHAP mode %s\n", val);
+ }
+ }
+ if (opts->mutual_chap && !opts->require_chap) {
+ free(opts->authfile);
+ free(opts->nodebase);
+			SPDK_ERRLOG("CHAP must be set to required when using mutual CHAP.\n");
+ return -EINVAL;
+ }
+ }
+ val = spdk_conf_section_get_val(sp, "DiscoveryAuthGroup");
+ if (val != NULL) {
+ ag_tag = val;
+ if (strcasecmp(ag_tag, "None") == 0) {
+ opts->chap_group = 0;
+ } else {
+ if (strncasecmp(ag_tag, "AuthGroup",
+ strlen("AuthGroup")) != 0
+ || sscanf(ag_tag, "%*[^0-9]%d", &ag_tag_i) != 1
+ || ag_tag_i == 0) {
+ SPDK_ERRLOG("invalid auth group %s, ignoring\n", ag_tag);
+ } else {
+ opts->chap_group = ag_tag_i;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+iscsi_opts_verify(struct spdk_iscsi_opts *opts)
+{
+ if (!opts->nodebase) {
+ opts->nodebase = strdup(SPDK_ISCSI_DEFAULT_NODEBASE);
+ if (opts->nodebase == NULL) {
+ SPDK_ERRLOG("strdup() failed for default nodebase\n");
+ return -ENOMEM;
+ }
+ }
+
+ if (opts->MaxSessions == 0 || opts->MaxSessions > 65535) {
+ SPDK_ERRLOG("%d is invalid. MaxSessions must be more than 0 and no more than 65535\n",
+ opts->MaxSessions);
+ return -EINVAL;
+ }
+
+ if (opts->MaxConnectionsPerSession == 0 || opts->MaxConnectionsPerSession > 65535) {
+ SPDK_ERRLOG("%d is invalid. MaxConnectionsPerSession must be more than 0 and no more than 65535\n",
+ opts->MaxConnectionsPerSession);
+ return -EINVAL;
+ }
+
+ if (opts->MaxQueueDepth == 0 || opts->MaxQueueDepth > 256) {
+ SPDK_ERRLOG("%d is invalid. MaxQueueDepth must be more than 0 and no more than 256\n",
+ opts->MaxQueueDepth);
+ return -EINVAL;
+ }
+
+ if (opts->DefaultTime2Wait > 3600) {
+ SPDK_ERRLOG("%d is invalid. DefaultTime2Wait must be no more than 3600\n",
+ opts->DefaultTime2Wait);
+ return -EINVAL;
+ }
+
+ if (opts->DefaultTime2Retain > 3600) {
+ SPDK_ERRLOG("%d is invalid. DefaultTime2Retain must be no more than 3600\n",
+ opts->DefaultTime2Retain);
+ return -EINVAL;
+ }
+
+ if (opts->FirstBurstLength >= SPDK_ISCSI_MIN_FIRST_BURST_LENGTH) {
+ if (opts->FirstBurstLength > SPDK_ISCSI_MAX_BURST_LENGTH) {
+ SPDK_ERRLOG("FirstBurstLength %d shall not exceed MaxBurstLength %d\n",
+ opts->FirstBurstLength, SPDK_ISCSI_MAX_BURST_LENGTH);
+ return -EINVAL;
+ }
+ } else {
+ SPDK_ERRLOG("FirstBurstLength %d shall be no less than %d\n",
+ opts->FirstBurstLength, SPDK_ISCSI_MIN_FIRST_BURST_LENGTH);
+ return -EINVAL;
+ }
+
+ if (opts->ErrorRecoveryLevel > 2) {
+ SPDK_ERRLOG("ErrorRecoveryLevel %d is not supported.\n", opts->ErrorRecoveryLevel);
+ return -EINVAL;
+ }
+
+ if (opts->timeout < 0) {
+ SPDK_ERRLOG("%d is invalid. timeout must not be less than 0\n", opts->timeout);
+ return -EINVAL;
+ }
+
+ if (opts->nopininterval < 0 || opts->nopininterval > MAX_NOPININTERVAL) {
+		SPDK_ERRLOG("%d is invalid. nopininterval must be between 0 and %d\n",
+ opts->nopininterval, MAX_NOPININTERVAL);
+ return -EINVAL;
+ }
+
+ if (!iscsi_check_chap_params(opts->disable_chap, opts->require_chap,
+ opts->mutual_chap, opts->chap_group)) {
+		SPDK_ERRLOG("CHAP params in opts are an illegal combination\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+iscsi_parse_options(struct spdk_iscsi_opts **popts)
+{
+ struct spdk_iscsi_opts *opts;
+ struct spdk_conf_section *sp;
+ int rc;
+
+ opts = iscsi_opts_alloc();
+ if (!opts) {
+		SPDK_ERRLOG("iscsi_opts_alloc() failed\n");
+ return -ENOMEM;
+ }
+
+ /* Process parameters */
+	SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_read_config_file_params\n");
+ sp = spdk_conf_find_section(NULL, "iSCSI");
+ if (sp != NULL) {
+ rc = iscsi_read_config_file_params(sp, opts);
+ if (rc != 0) {
+ free(opts);
+ SPDK_ERRLOG("iscsi_read_config_file_params() failed\n");
+ return rc;
+ }
+ }
+
+ *popts = opts;
+
+ return 0;
+}
+
+static int
+iscsi_set_global_params(struct spdk_iscsi_opts *opts)
+{
+ int rc;
+
+ rc = iscsi_opts_verify(opts);
+ if (rc != 0) {
+		SPDK_ERRLOG("iscsi_opts_verify() failed\n");
+ return rc;
+ }
+
+ if (opts->authfile != NULL) {
+ g_iscsi.authfile = strdup(opts->authfile);
+ if (!g_iscsi.authfile) {
+ SPDK_ERRLOG("failed to strdup for auth file %s\n", opts->authfile);
+ return -ENOMEM;
+ }
+ }
+
+ g_iscsi.nodebase = strdup(opts->nodebase);
+ if (!g_iscsi.nodebase) {
+ SPDK_ERRLOG("failed to strdup for nodebase %s\n", opts->nodebase);
+ return -ENOMEM;
+ }
+
+ g_iscsi.MaxSessions = opts->MaxSessions;
+ g_iscsi.MaxConnectionsPerSession = opts->MaxConnectionsPerSession;
+ g_iscsi.MaxQueueDepth = opts->MaxQueueDepth;
+ g_iscsi.DefaultTime2Wait = opts->DefaultTime2Wait;
+ g_iscsi.DefaultTime2Retain = opts->DefaultTime2Retain;
+ g_iscsi.FirstBurstLength = opts->FirstBurstLength;
+ g_iscsi.ImmediateData = opts->ImmediateData;
+ g_iscsi.AllowDuplicateIsid = opts->AllowDuplicateIsid;
+ g_iscsi.ErrorRecoveryLevel = opts->ErrorRecoveryLevel;
+ g_iscsi.timeout = opts->timeout;
+ g_iscsi.nopininterval = opts->nopininterval;
+ g_iscsi.disable_chap = opts->disable_chap;
+ g_iscsi.require_chap = opts->require_chap;
+ g_iscsi.mutual_chap = opts->mutual_chap;
+ g_iscsi.chap_group = opts->chap_group;
+
+ iscsi_log_globals();
+
+ return 0;
+}
+
+int
+iscsi_set_discovery_auth(bool disable_chap, bool require_chap, bool mutual_chap,
+ int32_t chap_group)
+{
+ if (!iscsi_check_chap_params(disable_chap, require_chap, mutual_chap,
+ chap_group)) {
+		SPDK_ERRLOG("CHAP params are an illegal combination\n");
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ g_iscsi.disable_chap = disable_chap;
+ g_iscsi.require_chap = require_chap;
+ g_iscsi.mutual_chap = mutual_chap;
+ g_iscsi.chap_group = chap_group;
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ return 0;
+}
+
+int
+iscsi_auth_group_add_secret(struct spdk_iscsi_auth_group *group,
+ const char *user, const char *secret,
+ const char *muser, const char *msecret)
+{
+ struct spdk_iscsi_auth_secret *_secret;
+ size_t len;
+
+ if (user == NULL || secret == NULL) {
+ SPDK_ERRLOG("user and secret must be specified\n");
+ return -EINVAL;
+ }
+
+ if (muser != NULL && msecret == NULL) {
+ SPDK_ERRLOG("msecret must be specified with muser\n");
+ return -EINVAL;
+ }
+
+ TAILQ_FOREACH(_secret, &group->secret_head, tailq) {
+ if (strcmp(_secret->user, user) == 0) {
+ SPDK_ERRLOG("user for secret is duplicated\n");
+ return -EEXIST;
+ }
+ }
+
+ _secret = calloc(1, sizeof(*_secret));
+ if (_secret == NULL) {
+ SPDK_ERRLOG("calloc() failed for CHAP secret\n");
+ return -ENOMEM;
+ }
+
+ len = strnlen(user, sizeof(_secret->user));
+ if (len > sizeof(_secret->user) - 1) {
+ SPDK_ERRLOG("CHAP user longer than %zu characters: %s\n",
+ sizeof(_secret->user) - 1, user);
+ free(_secret);
+ return -EINVAL;
+ }
+ memcpy(_secret->user, user, len);
+
+ len = strnlen(secret, sizeof(_secret->secret));
+ if (len > sizeof(_secret->secret) - 1) {
+ SPDK_ERRLOG("CHAP secret longer than %zu characters: %s\n",
+ sizeof(_secret->secret) - 1, secret);
+ free(_secret);
+ return -EINVAL;
+ }
+ memcpy(_secret->secret, secret, len);
+
+ if (muser != NULL) {
+ len = strnlen(muser, sizeof(_secret->muser));
+ if (len > sizeof(_secret->muser) - 1) {
+ SPDK_ERRLOG("Mutual CHAP user longer than %zu characters: %s\n",
+ sizeof(_secret->muser) - 1, muser);
+ free(_secret);
+ return -EINVAL;
+ }
+ memcpy(_secret->muser, muser, len);
+
+ len = strnlen(msecret, sizeof(_secret->msecret));
+ if (len > sizeof(_secret->msecret) - 1) {
+ SPDK_ERRLOG("Mutual CHAP secret longer than %zu characters: %s\n",
+ sizeof(_secret->msecret) - 1, msecret);
+ free(_secret);
+ return -EINVAL;
+ }
+ memcpy(_secret->msecret, msecret, len);
+ }
+
+ TAILQ_INSERT_TAIL(&group->secret_head, _secret, tailq);
+ return 0;
+}
+
+int
+iscsi_auth_group_delete_secret(struct spdk_iscsi_auth_group *group,
+ const char *user)
+{
+ struct spdk_iscsi_auth_secret *_secret;
+
+ if (user == NULL) {
+ SPDK_ERRLOG("user must be specified\n");
+ return -EINVAL;
+ }
+
+ TAILQ_FOREACH(_secret, &group->secret_head, tailq) {
+ if (strcmp(_secret->user, user) == 0) {
+ break;
+ }
+ }
+
+ if (_secret == NULL) {
+ SPDK_ERRLOG("secret is not found\n");
+ return -ENODEV;
+ }
+
+ TAILQ_REMOVE(&group->secret_head, _secret, tailq);
+ free(_secret);
+
+ return 0;
+}
+
+int
+iscsi_add_auth_group(int32_t tag, struct spdk_iscsi_auth_group **_group)
+{
+ struct spdk_iscsi_auth_group *group;
+
+ TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) {
+ if (group->tag == tag) {
+ SPDK_ERRLOG("Auth group (%d) already exists\n", tag);
+ return -EEXIST;
+ }
+ }
+
+ group = calloc(1, sizeof(*group));
+ if (group == NULL) {
+ SPDK_ERRLOG("calloc() failed for auth group\n");
+ return -ENOMEM;
+ }
+
+ TAILQ_INIT(&group->secret_head);
+ group->tag = tag;
+
+ TAILQ_INSERT_TAIL(&g_iscsi.auth_group_head, group, tailq);
+
+ *_group = group;
+ return 0;
+}
+
+void
+iscsi_delete_auth_group(struct spdk_iscsi_auth_group *group)
+{
+ struct spdk_iscsi_auth_secret *_secret, *tmp;
+
+ TAILQ_REMOVE(&g_iscsi.auth_group_head, group, tailq);
+
+ TAILQ_FOREACH_SAFE(_secret, &group->secret_head, tailq, tmp) {
+ TAILQ_REMOVE(&group->secret_head, _secret, tailq);
+ free(_secret);
+ }
+ free(group);
+}
+
+struct spdk_iscsi_auth_group *
+iscsi_find_auth_group_by_tag(int32_t tag)
+{
+ struct spdk_iscsi_auth_group *group;
+
+ TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) {
+ if (group->tag == tag) {
+ return group;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+iscsi_auth_groups_destroy(void)
+{
+ struct spdk_iscsi_auth_group *group, *tmp;
+
+ TAILQ_FOREACH_SAFE(group, &g_iscsi.auth_group_head, tailq, tmp) {
+ iscsi_delete_auth_group(group);
+ }
+}
+
+static int
+iscsi_parse_auth_group(struct spdk_conf_section *sp)
+{
+ int rc;
+ int i;
+ int tag;
+ const char *val, *user, *secret, *muser, *msecret;
+ struct spdk_iscsi_auth_group *group = NULL;
+
+ val = spdk_conf_section_get_val(sp, "Comment");
+ if (val != NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val);
+ }
+
+ tag = spdk_conf_section_get_num(sp);
+
+ rc = iscsi_add_auth_group(tag, &group);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to add auth group\n");
+ return rc;
+ }
+
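+ /* Each "Auth" entry in an [AuthGroupN] section supplies up to four values,
+ * e.g. (illustrative):
+ * Auth "user1" "secret1" "muser1" "msecret1"
+ * where the mutual CHAP pair (muser/msecret) is optional.
+ */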
+ for (i = 0; ; i++) {
+ val = spdk_conf_section_get_nval(sp, "Auth", i);
+ if (val == NULL) {
+ break;
+ }
+
+ user = spdk_conf_section_get_nmval(sp, "Auth", i, 0);
+ secret = spdk_conf_section_get_nmval(sp, "Auth", i, 1);
+ muser = spdk_conf_section_get_nmval(sp, "Auth", i, 2);
+ msecret = spdk_conf_section_get_nmval(sp, "Auth", i, 3);
+
+ rc = iscsi_auth_group_add_secret(group, user, secret, muser, msecret);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to add secret to auth group\n");
+ iscsi_delete_auth_group(group);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int
+iscsi_parse_auth_info(void)
+{
+ struct spdk_conf *config;
+ struct spdk_conf_section *sp;
+ int rc;
+
+ config = spdk_conf_allocate();
+ if (!config) {
+ SPDK_ERRLOG("Failed to allocate config file\n");
+ return -ENOMEM;
+ }
+
+ rc = spdk_conf_read(config, g_iscsi.authfile);
+ if (rc != 0) {
+ SPDK_INFOLOG(SPDK_LOG_ISCSI, "Failed to load auth file\n");
+ spdk_conf_free(config);
+ return rc;
+ }
+
+ sp = spdk_conf_first_section(config);
+ while (sp != NULL) {
+ if (spdk_conf_section_match_prefix(sp, "AuthGroup")) {
+ if (spdk_conf_section_get_num(sp) == 0) {
+ SPDK_ERRLOG("Group 0 is invalid\n");
+ iscsi_auth_groups_destroy();
+ spdk_conf_free(config);
+ return -EINVAL;
+ }
+
+ rc = iscsi_parse_auth_group(sp);
+ if (rc != 0) {
+ SPDK_ERRLOG("parse_auth_group() failed\n");
+ iscsi_auth_groups_destroy();
+ spdk_conf_free(config);
+ return rc;
+ }
+ }
+ sp = spdk_conf_next_section(sp);
+ }
+
+ spdk_conf_free(config);
+ return 0;
+}
+
+static struct spdk_iscsi_auth_secret *
+iscsi_find_auth_secret(const char *authuser, int ag_tag)
+{
+ struct spdk_iscsi_auth_group *group;
+ struct spdk_iscsi_auth_secret *_secret;
+
+ TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) {
+ if (group->tag == ag_tag) {
+ TAILQ_FOREACH(_secret, &group->secret_head, tailq) {
+ if (strcmp(_secret->user, authuser) == 0) {
+ return _secret;
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+int
+iscsi_chap_get_authinfo(struct iscsi_chap_auth *auth, const char *authuser,
+ int ag_tag)
+{
+ struct spdk_iscsi_auth_secret *_secret;
+
+ if (authuser == NULL) {
+ return -EINVAL;
+ }
+
+ if (auth->user[0] != '\0') {
+ memset(auth->user, 0, sizeof(auth->user));
+ memset(auth->secret, 0, sizeof(auth->secret));
+ memset(auth->muser, 0, sizeof(auth->muser));
+ memset(auth->msecret, 0, sizeof(auth->msecret));
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ _secret = iscsi_find_auth_secret(authuser, ag_tag);
+ if (_secret == NULL) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ SPDK_ERRLOG("CHAP secret is not found: user:%s, tag:%d\n",
+ authuser, ag_tag);
+ return -ENOENT;
+ }
+
+ memcpy(auth->user, _secret->user, sizeof(auth->user));
+ memcpy(auth->secret, _secret->secret, sizeof(auth->secret));
+
+ if (_secret->muser[0] != '\0') {
+ memcpy(auth->muser, _secret->muser, sizeof(auth->muser));
+ memcpy(auth->msecret, _secret->msecret, sizeof(auth->msecret));
+ }
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return 0;
+}
+
+static int
+iscsi_initialize_global_params(void)
+{
+ int rc;
+
+ if (!g_spdk_iscsi_opts) {
+ rc = iscsi_parse_options(&g_spdk_iscsi_opts);
+ if (rc != 0) {
+ SPDK_ERRLOG("iscsi_parse_options() failed\n");
+ return rc;
+ }
+ }
+
+ rc = iscsi_set_global_params(g_spdk_iscsi_opts);
+ if (rc != 0) {
+ SPDK_ERRLOG("iscsi_set_global_params() failed\n");
+ }
+
+ iscsi_opts_free(g_spdk_iscsi_opts);
+ g_spdk_iscsi_opts = NULL;
+
+ return rc;
+}
+
+static void
+iscsi_init_complete(int rc)
+{
+ spdk_iscsi_init_cb cb_fn = g_init_cb_fn;
+ void *cb_arg = g_init_cb_arg;
+
+ g_init_cb_fn = NULL;
+ g_init_cb_arg = NULL;
+
+ cb_fn(cb_arg, rc);
+}
+
+static void
+iscsi_parse_configuration(void)
+{
+ int rc;
+
+ rc = iscsi_parse_portal_grps();
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_parse_portal_grps() failed\n");
+ goto end;
+ }
+
+ rc = iscsi_parse_init_grps();
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_parse_init_grps() failed\n");
+ goto end;
+ }
+
+ rc = iscsi_parse_tgt_nodes();
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_parse_tgt_nodes() failed\n");
+ }
+
+ if (g_iscsi.authfile != NULL) {
+ if (access(g_iscsi.authfile, R_OK) == 0) {
+ rc = iscsi_parse_auth_info();
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_parse_auth_info() failed\n");
+ }
+ } else {
+ SPDK_INFOLOG(SPDK_LOG_ISCSI, "CHAP secret file is not found in the path %s\n",
+ g_iscsi.authfile);
+ }
+ }
+
+end:
+ iscsi_init_complete(rc);
+}
+
+static int
+iscsi_poll_group_poll(void *ctx)
+{
+ struct spdk_iscsi_poll_group *group = ctx;
+ struct spdk_iscsi_conn *conn, *tmp;
+ int rc;
+
+ if (spdk_unlikely(STAILQ_EMPTY(&group->connections))) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ rc = spdk_sock_group_poll(group->sock_group);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to poll sock_group=%p\n", group->sock_group);
+ }
+
+ STAILQ_FOREACH_SAFE(conn, &group->connections, pg_link, tmp) {
+ if (conn->state == ISCSI_CONN_STATE_EXITING) {
+ iscsi_conn_destruct(conn);
+ }
+ }
+
+ return rc != 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
+}
+
+static int
+iscsi_poll_group_handle_nop(void *ctx)
+{
+ struct spdk_iscsi_poll_group *group = ctx;
+ struct spdk_iscsi_conn *conn, *tmp;
+
+ STAILQ_FOREACH_SAFE(conn, &group->connections, pg_link, tmp) {
+ iscsi_conn_handle_nop(conn);
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+iscsi_poll_group_create(void *io_device, void *ctx_buf)
+{
+ struct spdk_iscsi_poll_group *pg = ctx_buf;
+
+ STAILQ_INIT(&pg->connections);
+ pg->sock_group = spdk_sock_group_create(NULL);
+ assert(pg->sock_group != NULL);
+
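+ /* Poller periods are in microseconds: a period of 0 runs iscsi_poll_group_poll
+ * on every reactor iteration, while the NOP poller below fires once per second.
+ */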
+ pg->poller = SPDK_POLLER_REGISTER(iscsi_poll_group_poll, pg, 0);
+ /* set the period to 1 sec */
+ pg->nop_poller = SPDK_POLLER_REGISTER(iscsi_poll_group_handle_nop, pg, 1000000);
+
+ return 0;
+}
+
+static void
+iscsi_poll_group_destroy(void *io_device, void *ctx_buf)
+{
+ struct spdk_iscsi_poll_group *pg = ctx_buf;
+ struct spdk_io_channel *ch;
+ struct spdk_thread *thread;
+
+ assert(pg->poller != NULL);
+ assert(pg->sock_group != NULL);
+
+ spdk_sock_group_close(&pg->sock_group);
+ spdk_poller_unregister(&pg->poller);
+ spdk_poller_unregister(&pg->nop_poller);
+
+ ch = spdk_io_channel_from_ctx(pg);
+ thread = spdk_io_channel_get_thread(ch);
+
+ assert(thread == spdk_get_thread());
+
+ spdk_thread_exit(thread);
+}
+
+static void
+_iscsi_init_thread_done(void *ctx)
+{
+ struct spdk_iscsi_poll_group *pg = ctx;
+
+ TAILQ_INSERT_TAIL(&g_iscsi.poll_group_head, pg, link);
+ if (--g_iscsi.refcnt == 0) {
+ iscsi_parse_configuration();
+ }
+}
+
+static void
+_iscsi_init_thread(void *ctx)
+{
+ struct spdk_io_channel *ch;
+ struct spdk_iscsi_poll_group *pg;
+
+ ch = spdk_get_io_channel(&g_iscsi);
+ pg = spdk_io_channel_get_ctx(ch);
+
+ spdk_thread_send_msg(g_init_thread, _iscsi_init_thread_done, pg);
+}
+
+static void
+initialize_iscsi_poll_group(void)
+{
+ struct spdk_cpuset tmp_cpumask = {};
+ uint32_t i;
+ char thread_name[32];
+ struct spdk_thread *thread;
+
+ spdk_io_device_register(&g_iscsi, iscsi_poll_group_create, iscsi_poll_group_destroy,
+ sizeof(struct spdk_iscsi_poll_group), "iscsi_tgt");
+
+ /* Create threads for CPU cores active for this application, and send a
+ * message to each thread to create a poll group on it.
+ */
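+ /* g_iscsi.refcnt counts the poll groups still being created; the last
+ * _iscsi_init_thread_done() message drops it back to zero, which then
+ * triggers iscsi_parse_configuration().
+ */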
+ g_init_thread = spdk_get_thread();
+ assert(g_init_thread != NULL);
+ assert(g_iscsi.refcnt == 0);
+
+ SPDK_ENV_FOREACH_CORE(i) {
+ spdk_cpuset_zero(&tmp_cpumask);
+ spdk_cpuset_set_cpu(&tmp_cpumask, i, true);
+ snprintf(thread_name, sizeof(thread_name), "iscsi_poll_group_%u", i);
+
+ thread = spdk_thread_create(thread_name, &tmp_cpumask);
+ assert(thread != NULL);
+
+ g_iscsi.refcnt++;
+ spdk_thread_send_msg(thread, _iscsi_init_thread, NULL);
+ }
+}
+
+static int
+iscsi_parse_globals(void)
+{
+ int rc;
+
+ rc = iscsi_initialize_global_params();
+ if (rc != 0) {
+ SPDK_ERRLOG("iscsi_initialize_iscsi_global_params() failed\n");
+ return rc;
+ }
+
+ g_iscsi.session = calloc(1, sizeof(struct spdk_iscsi_sess *) * g_iscsi.MaxSessions);
+ if (!g_iscsi.session) {
+ SPDK_ERRLOG("calloc() failed for session array\n");
+ return -1;
+ }
+
+ /*
+ * For now, just support same number of total connections, rather
+ * than MaxSessions * MaxConnectionsPerSession. After we add better
+ * handling for low resource conditions from our various buffer
+ * pools, we can bump this up to support more connections.
+ */
+ g_iscsi.MaxConnections = g_iscsi.MaxSessions;
+
+ rc = iscsi_initialize_all_pools();
+ if (rc != 0) {
+ SPDK_ERRLOG("initialize_all_pools() failed\n");
+ free(g_iscsi.session);
+ g_iscsi.session = NULL;
+ return -1;
+ }
+
+ rc = initialize_iscsi_conns();
+ if (rc < 0) {
+ SPDK_ERRLOG("initialize_iscsi_conns() failed\n");
+ free(g_iscsi.session);
+ g_iscsi.session = NULL;
+ return rc;
+ }
+
+ initialize_iscsi_poll_group();
+ return 0;
+}
+
+void
+spdk_iscsi_init(spdk_iscsi_init_cb cb_fn, void *cb_arg)
+{
+ int rc;
+
+ assert(cb_fn != NULL);
+ g_init_cb_fn = cb_fn;
+ g_init_cb_arg = cb_arg;
+
+ rc = iscsi_parse_globals();
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_parse_globals() failed\n");
+ iscsi_init_complete(-1);
+ }
+
+ /*
+ * iscsi_parse_configuration() is invoked once initialize_iscsi_poll_group()
+ * has created a poll group on every core, and it completes iSCSI
+ * subsystem initialization.
+ */
+}
+
+void
+spdk_iscsi_fini(spdk_iscsi_fini_cb cb_fn, void *cb_arg)
+{
+ g_fini_cb_fn = cb_fn;
+ g_fini_cb_arg = cb_arg;
+
+ iscsi_portal_grp_close_all();
+ shutdown_iscsi_conns();
+}
+
+static void
+iscsi_fini_done(void *io_device)
+{
+ free(g_iscsi.authfile);
+ free(g_iscsi.nodebase);
+
+ pthread_mutex_destroy(&g_iscsi.mutex);
+ g_fini_cb_fn(g_fini_cb_arg);
+}
+
+static void
+_iscsi_fini_dev_unreg(struct spdk_io_channel_iter *i, int status)
+{
+ iscsi_check_pools();
+ iscsi_free_pools();
+ free(g_iscsi.session);
+
+ assert(TAILQ_EMPTY(&g_iscsi.poll_group_head));
+
+ iscsi_shutdown_tgt_nodes();
+ iscsi_init_grps_destroy();
+ iscsi_portal_grps_destroy();
+ iscsi_auth_groups_destroy();
+
+ spdk_io_device_unregister(&g_iscsi, iscsi_fini_done);
+}
+
+static void
+_iscsi_fini_thread(struct spdk_io_channel_iter *i)
+{
+ struct spdk_io_channel *ch;
+ struct spdk_iscsi_poll_group *pg;
+
+ ch = spdk_io_channel_iter_get_channel(i);
+ pg = spdk_io_channel_get_ctx(ch);
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ TAILQ_REMOVE(&g_iscsi.poll_group_head, pg, link);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ spdk_put_io_channel(ch);
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+void
+shutdown_iscsi_conns_done(void)
+{
+ spdk_for_each_channel(&g_iscsi, _iscsi_fini_thread, NULL, _iscsi_fini_dev_unreg);
+}
+
+void
+spdk_iscsi_config_text(FILE *fp)
+{
+ iscsi_globals_config_text(fp);
+ iscsi_portal_grps_config_text(fp);
+ iscsi_init_grps_config_text(fp);
+ iscsi_tgt_nodes_config_text(fp);
+}
+
+void
+iscsi_opts_info_json(struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_object_begin(w);
+
+ if (g_iscsi.authfile != NULL) {
+ spdk_json_write_named_string(w, "auth_file", g_iscsi.authfile);
+ }
+ spdk_json_write_named_string(w, "node_base", g_iscsi.nodebase);
+
+ spdk_json_write_named_uint32(w, "max_sessions", g_iscsi.MaxSessions);
+ spdk_json_write_named_uint32(w, "max_connections_per_session",
+ g_iscsi.MaxConnectionsPerSession);
+
+ spdk_json_write_named_uint32(w, "max_queue_depth", g_iscsi.MaxQueueDepth);
+
+ spdk_json_write_named_uint32(w, "default_time2wait", g_iscsi.DefaultTime2Wait);
+ spdk_json_write_named_uint32(w, "default_time2retain", g_iscsi.DefaultTime2Retain);
+
+ spdk_json_write_named_uint32(w, "first_burst_length", g_iscsi.FirstBurstLength);
+
+ spdk_json_write_named_bool(w, "immediate_data", g_iscsi.ImmediateData);
+
+ spdk_json_write_named_bool(w, "allow_duplicated_isid", g_iscsi.AllowDuplicateIsid);
+
+ spdk_json_write_named_uint32(w, "error_recovery_level", g_iscsi.ErrorRecoveryLevel);
+
+ spdk_json_write_named_int32(w, "nop_timeout", g_iscsi.timeout);
+ spdk_json_write_named_int32(w, "nop_in_interval", g_iscsi.nopininterval);
+
+ spdk_json_write_named_bool(w, "disable_chap", g_iscsi.disable_chap);
+ spdk_json_write_named_bool(w, "require_chap", g_iscsi.require_chap);
+ spdk_json_write_named_bool(w, "mutual_chap", g_iscsi.mutual_chap);
+ spdk_json_write_named_int32(w, "chap_group", g_iscsi.chap_group);
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+iscsi_auth_group_info_json(struct spdk_iscsi_auth_group *group,
+ struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_auth_secret *_secret;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_int32(w, "tag", group->tag);
+
+ spdk_json_write_named_array_begin(w, "secrets");
+ TAILQ_FOREACH(_secret, &group->secret_head, tailq) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "user", _secret->user);
+ spdk_json_write_named_string(w, "secret", _secret->secret);
+
+ if (_secret->muser[0] != '\0') {
+ spdk_json_write_named_string(w, "muser", _secret->muser);
+ spdk_json_write_named_string(w, "msecret", _secret->msecret);
+ }
+
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+iscsi_auth_group_config_json(struct spdk_iscsi_auth_group *group,
+ struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "iscsi_create_auth_group");
+
+ spdk_json_write_name(w, "params");
+ iscsi_auth_group_info_json(group, w);
+
+ spdk_json_write_object_end(w);
+}
+
+void
+iscsi_auth_groups_info_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_auth_group *group;
+
+ TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) {
+ iscsi_auth_group_info_json(group, w);
+ }
+}
+
+static void
+iscsi_auth_groups_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_auth_group *group;
+
+ TAILQ_FOREACH(group, &g_iscsi.auth_group_head, tailq) {
+ iscsi_auth_group_config_json(group, w);
+ }
+}
+
+static void
+iscsi_opts_config_json(struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "iscsi_set_options");
+
+ spdk_json_write_name(w, "params");
+ iscsi_opts_info_json(w);
+
+ spdk_json_write_object_end(w);
+}
+
+void
+spdk_iscsi_config_json(struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_array_begin(w);
+ iscsi_opts_config_json(w);
+ iscsi_portal_grps_config_json(w);
+ iscsi_init_grps_config_json(w);
+ iscsi_tgt_nodes_config_json(w);
+ iscsi_auth_groups_config_json(w);
+ spdk_json_write_array_end(w);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("iscsi", SPDK_LOG_ISCSI)
diff --git a/src/spdk/lib/iscsi/md5.c b/src/spdk/lib/iscsi/md5.c
new file mode 100644
index 000000000..c316ac354
--- /dev/null
+++ b/src/spdk/lib/iscsi/md5.c
@@ -0,0 +1,75 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include <openssl/md5.h>
+
+#include "iscsi/md5.h"
+
+int md5init(struct spdk_md5ctx *md5ctx)
+{
+ int rc;
+
+ if (md5ctx == NULL) {
+ return -1;
+ }
+ rc = MD5_Init(&md5ctx->md5ctx);
+ return rc;
+}
+
+int md5final(void *md5, struct spdk_md5ctx *md5ctx)
+{
+ int rc;
+
+ if (md5ctx == NULL || md5 == NULL) {
+ return -1;
+ }
+ rc = MD5_Final(md5, &md5ctx->md5ctx);
+ return rc;
+}
+
+int md5update(struct spdk_md5ctx *md5ctx, const void *data, size_t len)
+{
+ int rc;
+
+ if (md5ctx == NULL) {
+ return -1;
+ }
+ if (data == NULL || len == 0) {
+ return 0;
+ }
+ rc = MD5_Update(&md5ctx->md5ctx, data, len);
+ return rc;
+}
diff --git a/src/spdk/lib/iscsi/md5.h b/src/spdk/lib/iscsi/md5.h
new file mode 100644
index 000000000..d6fc4c1ff
--- /dev/null
+++ b/src/spdk/lib/iscsi/md5.h
@@ -0,0 +1,52 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_MD5_H
+#define SPDK_MD5_H
+
+#include "spdk/stdinc.h"
+
+#include <openssl/md5.h>
+
+#define SPDK_MD5DIGEST_LEN MD5_DIGEST_LENGTH
+
+struct spdk_md5ctx {
+ MD5_CTX md5ctx;
+};
+
+int md5init(struct spdk_md5ctx *md5ctx);
+int md5final(void *md5, struct spdk_md5ctx *md5ctx);
+int md5update(struct spdk_md5ctx *md5ctx, const void *data, size_t len);
+
+#endif /* SPDK_MD5_H */
diff --git a/src/spdk/lib/iscsi/param.c b/src/spdk/lib/iscsi/param.c
new file mode 100644
index 000000000..18f579359
--- /dev/null
+++ b/src/spdk/lib/iscsi/param.c
@@ -0,0 +1,1216 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/string.h"
+#include "iscsi/iscsi.h"
+#include "iscsi/param.h"
+#include "iscsi/conn.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+#define MAX_TMPBUF 1024
+
+/* Keys whose values may exceed the 255-byte simple-value limit */
+static const char *non_simple_value_params[] = {
+ "CHAP_C",
+ "CHAP_R",
+ NULL,
+};
+
+void
+iscsi_param_free(struct iscsi_param *params)
+{
+ struct iscsi_param *param, *next_param;
+
+ if (params == NULL) {
+ return;
+ }
+ for (param = params; param != NULL; param = next_param) {
+ next_param = param->next;
+ if (param->list) {
+ free(param->list);
+ }
+ free(param->val);
+ free(param->key);
+ free(param);
+ }
+}
+
+static int
+iscsi_find_key_in_array(const char *key, const char *array[])
+{
+ int i;
+
+ for (i = 0; array[i] != NULL; i++) {
+ if (strcasecmp(key, array[i]) == 0) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+struct iscsi_param *
+iscsi_param_find(struct iscsi_param *params, const char *key)
+{
+ struct iscsi_param *param;
+
+ if (params == NULL || key == NULL) {
+ return NULL;
+ }
+ for (param = params; param != NULL; param = param->next) {
+ if (param->key != NULL && param->key[0] == key[0]
+ && strcasecmp(param->key, key) == 0) {
+ return param;
+ }
+ }
+ return NULL;
+}
+
+int
+iscsi_param_del(struct iscsi_param **params, const char *key)
+{
+ struct iscsi_param *param, *prev_param = NULL;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "del %s\n", key);
+ if (params == NULL || key == NULL) {
+ return 0;
+ }
+ for (param = *params; param != NULL; param = param->next) {
+ if (param->key != NULL && param->key[0] == key[0]
+ && strcasecmp(param->key, key) == 0) {
+ if (prev_param != NULL) {
+ prev_param->next = param->next;
+ } else {
+ *params = param->next;
+ }
+ param->next = NULL;
+ iscsi_param_free(param);
+ return 0;
+ }
+ prev_param = param;
+ }
+ return -1;
+}
+
+int
+iscsi_param_add(struct iscsi_param **params, const char *key,
+ const char *val, const char *list, int type)
+{
+ struct iscsi_param *param, *last_param;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add %s=%s, list=[%s], type=%d\n",
+ key, val, list, type);
+ if (key == NULL) {
+ return -1;
+ }
+
+ param = iscsi_param_find(*params, key);
+ if (param != NULL) {
+ iscsi_param_del(params, key);
+ }
+
+ param = calloc(1, sizeof(*param));
+ if (!param) {
+ SPDK_ERRLOG("calloc() failed for parameter\n");
+ return -ENOMEM;
+ }
+
+ param->next = NULL;
+ param->key = xstrdup(key);
+ param->val = xstrdup(val);
+ param->list = xstrdup(list);
+ param->type = type;
+
+ last_param = *params;
+ if (last_param != NULL) {
+ while (last_param->next != NULL) {
+ last_param = last_param->next;
+ }
+ last_param->next = param;
+ } else {
+ *params = param;
+ }
+
+ return 0;
+}
+
+int
+iscsi_param_set(struct iscsi_param *params, const char *key,
+ const char *val)
+{
+ struct iscsi_param *param;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set %s=%s\n", key, val);
+ param = iscsi_param_find(params, key);
+ if (param == NULL) {
+ SPDK_ERRLOG("no key %s\n", key);
+ return -1;
+ }
+
+ free(param->val);
+
+ param->val = xstrdup(val);
+
+ return 0;
+}
+
+int
+iscsi_param_set_int(struct iscsi_param *params, const char *key, uint32_t val)
+{
+ char buf[MAX_TMPBUF];
+ struct iscsi_param *param;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set %s=%d\n", key, val);
+ param = iscsi_param_find(params, key);
+ if (param == NULL) {
+ SPDK_ERRLOG("no key %s\n", key);
+ return -1;
+ }
+
+ free(param->val);
+ snprintf(buf, sizeof buf, "%d", val);
+
+ param->val = strdup(buf);
+
+ return 0;
+}
+
+/**
+ * Parse a single KEY=VAL pair
+ *
+ * data = "KEY=VAL<NUL>"
+ */
+static int
+iscsi_parse_param(struct iscsi_param **params, const uint8_t *data, uint32_t data_len)
+{
+ int rc;
+ uint8_t *key_copy, *val_copy;
+ const uint8_t *key_end;
+ int key_len, val_len;
+ int max_len;
+
+ data_len = strnlen(data, data_len);
+ /* No such thing as strnchr so use memchr instead. */
+ key_end = memchr(data, '=', data_len);
+ if (!key_end) {
+ SPDK_ERRLOG("'=' not found\n");
+ return -1;
+ }
+
+ key_len = key_end - data;
+ if (key_len == 0) {
+ SPDK_ERRLOG("Empty key\n");
+ return -1;
+ }
+ /*
+ * RFC 7143 6.1
+ */
+ if (key_len > ISCSI_TEXT_MAX_KEY_LEN) {
+ SPDK_ERRLOG("Key name length is bigger than 63\n");
+ return -1;
+ }
+
+ key_copy = malloc(key_len + 1);
+ if (!key_copy) {
+ SPDK_ERRLOG("malloc() failed for key_copy\n");
+ return -ENOMEM;
+ }
+
+ memcpy(key_copy, data, key_len);
+ key_copy[key_len] = '\0';
+ /* check whether this key is duplicated */
+ if (NULL != iscsi_param_find(*params, key_copy)) {
+ SPDK_ERRLOG("Duplicated Key %s\n", key_copy);
+ free(key_copy);
+ return -1;
+ }
+
+ val_len = strnlen(key_end + 1, data_len - key_len - 1);
+ /*
+ * RFC 3720 5.1
+ * If not otherwise specified, the maximum length of a simple-value
+ * (not its encoded representation) is 255 bytes, not including the delimiter
+ * (comma or zero byte).
+ */
+ /*
+ * The comma or zero byte is counted in the limit here; otherwise we would
+ * have to iterate over each individual value in a list.
+ */
+ max_len = iscsi_find_key_in_array(key_copy, non_simple_value_params) ?
+ ISCSI_TEXT_MAX_VAL_LEN : ISCSI_TEXT_MAX_SIMPLE_VAL_LEN;
+ if (val_len > max_len) {
+ SPDK_ERRLOG("Overflow Val %d\n", val_len);
+ free(key_copy);
+ return -1;
+ }
+
+ val_copy = calloc(1, val_len + 1);
+ if (val_copy == NULL) {
+ SPDK_ERRLOG("Could not allocate value string\n");
+ free(key_copy);
+ return -1;
+ }
+
+ memcpy(val_copy, key_end + 1, val_len);
+
+ rc = iscsi_param_add(params, key_copy, val_copy, NULL, 0);
+ free(val_copy);
+ free(key_copy);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_add() failed\n");
+ return -1;
+ }
+
+ /* return number of bytes consumed
+ * +1 for '=' and +1 for NUL
+ */
+ return key_len + 1 + val_len + 1;
+}
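+/*
+ * For example, data = "HeaderDigest=CRC32C\0" yields key "HeaderDigest" and
+ * value "CRC32C", and the function returns 20: key (12) + '=' (1) +
+ * value (6) + NUL (1) bytes consumed.
+ */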
+
+/**
+ * Parse a sequence of KEY=VAL pairs.
+ *
+ * \param data "KEY=VAL<NUL>KEY=VAL<NUL>..."
+ * \param len length of data in bytes
+ */
+int
+iscsi_parse_params(struct iscsi_param **params, const uint8_t *data,
+ int len, bool cbit_enabled, char **partial_parameter)
+{
+ int rc, offset = 0;
+ char *p;
+ int i;
+
+ /* handle the partial text parameter saved when the previous PDU had the C bit enabled */
+ if (partial_parameter && *partial_parameter) {
+ for (i = 0; i < len && data[i] != '\0'; i++) {
+ ;
+ }
+ p = spdk_sprintf_alloc("%s%s", *partial_parameter, (const char *)data);
+ if (!p) {
+ return -1;
+ }
+ rc = iscsi_parse_param(params, p, i + strlen(*partial_parameter));
+ free(p);
+ if (rc < 0) {
+ return -1;
+ }
+ free(*partial_parameter);
+ *partial_parameter = NULL;
+
+ data = data + i + 1;
+ len = len - (i + 1);
+ }
+
+ /* strip and save the trailing partial text parameter if the C bit is enabled */
+ if (cbit_enabled) {
+ if (partial_parameter == NULL) {
+ SPDK_ERRLOG("C bit set but no partial parameters provided\n");
+ return -1;
+ }
+
+ /*
+ * reverse iterate the string from the tail not including '\0'
+ */
+ for (i = len - 1; data[i] != '\0' && i > 0; i--) {
+ ;
+ }
+ if (i != 0) {
+ /* We found a NULL character - don't copy it into the
+ * partial parameter.
+ */
+ i++;
+ }
+
+ *partial_parameter = calloc(1, len - i + 1);
+ if (*partial_parameter == NULL) {
+ SPDK_ERRLOG("could not allocate partial parameter\n");
+ return -1;
+ }
+ memcpy(*partial_parameter, &data[i], len - i);
+ if (i == 0) {
+ /* No full parameters to parse - so return now. */
+ return 0;
+ } else {
+ len = i - 1;
+ }
+ }
+
+ while (offset < len && data[offset] != '\0') {
+ rc = iscsi_parse_param(params, data + offset, len - offset);
+ if (rc < 0) {
+ return -1;
+ }
+ offset += rc;
+ }
+ return 0;
+}
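+/*
+ * For example, a Text request carrying "InitialR2T=No\0ImmediateData=Yes\0"
+ * produces two parameters. When the C bit is set, the trailing fragment after
+ * the last NUL is saved in *partial_parameter and prepended to the data of
+ * the next PDU before parsing resumes.
+ */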
+
+char *
+iscsi_param_get_val(struct iscsi_param *params, const char *key)
+{
+ struct iscsi_param *param;
+
+ param = iscsi_param_find(params, key);
+ if (param == NULL) {
+ return NULL;
+ }
+ return param->val;
+}
+
+int
+iscsi_param_eq_val(struct iscsi_param *params, const char *key,
+ const char *val)
+{
+ struct iscsi_param *param;
+
+ param = iscsi_param_find(params, key);
+ if (param == NULL) {
+ return 0;
+ }
+ if (strcasecmp(param->val, val) == 0) {
+ return 1;
+ }
+ return 0;
+}
+
+struct iscsi_param_table {
+ const char *key;
+ const char *val;
+ const char *list;
+ int type;
+};
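+/*
+ * Each entry holds the key name, its default value, the list of valid values
+ * (an enumerated "A,B" list or a "min,max" range, depending on the type), and
+ * the negotiation type consumed by iscsi_negotiate_param_all().
+ */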
+
+static const struct iscsi_param_table conn_param_table[] = {
+ { "HeaderDigest", "None", "CRC32C,None", ISPT_LIST },
+ { "DataDigest", "None", "CRC32C,None", ISPT_LIST },
+ { "MaxRecvDataSegmentLength", "8192", "512,16777215", ISPT_NUMERICAL_DECLARATIVE },
+ { "OFMarker", "No", "Yes,No", ISPT_BOOLEAN_AND },
+ { "IFMarker", "No", "Yes,No", ISPT_BOOLEAN_AND },
+ { "OFMarkInt", "1", "1,65535", ISPT_NUMERICAL_MIN },
+ { "IFMarkInt", "1", "1,65535", ISPT_NUMERICAL_MIN },
+ { "AuthMethod", "None", "CHAP,None", ISPT_LIST },
+ { "CHAP_A", "5", "5", ISPT_LIST },
+ { "CHAP_N", "", "", ISPT_DECLARATIVE },
+ { "CHAP_R", "", "", ISPT_DECLARATIVE },
+ { "CHAP_I", "", "", ISPT_DECLARATIVE },
+ { "CHAP_C", "", "", ISPT_DECLARATIVE },
+ { NULL, NULL, NULL, ISPT_INVALID },
+};
+
+static const struct iscsi_param_table sess_param_table[] = {
+ { "MaxConnections", "1", "1,65535", ISPT_NUMERICAL_MIN },
+#if 0
+ /* need special handling */
+ { "SendTargets", "", "", ISPT_DECLARATIVE },
+#endif
+ { "TargetName", "", "", ISPT_DECLARATIVE },
+ { "InitiatorName", "", "", ISPT_DECLARATIVE },
+ { "TargetAlias", "", "", ISPT_DECLARATIVE },
+ { "InitiatorAlias", "", "", ISPT_DECLARATIVE },
+ { "TargetAddress", "", "", ISPT_DECLARATIVE },
+ { "TargetPortalGroupTag", "1", "1,65535", ISPT_NUMERICAL_DECLARATIVE },
+ { "InitialR2T", "Yes", "Yes,No", ISPT_BOOLEAN_OR },
+ { "ImmediateData", "Yes", "Yes,No", ISPT_BOOLEAN_AND },
+ { "MaxBurstLength", "262144", "512,16777215", ISPT_NUMERICAL_MIN },
+ { "FirstBurstLength", "65536", "512,16777215", ISPT_NUMERICAL_MIN },
+ { "DefaultTime2Wait", "2", "0,3600", ISPT_NUMERICAL_MAX },
+ { "DefaultTime2Retain", "20", "0,3600", ISPT_NUMERICAL_MIN },
+ { "MaxOutstandingR2T", "1", "1,65536", ISPT_NUMERICAL_MIN },
+ { "DataPDUInOrder", "Yes", "Yes,No", ISPT_BOOLEAN_OR },
+ { "DataSequenceInOrder", "Yes", "Yes,No", ISPT_BOOLEAN_OR },
+ { "ErrorRecoveryLevel", "0", "0,2", ISPT_NUMERICAL_MIN },
+ { "SessionType", "Normal", "Normal,Discovery", ISPT_DECLARATIVE },
+ { NULL, NULL, NULL, ISPT_INVALID },
+};
+
+static int
+iscsi_params_init_internal(struct iscsi_param **params,
+ const struct iscsi_param_table *table)
+{
+ int rc;
+ int i;
+ struct iscsi_param *param;
+
+ for (i = 0; table[i].key != NULL; i++) {
+ rc = iscsi_param_add(params, table[i].key, table[i].val,
+ table[i].list, table[i].type);
+ if (rc < 0) {
+ SPDK_ERRLOG("iscsi_param_add() failed\n");
+ return -1;
+ }
+ param = iscsi_param_find(*params, table[i].key);
+ if (param != NULL) {
+ param->state_index = i;
+ } else {
+ SPDK_ERRLOG("iscsi_param_find() failed\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+int
+iscsi_conn_params_init(struct iscsi_param **params)
+{
+ return iscsi_params_init_internal(params, &conn_param_table[0]);
+}
+
+int
+iscsi_sess_params_init(struct iscsi_param **params)
+{
+ return iscsi_params_init_internal(params, &sess_param_table[0]);
+}
+
+static const char *chap_type[] = {
+ "CHAP_A",
+ "CHAP_N",
+ "CHAP_R",
+ "CHAP_I",
+ "CHAP_C",
+ NULL,
+};
+
+static const char *discovery_ignored_param[] = {
+ "MaxConnections",
+ "InitialR2T",
+ "ImmediateData",
+ "MaxBurstLength",
+ "FirstBurstLength"
+ "MaxOutstandingR2T",
+ "DataPDUInOrder",
+ "DataSequenceInOrder",
+ NULL,
+};
+
+static const char *multi_negot_conn_params[] = {
+ "MaxRecvDataSegmentLength",
+ NULL,
+};
+
+/* The following params should be declared only by the target */
+static const char *target_declarative_params[] = {
+ "TargetAlias",
+ "TargetAddress",
+ "TargetPortalGroupTag",
+ NULL,
+};
+
+/* Construct the response data for a special param (e.g.,
+ * MaxRecvDataSegmentLength).
+ * return:
+ * success: the total length of the data
+ * error: a negative value
+ */
+static int
+iscsi_special_param_construction(struct spdk_iscsi_conn *conn,
+ struct iscsi_param *param,
+ bool FirstBurstLength_flag, char *data,
+ int alloc_len, int total)
+{
+ int len;
+ struct iscsi_param *param_first;
+ struct iscsi_param *param_max;
+ uint32_t FirstBurstLength;
+ uint32_t MaxBurstLength;
+ char *val;
+
+ val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1);
+ if (!val) {
+ SPDK_ERRLOG("malloc() failed for temporary buffer\n");
+ return -ENOMEM;
+ }
+
+ if (strcasecmp(param->key, "MaxRecvDataSegmentLength") == 0) {
+ /*
+ * MaxRecvDataSegmentLength is sent by both
+ * initiator and target, but is declarative - meaning
+ * each direction can have different values.
+ * So when MaxRecvDataSegmentLength is found in the
+ * parameter set sent from the initiator, add the SPDK
+ * iSCSI target's MaxRecvDataSegmentLength value to
+ * the returned parameter list.
+ */
+ if (alloc_len - total < 1) {
+ SPDK_ERRLOG("data space small %d\n", alloc_len);
+ free(val);
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "returning MaxRecvDataSegmentLength=%d\n",
+ SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH);
+ len = snprintf((char *)data + total, alloc_len - total,
+ "MaxRecvDataSegmentLength=%d",
+ SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH);
+ total += len + 1;
+ }
+
+ if (strcasecmp(param->key, "MaxBurstLength") == 0 &&
+ !FirstBurstLength_flag) {
+ if (alloc_len - total < 1) {
+ SPDK_ERRLOG("data space small %d\n", alloc_len);
+ free(val);
+ return -1;
+ }
+
+ param_first = iscsi_param_find(conn->sess->params,
+ "FirstBurstLength");
+ if (param_first != NULL) {
+ FirstBurstLength = (uint32_t)strtol(param_first->val, NULL, 10);
+ } else {
+ FirstBurstLength = SPDK_ISCSI_FIRST_BURST_LENGTH;
+ }
+ param_max = iscsi_param_find(conn->sess->params,
+ "MaxBurstLength");
+ if (param_max != NULL) {
+ MaxBurstLength = (uint32_t)strtol(param_max->val, NULL, 10);
+ } else {
+ MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH;
+ }
+
+ if (FirstBurstLength > MaxBurstLength) {
+ FirstBurstLength = MaxBurstLength;
+ if (param_first != NULL) {
+ free(param_first->val);
+ snprintf(val, ISCSI_TEXT_MAX_VAL_LEN, "%d",
+ FirstBurstLength);
+ param_first->val = xstrdup(val);
+ }
+ }
+ len = snprintf((char *)data + total, alloc_len - total,
+ "FirstBurstLength=%d", FirstBurstLength);
+ total += len + 1;
+ }
+
+ free(val);
+ return total;
+
+}
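+/*
+ * For example, if the declared FirstBurstLength would exceed the negotiated
+ * MaxBurstLength, it is clamped to MaxBurstLength and the adjusted
+ * "FirstBurstLength=<value>" pair is appended to the response data.
+ */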
+
+/**
+ * iscsi_construct_data_from_param:
+ * Construct the data that will be returned to the initiator.
+ * return: accumulated length of the negotiated data, or -1 on error
+ */
+static int
+iscsi_construct_data_from_param(struct iscsi_param *param, char *new_val,
+ char *data, int alloc_len, int total)
+{
+ int len;
+
+ if (param->type != ISPT_DECLARATIVE &&
+ param->type != ISPT_NUMERICAL_DECLARATIVE) {
+ if (alloc_len - total < 1) {
+ SPDK_ERRLOG("data space small %d\n", alloc_len);
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "negotiated %s=%s\n",
+ param->key, new_val);
+ len = snprintf((char *)data + total, alloc_len - total, "%s=%s",
+ param->key, new_val);
+ total += len + 1;
+ }
+ return total;
+}
+
+/**
+ * To negotiate param with
+ * type = ISPT_LIST
+ * return: the negotiated value of the key
+ */
+static char *
+iscsi_negotiate_param_list(int *add_param_value,
+ struct iscsi_param *param,
+ char *valid_list, char *in_val,
+ char *cur_val)
+{
+ char *val_start, *val_end;
+ char *in_start, *in_end;
+ int flag = 0;
+
+ if (add_param_value == NULL) {
+ return NULL;
+ }
+
+ in_start = in_val;
+ do {
+ if ((in_end = strchr(in_start, (int)',')) != NULL) {
+ *in_end = '\0';
+ }
+ val_start = valid_list;
+ do {
+ if ((val_end = strchr(val_start, (int)',')) != NULL) {
+ *val_end = '\0';
+ }
+ if (strcasecmp(in_start, val_start) == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "match %s\n",
+ val_start);
+ flag = 1;
+ break;
+ }
+ if (val_end) {
+ *val_end = ',';
+ val_start = val_end + 1;
+ }
+ } while (val_end);
+ if (flag) {
+ break;
+ }
+ if (in_end) {
+ *in_end = ',';
+ in_start = in_end + 1;
+ }
+ } while (in_end);
+
+ return flag ? val_start : NULL;
+}
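+/*
+ * For example, an initiator offering AuthMethod=CHAP,None against the valid
+ * list "CHAP,None" negotiates to "CHAP": the first offered value that also
+ * appears in the valid list wins.
+ */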
+
+/**
+ * To negotiate param with
+ * type = ISPT_NUMERICAL_MIN/MAX, ISPT_NUMERICAL_DECLARATIVE
+ * return: the negotiated value of the key
+ */
+static char *
+iscsi_negotiate_param_numerical(int *add_param_value,
+ struct iscsi_param *param,
+ char *valid_list, char *in_val,
+ char *cur_val)
+{
+ char *valid_next;
+ char *new_val = NULL;
+ char *min_val, *max_val;
+ int val_i, cur_val_i;
+ int min_i, max_i;
+
+ if (add_param_value == NULL) {
+ return NULL;
+ }
+
+ val_i = (int)strtol(param->val, NULL, 10);
+ /* check whether the key is FirstBurstLength; if so, use in_val */
+ if (strcasecmp(param->key, "FirstBurstLength") == 0) {
+ val_i = (int)strtol(in_val, NULL, 10);
+ }
+
+ cur_val_i = (int)strtol(cur_val, NULL, 10);
+ valid_next = valid_list;
+ min_val = spdk_strsepq(&valid_next, ",");
+ max_val = spdk_strsepq(&valid_next, ",");
+ min_i = (min_val != NULL) ? (int)strtol(min_val, NULL, 10) : 0;
+ max_i = (max_val != NULL) ? (int)strtol(max_val, NULL, 10) : 0;
+ if (val_i < min_i || val_i > max_i) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "key %.64s reject\n", param->key);
+ new_val = NULL;
+ } else {
+ switch (param->type) {
+ case ISPT_NUMERICAL_MIN:
+ if (val_i > cur_val_i) {
+ val_i = cur_val_i;
+ }
+ break;
+ case ISPT_NUMERICAL_MAX:
+ if (val_i < cur_val_i) {
+ val_i = cur_val_i;
+ }
+ break;
+ default:
+ break;
+ }
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", val_i);
+ new_val = in_val;
+ }
+
+ return new_val;
+}
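+/*
+ * For example, MaxBurstLength (ISPT_NUMERICAL_MIN, range "512,16777215"):
+ * an initiator offer of 1048576 against a current value of 262144 negotiates
+ * to 262144, the smaller of the two; offers outside the range are rejected.
+ */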
+
+/**
+ * To negotiate param with
+ * type = ISPT_BOOLEAN_OR, ISPT_BOOLEAN_AND
+ * return: the negotiated value of the key
+ */
+static char *
+iscsi_negotiate_param_boolean(int *add_param_value,
+ struct iscsi_param *param,
+ char *in_val, char *cur_val,
+ const char *value)
+{
+ char *new_val = NULL;
+
+ if (add_param_value == NULL) {
+ return NULL;
+ }
+
+ /* Make sure the val is Yes or No */
+ if (!((strcasecmp(in_val, "Yes") == 0) ||
+ (strcasecmp(in_val, "No") == 0))) {
+ /* unknown value */
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject");
+ new_val = in_val;
+ *add_param_value = 1;
+ return new_val;
+ }
+
+ if (strcasecmp(cur_val, value) == 0) {
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", value);
+ new_val = in_val;
+ } else {
+ new_val = param->val;
+ }
+
+ return new_val;
+}
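+/*
+ * For example, ImmediateData is ISPT_BOOLEAN_AND with "No": the result is
+ * "No" whenever either side's value is "No". InitialR2T is ISPT_BOOLEAN_OR
+ * with "Yes", so a single "Yes" on either side yields "Yes".
+ */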
+
+/**
+ * The entry function to handle each type of the param
+ * return value: the new negotiated value
+ */
+static char *
+iscsi_negotiate_param_all(int *add_param_value, struct iscsi_param *param,
+ char *valid_list, char *in_val, char *cur_val)
+{
+ char *new_val;
+ switch (param->type) {
+ case ISPT_LIST:
+ new_val = iscsi_negotiate_param_list(add_param_value,
+ param,
+ valid_list,
+ in_val,
+ cur_val);
+ break;
+
+ case ISPT_NUMERICAL_MIN:
+ case ISPT_NUMERICAL_MAX:
+ case ISPT_NUMERICAL_DECLARATIVE:
+ new_val = iscsi_negotiate_param_numerical(add_param_value,
+ param,
+ valid_list,
+ in_val,
+ cur_val);
+ break;
+
+ case ISPT_BOOLEAN_OR:
+ new_val = iscsi_negotiate_param_boolean(add_param_value,
+ param,
+ in_val,
+ cur_val,
+ "Yes");
+ break;
+ case ISPT_BOOLEAN_AND:
+ new_val = iscsi_negotiate_param_boolean(add_param_value,
+ param,
+ in_val,
+ cur_val,
+ "No");
+ break;
+
+ default:
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val);
+ new_val = in_val;
+ break;
+ }
+
+ return new_val;
+}
+
+/**
+ * Determine whether the param belongs to the session's params or to the
+ * connection's params.
+ */
+static int
+iscsi_negotiate_param_init(struct spdk_iscsi_conn *conn,
+ struct iscsi_param **cur_param_p,
+ struct iscsi_param **params_dst_p,
+ struct iscsi_param *param)
+{
+ int index;
+
+ *cur_param_p = iscsi_param_find(*params_dst_p, param->key);
+ if (*cur_param_p == NULL) {
+ *params_dst_p = conn->sess->params;
+ *cur_param_p = iscsi_param_find(*params_dst_p, param->key);
+ if (*cur_param_p == NULL) {
+ if ((strncasecmp(param->key, "X-", 2) == 0) ||
+ (strncasecmp(param->key, "X#", 2) == 0)) {
+ /* Extension Key */
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "extension key %.64s\n",
+ param->key);
+ } else {
+ SPDK_ERRLOG("unknown key %.64s\n", param->key);
+ }
+ return 1;
+ } else {
+ index = (*cur_param_p)->state_index;
+ if (conn->sess_param_state_negotiated[index] &&
+ !iscsi_find_key_in_array(param->key,
+ target_declarative_params)) {
+ return SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE;
+ }
+ conn->sess_param_state_negotiated[index] = true;
+ }
+ } else {
+ index = (*cur_param_p)->state_index;
+ if (conn->conn_param_state_negotiated[index] &&
+ !iscsi_find_key_in_array(param->key,
+ multi_negot_conn_params)) {
+ return SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE;
+ }
+ conn->conn_param_state_negotiated[index] = true;
+ }
+
+ return 0;
+}
+
+int
+iscsi_negotiate_params(struct spdk_iscsi_conn *conn,
+ struct iscsi_param **params, uint8_t *data, int alloc_len,
+ int data_len)
+{
+ struct iscsi_param *param;
+ struct iscsi_param *cur_param;
+ char *valid_list, *in_val;
+ char *cur_val;
+ char *new_val;
+ int discovery;
+ int total;
+ int rc;
+ uint32_t FirstBurstLength;
+ uint32_t MaxBurstLength;
+ bool FirstBurstLength_flag = false;
+ int type;
+
+ total = data_len;
+ if (data_len < 0) {
+ assert(false);
+ return -EINVAL;
+ }
+ if (alloc_len < 1) {
+ return 0;
+ }
+ if (total > alloc_len) {
+ total = alloc_len;
+ data[total - 1] = '\0';
+ return total;
+ }
+
+ if (*params == NULL) {
+ /* no input */
+ return total;
+ }
+
+ /* discovery? */
+ discovery = 0;
+ cur_param = iscsi_param_find(*params, "SessionType");
+ if (cur_param == NULL) {
+ cur_param = iscsi_param_find(conn->sess->params, "SessionType");
+ if (cur_param == NULL) {
+ /* no session type */
+ } else {
+ if (strcasecmp(cur_param->val, "Discovery") == 0) {
+ discovery = 1;
+ }
+ }
+ } else {
+ if (strcasecmp(cur_param->val, "Discovery") == 0) {
+ discovery = 1;
+ }
+ }
+
+ /* for temporary store */
+ valid_list = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1);
+ if (!valid_list) {
+ SPDK_ERRLOG("malloc() failed for valid_list\n");
+ return -ENOMEM;
+ }
+
+ in_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1);
+ if (!in_val) {
+ SPDK_ERRLOG("malloc() failed for in_val\n");
+ free(valid_list);
+ return -ENOMEM;
+ }
+
+ cur_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1);
+ if (!cur_val) {
+ SPDK_ERRLOG("malloc() failed for cur_val\n");
+ free(valid_list);
+ free(in_val);
+ return -ENOMEM;
+ }
+
+ /* Move FirstBurstLength to the end of the list so that MaxBurstLength
+ * is always negotiated first.
+ */
+ param = iscsi_param_find(*params, "MaxBurstLength");
+ if (param != NULL) {
+ param = iscsi_param_find(*params, "FirstBurstLength");
+
+ /* check the existence of FirstBurstLength */
+ if (param != NULL) {
+ FirstBurstLength_flag = true;
+ if (param->next != NULL) {
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val);
+ type = param->type;
+ iscsi_param_add(params, "FirstBurstLength",
+ in_val, NULL, type);
+ }
+ }
+ }
+
+ for (param = *params; param != NULL; param = param->next) {
+ struct iscsi_param *params_dst = conn->params;
+ int add_param_value = 0;
+ new_val = NULL;
+ param->type = ISPT_INVALID;
+
+ /* sendtargets is special */
+ if (strcasecmp(param->key, "SendTargets") == 0) {
+ continue;
+ }
+ /* CHAP keys */
+ if (iscsi_find_key_in_array(param->key, chap_type)) {
+ continue;
+ }
+
+ /* 12.2, 12.10, 12.11, 12.13, 12.14, 12.17, 12.18, 12.19 */
+ if (discovery &&
+ iscsi_find_key_in_array(param->key, discovery_ignored_param)) {
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Irrelevant");
+ new_val = in_val;
+ add_param_value = 1;
+ } else {
+ rc = iscsi_negotiate_param_init(conn,
+ &cur_param,
+ &params_dst,
+ param);
+ if (rc < 0) {
+ free(valid_list);
+ free(in_val);
+ free(cur_val);
+ return rc;
+ } else if (rc > 0) {
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "NotUnderstood");
+ new_val = in_val;
+ add_param_value = 1;
+ } else {
+ snprintf(valid_list, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", cur_param->list);
+ snprintf(cur_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", cur_param->val);
+ param->type = cur_param->type;
+ }
+ }
+
+ if (param->type > 0) {
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val);
+
+ /* "NotUnderstood" value shouldn't be assigned to "Understood" key */
+ if (strcasecmp(in_val, "NotUnderstood") == 0) {
+ free(in_val);
+ free(valid_list);
+ free(cur_val);
+ return SPDK_ISCSI_LOGIN_ERROR_PARAMETER;
+ }
+
+ if (strcasecmp(param->key, "FirstBurstLength") == 0) {
+ FirstBurstLength = (uint32_t)strtol(param->val, NULL,
+ 10);
+ new_val = iscsi_param_get_val(conn->sess->params,
+ "MaxBurstLength");
+ if (new_val != NULL) {
+ MaxBurstLength = (uint32_t) strtol(new_val, NULL,
+ 10);
+ } else {
+ MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH;
+ }
+ if (FirstBurstLength < SPDK_ISCSI_MAX_FIRST_BURST_LENGTH &&
+ FirstBurstLength > MaxBurstLength) {
+ FirstBurstLength = MaxBurstLength;
+ snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d",
+ FirstBurstLength);
+ }
+ }
+
+ /* prevent target's declarative params from being changed by initiator */
+ if (iscsi_find_key_in_array(param->key, target_declarative_params)) {
+ add_param_value = 1;
+ }
+
+ new_val = iscsi_negotiate_param_all(&add_param_value,
+ param,
+ valid_list,
+ in_val,
+ cur_val);
+ }
+
+ /* check the negotiated value of the key */
+ if (new_val != NULL) {
+ /* add_param_value = 0 means updating the value of an
+ * existing key in the connection's parameters
+ */
+ if (add_param_value == 0) {
+ iscsi_param_set(params_dst, param->key, new_val);
+ }
+ total = iscsi_construct_data_from_param(param,
+ new_val,
+ data,
+ alloc_len,
+ total);
+ if (total < 0) {
+ goto final_return;
+ }
+
+ total = iscsi_special_param_construction(conn,
+ param,
+ FirstBurstLength_flag,
+ data,
+ alloc_len,
+ total);
+ if (total < 0) {
+ goto final_return;
+ }
+ } else {
+ total = -1;
+ break;
+ }
+ }
+
+final_return:
+ free(valid_list);
+ free(in_val);
+ free(cur_val);
+
+ return total;
+}
+
+int
+iscsi_copy_param2var(struct spdk_iscsi_conn *conn)
+{
+ const char *val;
+
+ val = iscsi_param_get_val(conn->params, "MaxRecvDataSegmentLength");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval MaxRecvDataSegmentLength failed\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "copy MaxRecvDataSegmentLength=%s\n", val);
+ conn->MaxRecvDataSegmentLength = (int)strtol(val, NULL, 10);
+ if (conn->MaxRecvDataSegmentLength > SPDK_BDEV_LARGE_BUF_MAX_SIZE) {
+ conn->MaxRecvDataSegmentLength = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
+ }
+
+ val = iscsi_param_get_val(conn->params, "HeaderDigest");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval HeaderDigest failed\n");
+ return -1;
+ }
+ if (strcasecmp(val, "CRC32C") == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set HeaderDigest=1\n");
+ conn->header_digest = 1;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set HeaderDigest=0\n");
+ conn->header_digest = 0;
+ }
+ val = iscsi_param_get_val(conn->params, "DataDigest");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval DataDigest failed\n");
+ return -1;
+ }
+ if (strcasecmp(val, "CRC32C") == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set DataDigest=1\n");
+ conn->data_digest = 1;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set DataDigest=0\n");
+ conn->data_digest = 0;
+ }
+
+ val = iscsi_param_get_val(conn->sess->params, "MaxConnections");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval MaxConnections failed\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxConnections=%s\n", val);
+ conn->sess->MaxConnections = (uint32_t) strtol(val, NULL, 10);
+ val = iscsi_param_get_val(conn->sess->params, "MaxOutstandingR2T");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval MaxOutstandingR2T failed\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxOutstandingR2T=%s\n", val);
+ conn->sess->MaxOutstandingR2T = (uint32_t) strtol(val, NULL, 10);
+ val = iscsi_param_get_val(conn->sess->params, "FirstBurstLength");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval FirstBurstLength failed\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy FirstBurstLength=%s\n", val);
+ conn->sess->FirstBurstLength = (uint32_t) strtol(val, NULL, 10);
+ val = iscsi_param_get_val(conn->sess->params, "MaxBurstLength");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval MaxBurstLength failed\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxBurstLength=%s\n", val);
+ conn->sess->MaxBurstLength = (uint32_t) strtol(val, NULL, 10);
+ val = iscsi_param_get_val(conn->sess->params, "InitialR2T");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval InitialR2T failed\n");
+ return -1;
+ }
+ if (strcasecmp(val, "Yes") == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set InitialR2T=1\n");
+ conn->sess->InitialR2T = true;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set InitialR2T=0\n");
+ conn->sess->InitialR2T = false;
+ }
+ val = iscsi_param_get_val(conn->sess->params, "ImmediateData");
+ if (val == NULL) {
+ SPDK_ERRLOG("Getval ImmediateData failed\n");
+ return -1;
+ }
+ if (strcasecmp(val, "Yes") == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set ImmediateData=1\n");
+ conn->sess->ImmediateData = true;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set ImmediateData=0\n");
+ conn->sess->ImmediateData = false;
+ }
+ return 0;
+}
diff --git a/src/spdk/lib/iscsi/param.h b/src/spdk/lib/iscsi/param.h
new file mode 100644
index 000000000..ce194c514
--- /dev/null
+++ b/src/spdk/lib/iscsi/param.h
@@ -0,0 +1,94 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ISCSI_PARAM_H
+#define SPDK_ISCSI_PARAM_H
+
+#include "spdk/stdinc.h"
+
+struct spdk_iscsi_conn;
+
+enum iscsi_param_type {
+ ISPT_INVALID = -1,
+ ISPT_NOTSPECIFIED = 0,
+ ISPT_LIST,
+ ISPT_NUMERICAL_MIN,
+ ISPT_NUMERICAL_MAX,
+ ISPT_NUMERICAL_DECLARATIVE,
+ ISPT_DECLARATIVE,
+ ISPT_BOOLEAN_OR,
+ ISPT_BOOLEAN_AND,
+};
+
+struct iscsi_param {
+ struct iscsi_param *next;
+ char *key;
+ char *val;
+ char *list;
+ int type;
+ int state_index;
+};
+
+void
+iscsi_param_free(struct iscsi_param *params);
+struct iscsi_param *
+iscsi_param_find(struct iscsi_param *params, const char *key);
+int
+iscsi_param_del(struct iscsi_param **params, const char *key);
+int
+iscsi_param_add(struct iscsi_param **params, const char *key,
+ const char *val, const char *list, int type);
+int
+iscsi_param_set(struct iscsi_param *params, const char *key,
+ const char *val);
+int
+iscsi_param_set_int(struct iscsi_param *params, const char *key, uint32_t val);
+int
+iscsi_parse_params(struct iscsi_param **params, const uint8_t *data,
+ int len, bool cbit_enabled, char **partial_parameter);
+char *
+iscsi_param_get_val(struct iscsi_param *params, const char *key);
+int
+iscsi_param_eq_val(struct iscsi_param *params, const char *key,
+ const char *val);
+
+int iscsi_negotiate_params(struct spdk_iscsi_conn *conn,
+ struct iscsi_param **params_p, uint8_t *data,
+ int alloc_len, int data_len);
+int iscsi_copy_param2var(struct spdk_iscsi_conn *conn);
+
+int iscsi_conn_params_init(struct iscsi_param **params);
+int iscsi_sess_params_init(struct iscsi_param **params);
+
+#endif /* SPDK_ISCSI_PARAM_H */
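As a brief illustration of the key/value parameter API declared above (a hedged sketch, not part of the patch; the include path, the "HeaderDigest" key and its value list are assumptions for the example), a caller typically builds a list, queries it, and frees it:

#include "iscsi/param.h"	/* declarations shown above */

/* Sketch: add one negotiable key, read it back, then release the list. */
static int example_param_usage(void)
{
	struct iscsi_param *params = NULL;
	const char *val;

	if (iscsi_param_add(&params, "HeaderDigest", "None", "CRC32C,None",
			    ISPT_LIST) != 0) {
		return -1;
	}

	val = iscsi_param_get_val(params, "HeaderDigest");
	if (val == NULL) {
		iscsi_param_free(params);
		return -1;
	}

	iscsi_param_free(params);
	return 0;
}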
diff --git a/src/spdk/lib/iscsi/portal_grp.c b/src/spdk/lib/iscsi/portal_grp.c
new file mode 100644
index 000000000..986562ad7
--- /dev/null
+++ b/src/spdk/lib/iscsi/portal_grp.c
@@ -0,0 +1,655 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/conf.h"
+#include "spdk/sock.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+#include "iscsi/iscsi.h"
+#include "iscsi/conn.h"
+#include "iscsi/portal_grp.h"
+#include "iscsi/tgt_node.h"
+
+#define PORTNUMSTRLEN 32
+#define ACCEPT_TIMEOUT_US 1000 /* 1ms */
+
+static int
+iscsi_portal_accept(void *arg)
+{
+ struct spdk_iscsi_portal *portal = arg;
+ struct spdk_sock *sock;
+ int rc;
+ int count = 0;
+
+ if (portal->sock == NULL) {
+ return -1;
+ }
+
+ while (1) {
+ sock = spdk_sock_accept(portal->sock);
+ if (sock != NULL) {
+ rc = iscsi_conn_construct(portal, sock);
+ if (rc < 0) {
+ spdk_sock_close(&sock);
+ SPDK_ERRLOG("spdk_iscsi_connection_construct() failed\n");
+ break;
+ }
+ count++;
+ } else {
+ if (errno != EAGAIN && errno != EWOULDBLOCK) {
+ SPDK_ERRLOG("accept error(%d): %s\n", errno, spdk_strerror(errno));
+ }
+ break;
+ }
+ }
+
+ return count;
+}
+
+static struct spdk_iscsi_portal *
+iscsi_portal_find_by_addr(const char *host, const char *port)
+{
+ struct spdk_iscsi_portal *p;
+
+ TAILQ_FOREACH(p, &g_iscsi.portal_head, g_tailq) {
+ if (!strcmp(p->host, host) && !strcmp(p->port, port)) {
+ return p;
+ }
+ }
+
+ return NULL;
+}
+
+/* Assumes caller allocated host and port strings on the heap */
+struct spdk_iscsi_portal *
+iscsi_portal_create(const char *host, const char *port)
+{
+ struct spdk_iscsi_portal *p = NULL, *tmp;
+
+ assert(host != NULL);
+ assert(port != NULL);
+
+ if (strlen(host) > MAX_PORTAL_ADDR || strlen(port) > MAX_PORTAL_PORT) {
+ return NULL;
+ }
+
+ p = calloc(1, sizeof(*p));
+ if (!p) {
+ SPDK_ERRLOG("calloc() failed for portal\n");
+ return NULL;
+ }
+
+ /* check and overwrite abbreviation of wildcard */
+ if (strcasecmp(host, "[*]") == 0) {
+ SPDK_WARNLOG("Please use \"[::]\" as IPv6 wildcard\n");
+ SPDK_WARNLOG("Convert \"[*]\" to \"[::]\" automatically\n");
+ SPDK_WARNLOG("(Use of \"[*]\" will be deprecated in a future release)");
+ snprintf(p->host, sizeof(p->host), "[::]");
+ } else if (strcasecmp(host, "*") == 0) {
+ SPDK_WARNLOG("Please use \"0.0.0.0\" as IPv4 wildcard\n");
+ SPDK_WARNLOG("Convert \"*\" to \"0.0.0.0\" automatically\n");
+ SPDK_WARNLOG("(Use of \"[*]\" will be deprecated in a future release)");
+ snprintf(p->host, sizeof(p->host), "0.0.0.0");
+ } else {
+ memcpy(p->host, host, strlen(host));
+ }
+
+ memcpy(p->port, port, strlen(port));
+
+ p->sock = NULL;
+ p->group = NULL; /* set at a later time by caller */
+ p->acceptor_poller = NULL;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ tmp = iscsi_portal_find_by_addr(host, port);
+ if (tmp != NULL) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ SPDK_ERRLOG("portal (%s, %s) already exists\n", host, port);
+ goto error_out;
+ }
+
+ TAILQ_INSERT_TAIL(&g_iscsi.portal_head, p, g_tailq);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ return p;
+
+error_out:
+ free(p);
+
+ return NULL;
+}
+
+void
+iscsi_portal_destroy(struct spdk_iscsi_portal *p)
+{
+ assert(p != NULL);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_destroy\n");
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ TAILQ_REMOVE(&g_iscsi.portal_head, p, g_tailq);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ free(p);
+}
+
+static int
+iscsi_portal_open(struct spdk_iscsi_portal *p)
+{
+ struct spdk_sock *sock;
+ int port;
+
+ if (p->sock != NULL) {
+ SPDK_ERRLOG("portal (%s, %s) is already opened\n",
+ p->host, p->port);
+ return -1;
+ }
+
+ port = (int)strtol(p->port, NULL, 0);
+ sock = spdk_sock_listen(p->host, port, NULL);
+ if (sock == NULL) {
+ SPDK_ERRLOG("listen error %.64s.%d\n", p->host, port);
+ return -1;
+ }
+
+ p->sock = sock;
+
+ /*
+ * When the portal is created from the config file, incoming connection
+ * requests on the socket are left pending until the reactors start and
+ * accept() is called. However, the gap between listen() and accept() is
+ * short, and pending requests are held by the socket's nonzero backlog
+ * or retransmitted by TCP.
+ */
+ p->acceptor_poller = SPDK_POLLER_REGISTER(iscsi_portal_accept, p, ACCEPT_TIMEOUT_US);
+
+ return 0;
+}
+
+static void
+iscsi_portal_close(struct spdk_iscsi_portal *p)
+{
+ if (p->sock) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "close portal (%s, %s)\n",
+ p->host, p->port);
+ spdk_poller_unregister(&p->acceptor_poller);
+ spdk_sock_close(&p->sock);
+ }
+}
+
+static int
+iscsi_parse_portal(const char *portalstring, struct spdk_iscsi_portal **ip)
+{
+ char *host = NULL, *port = NULL;
+ int len, rc = -1;
+ const char *p;
+
+ if (portalstring == NULL) {
+ SPDK_ERRLOG("portal error\n");
+ goto error_out;
+ }
+
+ /* IP address */
+ if (portalstring[0] == '[') {
+ /* IPv6 */
+ p = strchr(portalstring + 1, ']');
+ if (p == NULL) {
+ SPDK_ERRLOG("portal error\n");
+ goto error_out;
+ }
+ p++;
+ } else {
+ /* IPv4 */
+ p = strchr(portalstring, ':');
+ if (p == NULL) {
+ p = portalstring + strlen(portalstring);
+ }
+ }
+
+ len = p - portalstring;
+ host = malloc(len + 1);
+ if (host == NULL) {
+ SPDK_ERRLOG("malloc() failed for host\n");
+ goto error_out;
+ }
+ memcpy(host, portalstring, len);
+ host[len] = '\0';
+
+ /* Port number (IPv4 and IPv6 are the same) */
+ if (p[0] == '\0') {
+ port = malloc(PORTNUMSTRLEN);
+ if (!port) {
+ SPDK_ERRLOG("malloc() failed for port\n");
+ goto error_out;
+ }
+ snprintf(port, PORTNUMSTRLEN, "%d", DEFAULT_PORT);
+ } else {
+ p++;
+ len = strlen(p);
+ port = malloc(len + 1);
+ if (port == NULL) {
+ SPDK_ERRLOG("malloc() failed for port\n");
+ goto error_out;
+ }
+ memcpy(port, p, len);
+ port[len] = '\0';
+ }
+
+ *ip = iscsi_portal_create(host, port);
+ if (!*ip) {
+ goto error_out;
+ }
+
+ rc = 0;
+error_out:
+ free(host);
+ free(port);
+
+ return rc;
+}
+
+struct spdk_iscsi_portal_grp *
+iscsi_portal_grp_create(int tag)
+{
+ struct spdk_iscsi_portal_grp *pg = malloc(sizeof(*pg));
+
+ if (!pg) {
+ SPDK_ERRLOG("malloc() failed for portal group\n");
+ return NULL;
+ }
+
+ pg->ref = 0;
+ pg->tag = tag;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ pg->disable_chap = g_iscsi.disable_chap;
+ pg->require_chap = g_iscsi.require_chap;
+ pg->mutual_chap = g_iscsi.mutual_chap;
+ pg->chap_group = g_iscsi.chap_group;
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ TAILQ_INIT(&pg->head);
+
+ return pg;
+}
+
+void
+iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg)
+{
+ struct spdk_iscsi_portal *p;
+
+ assert(pg != NULL);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grp_destroy\n");
+ while (!TAILQ_EMPTY(&pg->head)) {
+ p = TAILQ_FIRST(&pg->head);
+ TAILQ_REMOVE(&pg->head, p, per_pg_tailq);
+ iscsi_portal_destroy(p);
+ }
+ free(pg);
+}
+
+int
+iscsi_portal_grp_register(struct spdk_iscsi_portal_grp *pg)
+{
+ int rc = -1;
+ struct spdk_iscsi_portal_grp *tmp;
+
+ assert(pg != NULL);
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ tmp = iscsi_portal_grp_find_by_tag(pg->tag);
+ if (tmp == NULL) {
+ TAILQ_INSERT_TAIL(&g_iscsi.pg_head, pg, tailq);
+ rc = 0;
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return rc;
+}
+
+void
+iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg,
+ struct spdk_iscsi_portal *p)
+{
+ assert(pg != NULL);
+ assert(p != NULL);
+
+ p->group = pg;
+ TAILQ_INSERT_TAIL(&pg->head, p, per_pg_tailq);
+}
+
+int
+iscsi_portal_grp_set_chap_params(struct spdk_iscsi_portal_grp *pg,
+ bool disable_chap, bool require_chap,
+ bool mutual_chap, int32_t chap_group)
+{
+ if (!iscsi_check_chap_params(disable_chap, require_chap,
+ mutual_chap, chap_group)) {
+ return -EINVAL;
+ }
+
+ pg->disable_chap = disable_chap;
+ pg->require_chap = require_chap;
+ pg->mutual_chap = mutual_chap;
+ pg->chap_group = chap_group;
+
+ return 0;
+}
+
+static int
+iscsi_parse_portal_grp(struct spdk_conf_section *sp)
+{
+ struct spdk_iscsi_portal_grp *pg;
+ struct spdk_iscsi_portal *p;
+ const char *val;
+ char *label, *portal;
+ int i = 0, rc = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add portal group (from config file) %d\n",
+ spdk_conf_section_get_num(sp));
+
+ val = spdk_conf_section_get_val(sp, "Comment");
+ if (val != NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val);
+ }
+
+ pg = iscsi_portal_grp_create(spdk_conf_section_get_num(sp));
+ if (!pg) {
+ SPDK_ERRLOG("portal group malloc error (%s)\n", spdk_conf_section_get_name(sp));
+ return -1;
+ }
+
+ for (i = 0; ; i++) {
+ label = spdk_conf_section_get_nmval(sp, "Portal", i, 0);
+ portal = spdk_conf_section_get_nmval(sp, "Portal", i, 1);
+ if (label == NULL || portal == NULL) {
+ break;
+ }
+
+ rc = iscsi_parse_portal(portal, &p);
+ if (rc < 0) {
+ SPDK_ERRLOG("parse portal error (%s)\n", portal);
+ goto error;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "RIndex=%d, Host=%s, Port=%s, Tag=%d\n",
+ i, p->host, p->port, spdk_conf_section_get_num(sp));
+
+ iscsi_portal_grp_add_portal(pg, p);
+ }
+
+ rc = iscsi_portal_grp_open(pg);
+ if (rc != 0) {
+ SPDK_ERRLOG("portal_grp_open failed\n");
+ goto error;
+ }
+
+ /* Add portal group to the end of the pg list */
+ rc = iscsi_portal_grp_register(pg);
+ if (rc != 0) {
+ SPDK_ERRLOG("register portal failed\n");
+ goto error;
+ }
+
+ return 0;
+
+error:
+ iscsi_portal_grp_release(pg);
+ return -1;
+}
+
+struct spdk_iscsi_portal_grp *
+iscsi_portal_grp_find_by_tag(int tag)
+{
+ struct spdk_iscsi_portal_grp *pg;
+
+ TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) {
+ if (pg->tag == tag) {
+ return pg;
+ }
+ }
+
+ return NULL;
+}
+
+int
+iscsi_parse_portal_grps(void)
+{
+ int rc = 0;
+ struct spdk_conf_section *sp;
+
+ sp = spdk_conf_first_section(NULL);
+ while (sp != NULL) {
+ if (spdk_conf_section_match_prefix(sp, "PortalGroup")) {
+ if (spdk_conf_section_get_num(sp) == 0) {
+ SPDK_ERRLOG("Group 0 is invalid\n");
+ return -1;
+ }
+
+ /* Build portal group from cfg section PortalGroup */
+ rc = iscsi_parse_portal_grp(sp);
+ if (rc < 0) {
+ SPDK_ERRLOG("parse_portal_group() failed\n");
+ return -1;
+ }
+ }
+ sp = spdk_conf_next_section(sp);
+ }
+ return 0;
+}
+
+void
+iscsi_portal_grps_destroy(void)
+{
+ struct spdk_iscsi_portal_grp *pg;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grps_destroy\n");
+ pthread_mutex_lock(&g_iscsi.mutex);
+ while (!TAILQ_EMPTY(&g_iscsi.pg_head)) {
+ pg = TAILQ_FIRST(&g_iscsi.pg_head);
+ TAILQ_REMOVE(&g_iscsi.pg_head, pg, tailq);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ iscsi_portal_grp_destroy(pg);
+ pthread_mutex_lock(&g_iscsi.mutex);
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+}
+
+int
+iscsi_portal_grp_open(struct spdk_iscsi_portal_grp *pg)
+{
+ struct spdk_iscsi_portal *p;
+ int rc;
+
+ TAILQ_FOREACH(p, &pg->head, per_pg_tailq) {
+ rc = iscsi_portal_open(p);
+ if (rc < 0) {
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static void
+iscsi_portal_grp_close(struct spdk_iscsi_portal_grp *pg)
+{
+ struct spdk_iscsi_portal *p;
+
+ TAILQ_FOREACH(p, &pg->head, per_pg_tailq) {
+ iscsi_portal_close(p);
+ }
+}
+
+void
+iscsi_portal_grp_close_all(void)
+{
+ struct spdk_iscsi_portal_grp *pg;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_portal_grp_close_all\n");
+ pthread_mutex_lock(&g_iscsi.mutex);
+ TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) {
+ iscsi_portal_grp_close(pg);
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+}
+
+struct spdk_iscsi_portal_grp *
+iscsi_portal_grp_unregister(int tag)
+{
+ struct spdk_iscsi_portal_grp *pg;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) {
+ if (pg->tag == tag) {
+ TAILQ_REMOVE(&g_iscsi.pg_head, pg, tailq);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return pg;
+ }
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return NULL;
+}
+
+void
+iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg)
+{
+ iscsi_portal_grp_close(pg);
+ iscsi_portal_grp_destroy(pg);
+}
+
+static const char *portal_group_section = \
+ "\n"
+ "# Users must change the PortalGroup section(s) to match the IP addresses\n"
+ "# for their environment.\n"
+ "# PortalGroup sections define which network portals the iSCSI target\n"
+ "# will use to listen for incoming connections. These are also used to\n"
+ "# determine which targets are accessible over each portal group.\n"
+ "# Up to 1024 Portal directives are allowed. These define the network\n"
+ "# portals of the portal group. The user must specify a IP address\n"
+ "# for each network portal, and may optionally specify a port.\n"
+ "# If the port is omitted, 3260 will be used\n"
+ "# Syntax:\n"
+ "# Portal <Name> <IP address>[:<port>]\n";
+
+#define PORTAL_GROUP_TMPL \
+"[PortalGroup%d]\n" \
+" Comment \"Portal%d\"\n"
+
+#define PORTAL_TMPL \
+" Portal DA1 %s:%s\n"
+
+void
+iscsi_portal_grps_config_text(FILE *fp)
+{
+ struct spdk_iscsi_portal *p = NULL;
+ struct spdk_iscsi_portal_grp *pg = NULL;
+
+ /* Create portal group section */
+ fprintf(fp, "%s", portal_group_section);
+
+ /* Dump portal groups */
+ TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) {
+ fprintf(fp, PORTAL_GROUP_TMPL, pg->tag, pg->tag);
+ /* Dump portals */
+ TAILQ_FOREACH(p, &pg->head, per_pg_tailq) {
+ fprintf(fp, PORTAL_TMPL, p->host, p->port);
+ }
+ }
+}
+
+static void
+iscsi_portal_grp_info_json(struct spdk_iscsi_portal_grp *pg,
+ struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_portal *portal;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_int32(w, "tag", pg->tag);
+
+ spdk_json_write_named_array_begin(w, "portals");
+ TAILQ_FOREACH(portal, &pg->head, per_pg_tailq) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "host", portal->host);
+ spdk_json_write_named_string(w, "port", portal->port);
+
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+iscsi_portal_grp_config_json(struct spdk_iscsi_portal_grp *pg,
+ struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "iscsi_create_portal_group");
+
+ spdk_json_write_name(w, "params");
+ iscsi_portal_grp_info_json(pg, w);
+
+ spdk_json_write_object_end(w);
+}
+
+void
+iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_portal_grp *pg;
+
+ TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) {
+ iscsi_portal_grp_info_json(pg, w);
+ }
+}
+
+void
+iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_portal_grp *pg;
+
+ TAILQ_FOREACH(pg, &g_iscsi.pg_head, tailq) {
+ iscsi_portal_grp_config_json(pg, w);
+ }
+}
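The config text emitted above documents the "Portal <Name> <IP address>[:<port>]" syntax, and iscsi_parse_portal() is what splits such a string: a bracketed IPv6 address keeps its colons, an IPv4 address is cut at the first ':', and a missing port falls back to 3260. A minimal standalone sketch of that split (illustration only, not SPDK code; error handling is omitted):

#include <stdio.h>
#include <string.h>

/* Split "<host>[:<port>]" the same way iscsi_parse_portal() does. */
static void split_portal(const char *portal, char *host, size_t hlen,
			 char *port, size_t plen)
{
	const char *p;

	if (portal[0] == '[') {			/* IPv6: keep brackets, skip past ']' */
		p = strchr(portal + 1, ']');
		p = (p != NULL) ? p + 1 : portal + strlen(portal);
	} else {				/* IPv4: cut at the first ':' */
		p = strchr(portal, ':');
		if (p == NULL) {
			p = portal + strlen(portal);
		}
	}

	snprintf(host, hlen, "%.*s", (int)(p - portal), portal);
	snprintf(port, plen, "%s", (*p == ':') ? p + 1 : "3260");
}

int main(void)
{
	char host[64], port[16];

	split_portal("[fe80::1]:860", host, sizeof(host), port, sizeof(port));
	printf("%s %s\n", host, port);	/* prints "[fe80::1] 860" */

	split_portal("10.0.0.1", host, sizeof(host), port, sizeof(port));
	printf("%s %s\n", host, port);	/* prints "10.0.0.1 3260" */
	return 0;
}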
diff --git a/src/spdk/lib/iscsi/portal_grp.h b/src/spdk/lib/iscsi/portal_grp.h
new file mode 100644
index 000000000..7ac72e36c
--- /dev/null
+++ b/src/spdk/lib/iscsi/portal_grp.h
@@ -0,0 +1,90 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_PORTAL_GRP_H
+#define SPDK_PORTAL_GRP_H
+
+#include "spdk/conf.h"
+#include "spdk/cpuset.h"
+#include "iscsi/iscsi.h"
+
+struct spdk_json_write_ctx;
+
+struct spdk_iscsi_portal {
+ struct spdk_iscsi_portal_grp *group;
+ char host[MAX_PORTAL_ADDR + 1];
+ char port[MAX_PORTAL_PORT + 1];
+ struct spdk_sock *sock;
+ struct spdk_poller *acceptor_poller;
+ TAILQ_ENTRY(spdk_iscsi_portal) per_pg_tailq;
+ TAILQ_ENTRY(spdk_iscsi_portal) g_tailq;
+};
+
+struct spdk_iscsi_portal_grp {
+ int ref;
+ int tag;
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int32_t chap_group;
+ TAILQ_ENTRY(spdk_iscsi_portal_grp) tailq;
+ TAILQ_HEAD(, spdk_iscsi_portal) head;
+};
+
+/* SPDK iSCSI Portal Group management API */
+
+struct spdk_iscsi_portal *iscsi_portal_create(const char *host, const char *port);
+void iscsi_portal_destroy(struct spdk_iscsi_portal *p);
+
+struct spdk_iscsi_portal_grp *iscsi_portal_grp_create(int tag);
+void iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg,
+ struct spdk_iscsi_portal *p);
+void iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg);
+void iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg);
+int iscsi_parse_portal_grps(void);
+void iscsi_portal_grps_destroy(void);
+int iscsi_portal_grp_register(struct spdk_iscsi_portal_grp *pg);
+struct spdk_iscsi_portal_grp *iscsi_portal_grp_unregister(int tag);
+struct spdk_iscsi_portal_grp *iscsi_portal_grp_find_by_tag(int tag);
+int iscsi_portal_grp_open(struct spdk_iscsi_portal_grp *pg);
+int iscsi_portal_grp_set_chap_params(struct spdk_iscsi_portal_grp *pg,
+ bool disable_chap, bool require_chap,
+ bool mutual_chap, int32_t chap_group);
+
+void iscsi_portal_grp_close_all(void);
+void iscsi_portal_grps_config_text(FILE *fp);
+void iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w);
+void iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w);
+
+#endif /* SPDK_PORTAL_GRP_H */
diff --git a/src/spdk/lib/iscsi/spdk_iscsi.map b/src/spdk/lib/iscsi/spdk_iscsi.map
new file mode 100644
index 000000000..0475a800d
--- /dev/null
+++ b/src/spdk/lib/iscsi/spdk_iscsi.map
@@ -0,0 +1,11 @@
+{
+ global:
+
+ # Functions used by other SPDK libraries
+ spdk_iscsi_init;
+ spdk_iscsi_fini;
+ spdk_iscsi_config_text;
+ spdk_iscsi_config_json;
+
+ local: *;
+};
diff --git a/src/spdk/lib/iscsi/task.c b/src/spdk/lib/iscsi/task.c
new file mode 100644
index 000000000..964621178
--- /dev/null
+++ b/src/spdk/lib/iscsi/task.c
@@ -0,0 +1,98 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/env.h"
+#include "spdk/log.h"
+#include "iscsi/conn.h"
+#include "iscsi/task.h"
+
+static void
+iscsi_task_free(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_iscsi_task *task = iscsi_task_from_scsi_task(scsi_task);
+
+ if (task->parent) {
+ if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) {
+ assert(task->conn->data_in_cnt > 0);
+ task->conn->data_in_cnt--;
+ }
+
+ spdk_scsi_task_put(&task->parent->scsi);
+ task->parent = NULL;
+ }
+
+ iscsi_task_disassociate_pdu(task);
+ assert(task->conn->pending_task_cnt > 0);
+ task->conn->pending_task_cnt--;
+ spdk_mempool_put(g_iscsi.task_pool, (void *)task);
+}
+
+struct spdk_iscsi_task *
+iscsi_task_get(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *parent,
+ spdk_scsi_task_cpl cpl_fn)
+{
+ struct spdk_iscsi_task *task;
+
+ task = spdk_mempool_get(g_iscsi.task_pool);
+ if (!task) {
+ SPDK_ERRLOG("Unable to get task\n");
+ abort();
+ }
+
+ assert(conn != NULL);
+ memset(task, 0, sizeof(*task));
+ task->conn = conn;
+ assert(conn->pending_task_cnt < UINT32_MAX);
+ conn->pending_task_cnt++;
+ spdk_scsi_task_construct(&task->scsi,
+ cpl_fn,
+ iscsi_task_free);
+ if (parent) {
+ parent->scsi.ref++;
+ task->parent = parent;
+ task->tag = parent->tag;
+ task->lun_id = parent->lun_id;
+ task->scsi.dxfer_dir = parent->scsi.dxfer_dir;
+ task->scsi.transfer_len = parent->scsi.transfer_len;
+ task->scsi.lun = parent->scsi.lun;
+ task->scsi.cdb = parent->scsi.cdb;
+ task->scsi.target_port = parent->scsi.target_port;
+ task->scsi.initiator_port = parent->scsi.initiator_port;
+ if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) {
+ conn->data_in_cnt++;
+ }
+ }
+
+ return task;
+}
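iscsi_task_get() above ties a subtask to its primary task by bumping the parent's SCSI task reference (parent->scsi.ref++), and iscsi_task_free() drops that reference again via spdk_scsi_task_put(). A minimal standalone sketch of the same parent/child refcount pattern (hypothetical types, not the SPDK API):

#include <assert.h>
#include <stdlib.h>

struct ref_task {
	int ref;
	struct ref_task *parent;
};

/* Allocate a task; a child takes one reference on its parent. */
static struct ref_task *task_alloc(struct ref_task *parent)
{
	struct ref_task *t = calloc(1, sizeof(*t));

	if (t == NULL) {
		return NULL;
	}
	t->ref = 1;
	t->parent = parent;
	if (parent != NULL) {
		parent->ref++;		/* mirrors parent->scsi.ref++ above */
	}
	return t;
}

/* Drop a reference; freeing a child releases its reference on the parent. */
static void task_put(struct ref_task *t)
{
	assert(t->ref > 0);
	if (--t->ref == 0) {
		if (t->parent != NULL) {
			task_put(t->parent);	/* mirrors spdk_scsi_task_put(&parent->scsi) */
		}
		free(t);
	}
}

int main(void)
{
	struct ref_task *primary = task_alloc(NULL);
	struct ref_task *sub = task_alloc(primary);

	task_put(primary);	/* primary stays alive: sub still references it */
	task_put(sub);		/* frees sub, then the primary */
	return 0;
}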
diff --git a/src/spdk/lib/iscsi/task.h b/src/spdk/lib/iscsi/task.h
new file mode 100644
index 000000000..0ef48599a
--- /dev/null
+++ b/src/spdk/lib/iscsi/task.h
@@ -0,0 +1,188 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ISCSI_TASK_H
+#define SPDK_ISCSI_TASK_H
+
+#include "iscsi/iscsi.h"
+#include "spdk/scsi.h"
+#include "spdk/util.h"
+
+struct spdk_iscsi_task {
+ struct spdk_scsi_task scsi;
+
+ struct spdk_iscsi_task *parent;
+
+ uint8_t rsp_scsi_status;
+ uint8_t rsp_sense_data[32];
+ size_t rsp_sense_data_len;
+
+ struct spdk_iscsi_conn *conn;
+ struct spdk_iscsi_pdu *pdu;
+ uint32_t outstanding_r2t;
+
+ uint32_t desired_data_transfer_length;
+
+ /* Only valid for Read/Write */
+ uint32_t bytes_completed;
+
+ uint32_t data_out_cnt;
+
+ /*
+ * Tracks the current offset of a large read I/O.
+ */
+ uint32_t current_datain_offset;
+
+ /*
+ * next_expected_r2t_offset is used when we receive
+ * the DataOUT PDU.
+ */
+ uint32_t next_expected_r2t_offset;
+
+ /*
+ * Tracks the length of the R2T that is in progress.
+ * Used to check that an R2T burst does not exceed
+ * MaxBurstLength.
+ */
+ uint32_t current_r2t_length;
+
+ /*
+ * next_r2t_offset is used when sending an R2T packet
+ * to keep track of the next R2T offset.
+ */
+ uint32_t next_r2t_offset;
+ uint32_t R2TSN;
+ uint32_t r2t_datasn; /* record next datasn for a r2tsn */
+ uint32_t acked_r2tsn; /* next r2tsn to be acked */
+ uint32_t datain_datasn;
+ uint32_t acked_data_sn; /* next expected datain datasn */
+ uint32_t ttt;
+ bool is_r2t_active;
+
+ uint32_t tag;
+
+ /**
+ * Record the LUN ID in case the LUN becomes invalid,
+ * which can happen when the LUN is hot removed.
+ */
+ int lun_id;
+
+ struct spdk_poller *mgmt_poller;
+
+ TAILQ_ENTRY(spdk_iscsi_task) link;
+
+ TAILQ_HEAD(subtask_list, spdk_iscsi_task) subtask_list;
+ TAILQ_ENTRY(spdk_iscsi_task) subtask_link;
+ bool is_queued; /* is queued in scsi layer for handling */
+};
+
+static inline void
+iscsi_task_put(struct spdk_iscsi_task *task)
+{
+ spdk_scsi_task_put(&task->scsi);
+}
+
+static inline struct spdk_iscsi_pdu *
+iscsi_task_get_pdu(struct spdk_iscsi_task *task)
+{
+ return task->pdu;
+}
+
+static inline void
+iscsi_task_set_pdu(struct spdk_iscsi_task *task, struct spdk_iscsi_pdu *pdu)
+{
+ task->pdu = pdu;
+}
+
+static inline struct iscsi_bhs *
+iscsi_task_get_bhs(struct spdk_iscsi_task *task)
+{
+ return &iscsi_task_get_pdu(task)->bhs;
+}
+
+static inline void
+iscsi_task_associate_pdu(struct spdk_iscsi_task *task, struct spdk_iscsi_pdu *pdu)
+{
+ iscsi_task_set_pdu(task, pdu);
+ pdu->ref++;
+}
+
+static inline void
+iscsi_task_disassociate_pdu(struct spdk_iscsi_task *task)
+{
+ if (iscsi_task_get_pdu(task)) {
+ iscsi_put_pdu(iscsi_task_get_pdu(task));
+ iscsi_task_set_pdu(task, NULL);
+ }
+}
+
+static inline int
+iscsi_task_is_immediate(struct spdk_iscsi_task *task)
+{
+ struct iscsi_bhs_scsi_req *scsi_req;
+
+ scsi_req = (struct iscsi_bhs_scsi_req *)iscsi_task_get_bhs(task);
+ return (scsi_req->immediate == 1);
+}
+
+static inline int
+iscsi_task_is_read(struct spdk_iscsi_task *task)
+{
+ struct iscsi_bhs_scsi_req *scsi_req;
+
+ scsi_req = (struct iscsi_bhs_scsi_req *)iscsi_task_get_bhs(task);
+ return (scsi_req->read_bit == 1);
+}
+
+struct spdk_iscsi_task *iscsi_task_get(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_task *parent,
+ spdk_scsi_task_cpl cpl_fn);
+
+static inline struct spdk_iscsi_task *
+iscsi_task_from_scsi_task(struct spdk_scsi_task *task)
+{
+ return SPDK_CONTAINEROF(task, struct spdk_iscsi_task, scsi);
+}
+
+static inline struct spdk_iscsi_task *
+iscsi_task_get_primary(struct spdk_iscsi_task *task)
+{
+ if (task->parent) {
+ return task->parent;
+ } else {
+ return task;
+ }
+}
+
+#endif /* SPDK_ISCSI_TASK_H */
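iscsi_task_from_scsi_task() above recovers the enclosing spdk_iscsi_task from its embedded spdk_scsi_task member via SPDK_CONTAINEROF. A standalone sketch of the offsetof arithmetic that this kind of container-of macro builds on (generic illustration, not the SPDK macro itself):

#include <stddef.h>
#include <stdio.h>

struct inner { int x; };

struct outer {
	int tag;
	struct inner member;
};

/* Recover the outer struct from a pointer to one of its members. */
#define CONTAINER_OF(ptr, type, field) \
	((type *)((char *)(ptr) - offsetof(type, field)))

int main(void)
{
	struct outer o = { .tag = 42, .member = { .x = 7 } };
	struct inner *ip = &o.member;
	struct outer *op = CONTAINER_OF(ip, struct outer, member);

	printf("%d\n", op->tag);	/* prints 42 */
	return 0;
}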
diff --git a/src/spdk/lib/iscsi/tgt_node.c b/src/spdk/lib/iscsi/tgt_node.c
new file mode 100644
index 000000000..0807a3384
--- /dev/null
+++ b/src/spdk/lib/iscsi/tgt_node.c
@@ -0,0 +1,1607 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/conf.h"
+#include "spdk/sock.h"
+#include "spdk/scsi.h"
+
+#include "spdk_internal/log.h"
+
+#include "iscsi/iscsi.h"
+#include "iscsi/conn.h"
+#include "iscsi/tgt_node.h"
+#include "iscsi/portal_grp.h"
+#include "iscsi/init_grp.h"
+#include "iscsi/task.h"
+
+#define MAX_TMPBUF 4096
+#define MAX_MASKBUF 128
+
+static bool
+iscsi_ipv6_netmask_allow_addr(const char *netmask, const char *addr)
+{
+ struct in6_addr in6_mask;
+ struct in6_addr in6_addr;
+ char mask[MAX_MASKBUF];
+ const char *p;
+ size_t n;
+ int bits, bmask;
+ int i;
+
+ if (netmask[0] != '[') {
+ return false;
+ }
+ p = strchr(netmask, ']');
+ if (p == NULL) {
+ return false;
+ }
+ n = p - (netmask + 1);
+ if (n + 1 > sizeof mask) {
+ return false;
+ }
+
+ memcpy(mask, netmask + 1, n);
+ mask[n] = '\0';
+ p++;
+
+ if (p[0] == '/') {
+ bits = (int) strtol(p + 1, NULL, 10);
+ if (bits <= 0 || bits > 128) {
+ return false;
+ }
+ } else {
+ bits = 128;
+ }
+
+#if 0
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "input %s\n", addr);
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "mask %s / %d\n", mask, bits);
+#endif
+
+ /* presentation to network order binary */
+ if (inet_pton(AF_INET6, mask, &in6_mask) <= 0
+ || inet_pton(AF_INET6, addr, &in6_addr) <= 0) {
+ return false;
+ }
+
+ /* check 128bits */
+ for (i = 0; i < (bits / 8); i++) {
+ if (in6_mask.s6_addr[i] != in6_addr.s6_addr[i]) {
+ return false;
+ }
+ }
+ if (bits % 8) {
+ bmask = (0xffU << (8 - (bits % 8))) & 0xffU;
+ if ((in6_mask.s6_addr[i] & bmask) != (in6_addr.s6_addr[i] & bmask)) {
+ return false;
+ }
+ }
+
+ /* match */
+ return true;
+}
+
+static bool
+iscsi_ipv4_netmask_allow_addr(const char *netmask, const char *addr)
+{
+ struct in_addr in4_mask;
+ struct in_addr in4_addr;
+ char mask[MAX_MASKBUF];
+ const char *p;
+ uint32_t bmask;
+ size_t n;
+ int bits;
+
+ p = strchr(netmask, '/');
+ if (p == NULL) {
+ p = netmask + strlen(netmask);
+ }
+ n = p - netmask;
+ if (n + 1 > sizeof mask) {
+ return false;
+ }
+
+ memcpy(mask, netmask, n);
+ mask[n] = '\0';
+
+ if (p[0] == '/') {
+ bits = (int) strtol(p + 1, NULL, 10);
+ if (bits <= 0 || bits > 32) {
+ return false;
+ }
+ } else {
+ bits = 32;
+ }
+
+ /* presentation to network order binary */
+ if (inet_pton(AF_INET, mask, &in4_mask) <= 0
+ || inet_pton(AF_INET, addr, &in4_addr) <= 0) {
+ return false;
+ }
+
+ /* check 32bits */
+ bmask = (0xffffffffU << (32 - bits)) & 0xffffffffU;
+ if ((ntohl(in4_mask.s_addr) & bmask) != (ntohl(in4_addr.s_addr) & bmask)) {
+ return false;
+ }
+
+ /* match */
+ return true;
+}
+
+static bool
+iscsi_netmask_allow_addr(const char *netmask, const char *addr)
+{
+ if (netmask == NULL || addr == NULL) {
+ return false;
+ }
+ if (strcasecmp(netmask, "ANY") == 0) {
+ return true;
+ }
+ if (netmask[0] == '[') {
+ /* IPv6 */
+ if (iscsi_ipv6_netmask_allow_addr(netmask, addr)) {
+ return true;
+ }
+ } else {
+ /* IPv4 */
+ if (iscsi_ipv4_netmask_allow_addr(netmask, addr)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool
+iscsi_init_grp_allow_addr(struct spdk_iscsi_init_grp *igp,
+ const char *addr)
+{
+ struct spdk_iscsi_initiator_netmask *imask;
+
+ TAILQ_FOREACH(imask, &igp->netmask_head, tailq) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "netmask=%s, addr=%s\n",
+ imask->mask, addr);
+ if (iscsi_netmask_allow_addr(imask->mask, addr)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static int
+iscsi_init_grp_allow_iscsi_name(struct spdk_iscsi_init_grp *igp,
+ const char *iqn, bool *result)
+{
+ struct spdk_iscsi_initiator_name *iname;
+
+ TAILQ_FOREACH(iname, &igp->initiator_head, tailq) {
+ /* denied if iqn is matched */
+ if ((iname->name[0] == '!')
+ && (strcasecmp(&iname->name[1], "ANY") == 0
+ || strcasecmp(&iname->name[1], iqn) == 0)) {
+ *result = false;
+ return 0;
+ }
+ /* allowed if iqn is matched */
+ if (strcasecmp(iname->name, "ANY") == 0
+ || strcasecmp(iname->name, iqn) == 0) {
+ *result = true;
+ return 0;
+ }
+ }
+ return -1;
+}
+
+static struct spdk_iscsi_pg_map *
+iscsi_tgt_node_find_pg_map(struct spdk_iscsi_tgt_node *target,
+ struct spdk_iscsi_portal_grp *pg);
+
+bool
+iscsi_tgt_node_access(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_tgt_node *target, const char *iqn, const char *addr)
+{
+ struct spdk_iscsi_portal_grp *pg;
+ struct spdk_iscsi_pg_map *pg_map;
+ struct spdk_iscsi_ig_map *ig_map;
+ int rc;
+ bool allowed = false;
+
+ if (conn == NULL || target == NULL || iqn == NULL || addr == NULL) {
+ return false;
+ }
+ pg = conn->portal->group;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pg=%d, iqn=%s, addr=%s\n",
+ pg->tag, iqn, addr);
+ pg_map = iscsi_tgt_node_find_pg_map(target, pg);
+ if (pg_map == NULL) {
+ return false;
+ }
+ TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) {
+ rc = iscsi_init_grp_allow_iscsi_name(ig_map->ig, iqn, &allowed);
+ if (rc == 0) {
+ if (allowed == false) {
+ goto denied;
+ } else {
+ if (iscsi_init_grp_allow_addr(ig_map->ig, addr)) {
+ return true;
+ }
+ }
+ } else {
+ /* netmask is denied in this initiator group */
+ }
+ }
+
+denied:
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "access denied from %s (%s) to %s (%s:%s,%d)\n",
+ iqn, addr, target->name, conn->portal_host,
+ conn->portal_port, conn->pg_tag);
+ return false;
+}
+
+static bool
+iscsi_tgt_node_allow_iscsi_name(struct spdk_iscsi_tgt_node *target, const char *iqn)
+{
+ struct spdk_iscsi_pg_map *pg_map;
+ struct spdk_iscsi_ig_map *ig_map;
+ int rc;
+ bool result = false;
+
+ if (target == NULL || iqn == NULL) {
+ return false;
+ }
+
+ TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) {
+ TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) {
+ rc = iscsi_init_grp_allow_iscsi_name(ig_map->ig, iqn, &result);
+ if (rc == 0) {
+ return result;
+ }
+ }
+ }
+
+ return false;
+}
+
+int
+iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn,
+ const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len,
+ int data_len)
+{
+ char buf[MAX_TMPBUF];
+ struct spdk_iscsi_portal_grp *pg;
+ struct spdk_iscsi_pg_map *pg_map;
+ struct spdk_iscsi_portal *p;
+ struct spdk_iscsi_tgt_node *target;
+ char *host;
+ int total;
+ int len;
+ int rc;
+
+ if (conn == NULL) {
+ return 0;
+ }
+
+ total = data_len;
+ if (alloc_len < 1) {
+ return 0;
+ }
+ if (total >= alloc_len) {
+ total = alloc_len;
+ data[total - 1] = '\0';
+ return total;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) {
+ if (strcasecmp(tiqn, "ALL") != 0
+ && strcasecmp(tiqn, target->name) != 0) {
+ continue;
+ }
+ rc = iscsi_tgt_node_allow_iscsi_name(target, iiqn);
+ if (rc == 0) {
+ continue;
+ }
+
+ /* DO SENDTARGETS */
+ len = snprintf((char *) data + total, alloc_len - total,
+ "TargetName=%s", target->name);
+ total += len + 1;
+
+ /* write to data */
+ TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) {
+ pg = pg_map->pg;
+ TAILQ_FOREACH(p, &pg->head, per_pg_tailq) {
+ if (alloc_len - total < 1) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ /* TODO: support long text responses */
+ SPDK_ERRLOG("SPDK does not support long text responses yet; "
+ "use a larger MaxRecvDataSegmentLength "
+ "value in the initiator\n");
+ return alloc_len;
+ }
+ host = p->host;
+ /* wildcard? */
+ if (strcasecmp(host, "[::]") == 0
+ || strcasecmp(host, "0.0.0.0") == 0) {
+ if (spdk_sock_is_ipv6(conn->sock)) {
+ snprintf(buf, sizeof buf, "[%s]",
+ conn->target_addr);
+ host = buf;
+ } else if (spdk_sock_is_ipv4(conn->sock)) {
+ snprintf(buf, sizeof buf, "%s",
+ conn->target_addr);
+ host = buf;
+ } else {
+ /* skip portal for the family */
+ continue;
+ }
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI,
+ "TargetAddress=%s:%s,%d\n",
+ host, p->port, pg->tag);
+ len = snprintf((char *) data + total,
+ alloc_len - total,
+ "TargetAddress=%s:%s,%d",
+ host, p->port, pg->tag);
+ total += len + 1;
+ }
+ }
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ return total;
+}
+
+struct spdk_iscsi_tgt_node *
+iscsi_find_tgt_node(const char *target_name)
+{
+ struct spdk_iscsi_tgt_node *target;
+
+ if (target_name == NULL) {
+ return NULL;
+ }
+ TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) {
+ if (strcasecmp(target_name, target->name) == 0) {
+ return target;
+ }
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "can't find target %s\n", target_name);
+ return NULL;
+}
+
+static int
+iscsi_tgt_node_register(struct spdk_iscsi_tgt_node *target)
+{
+ pthread_mutex_lock(&g_iscsi.mutex);
+
+ if (iscsi_find_tgt_node(target->name) != NULL) {
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return -EEXIST;
+ }
+
+ TAILQ_INSERT_TAIL(&g_iscsi.target_head, target, tailq);
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return 0;
+}
+
+static int
+iscsi_tgt_node_unregister(struct spdk_iscsi_tgt_node *target)
+{
+ struct spdk_iscsi_tgt_node *t;
+
+ TAILQ_FOREACH(t, &g_iscsi.target_head, tailq) {
+ if (t == target) {
+ TAILQ_REMOVE(&g_iscsi.target_head, t, tailq);
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+static struct spdk_iscsi_ig_map *
+iscsi_pg_map_find_ig_map(struct spdk_iscsi_pg_map *pg_map,
+ struct spdk_iscsi_init_grp *ig)
+{
+ struct spdk_iscsi_ig_map *ig_map;
+
+ TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) {
+ if (ig_map->ig == ig) {
+ return ig_map;
+ }
+ }
+
+ return NULL;
+}
+
+static struct spdk_iscsi_ig_map *
+iscsi_pg_map_add_ig_map(struct spdk_iscsi_pg_map *pg_map,
+ struct spdk_iscsi_init_grp *ig)
+{
+ struct spdk_iscsi_ig_map *ig_map;
+
+ if (iscsi_pg_map_find_ig_map(pg_map, ig) != NULL) {
+ return NULL;
+ }
+
+ ig_map = malloc(sizeof(*ig_map));
+ if (ig_map == NULL) {
+ return NULL;
+ }
+
+ ig_map->ig = ig;
+ ig->ref++;
+ pg_map->num_ig_maps++;
+ TAILQ_INSERT_TAIL(&pg_map->ig_map_head, ig_map, tailq);
+
+ return ig_map;
+}
+
+static void
+_iscsi_pg_map_delete_ig_map(struct spdk_iscsi_pg_map *pg_map,
+ struct spdk_iscsi_ig_map *ig_map)
+{
+ TAILQ_REMOVE(&pg_map->ig_map_head, ig_map, tailq);
+ pg_map->num_ig_maps--;
+ ig_map->ig->ref--;
+ free(ig_map);
+}
+
+static int
+iscsi_pg_map_delete_ig_map(struct spdk_iscsi_pg_map *pg_map,
+ struct spdk_iscsi_init_grp *ig)
+{
+ struct spdk_iscsi_ig_map *ig_map;
+
+ ig_map = iscsi_pg_map_find_ig_map(pg_map, ig);
+ if (ig_map == NULL) {
+ return -ENOENT;
+ }
+
+ _iscsi_pg_map_delete_ig_map(pg_map, ig_map);
+ return 0;
+}
+
+static void
+iscsi_pg_map_delete_all_ig_maps(struct spdk_iscsi_pg_map *pg_map)
+{
+ struct spdk_iscsi_ig_map *ig_map, *tmp;
+
+ TAILQ_FOREACH_SAFE(ig_map, &pg_map->ig_map_head, tailq, tmp) {
+ _iscsi_pg_map_delete_ig_map(pg_map, ig_map);
+ }
+}
+
+static struct spdk_iscsi_pg_map *
+iscsi_tgt_node_find_pg_map(struct spdk_iscsi_tgt_node *target,
+ struct spdk_iscsi_portal_grp *pg)
+{
+ struct spdk_iscsi_pg_map *pg_map;
+
+ TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) {
+ if (pg_map->pg == pg) {
+ return pg_map;
+ }
+ }
+
+ return NULL;
+}
+
+static struct spdk_iscsi_pg_map *
+iscsi_tgt_node_add_pg_map(struct spdk_iscsi_tgt_node *target,
+ struct spdk_iscsi_portal_grp *pg)
+{
+ struct spdk_iscsi_pg_map *pg_map;
+ char port_name[MAX_TMPBUF];
+ int rc;
+
+ if (iscsi_tgt_node_find_pg_map(target, pg) != NULL) {
+ return NULL;
+ }
+
+ if (target->num_pg_maps >= SPDK_SCSI_DEV_MAX_PORTS) {
+ SPDK_ERRLOG("Number of PG maps is more than allowed (max=%d)\n",
+ SPDK_SCSI_DEV_MAX_PORTS);
+ return NULL;
+ }
+
+ pg_map = malloc(sizeof(*pg_map));
+ if (pg_map == NULL) {
+ return NULL;
+ }
+
+ snprintf(port_name, sizeof(port_name), "%s,t,0x%4.4x",
+ spdk_scsi_dev_get_name(target->dev), pg->tag);
+ rc = spdk_scsi_dev_add_port(target->dev, pg->tag, port_name);
+ if (rc != 0) {
+ free(pg_map);
+ return NULL;
+ }
+
+ TAILQ_INIT(&pg_map->ig_map_head);
+ pg_map->num_ig_maps = 0;
+ pg->ref++;
+ pg_map->pg = pg;
+ target->num_pg_maps++;
+ TAILQ_INSERT_TAIL(&target->pg_map_head, pg_map, tailq);
+
+ return pg_map;
+}
+
+static void
+_iscsi_tgt_node_delete_pg_map(struct spdk_iscsi_tgt_node *target,
+ struct spdk_iscsi_pg_map *pg_map)
+{
+ TAILQ_REMOVE(&target->pg_map_head, pg_map, tailq);
+ target->num_pg_maps--;
+ pg_map->pg->ref--;
+
+ spdk_scsi_dev_delete_port(target->dev, pg_map->pg->tag);
+
+ free(pg_map);
+}
+
+static int
+iscsi_tgt_node_delete_pg_map(struct spdk_iscsi_tgt_node *target,
+ struct spdk_iscsi_portal_grp *pg)
+{
+ struct spdk_iscsi_pg_map *pg_map;
+
+ pg_map = iscsi_tgt_node_find_pg_map(target, pg);
+ if (pg_map == NULL) {
+ return -ENOENT;
+ }
+
+ if (pg_map->num_ig_maps > 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "delete %d ig_maps forcefully\n",
+ pg_map->num_ig_maps);
+ }
+
+ iscsi_pg_map_delete_all_ig_maps(pg_map);
+ _iscsi_tgt_node_delete_pg_map(target, pg_map);
+ return 0;
+}
+
+static void
+iscsi_tgt_node_delete_ig_maps(struct spdk_iscsi_tgt_node *target,
+ struct spdk_iscsi_init_grp *ig)
+{
+ struct spdk_iscsi_pg_map *pg_map, *tmp;
+
+ TAILQ_FOREACH_SAFE(pg_map, &target->pg_map_head, tailq, tmp) {
+ iscsi_pg_map_delete_ig_map(pg_map, ig);
+ if (pg_map->num_ig_maps == 0) {
+ _iscsi_tgt_node_delete_pg_map(target, pg_map);
+ }
+ }
+}
+
+static void
+iscsi_tgt_node_delete_all_pg_maps(struct spdk_iscsi_tgt_node *target)
+{
+ struct spdk_iscsi_pg_map *pg_map, *tmp;
+
+ TAILQ_FOREACH_SAFE(pg_map, &target->pg_map_head, tailq, tmp) {
+ iscsi_pg_map_delete_all_ig_maps(pg_map);
+ _iscsi_tgt_node_delete_pg_map(target, pg_map);
+ }
+}
+
+static void
+_iscsi_tgt_node_destruct(void *cb_arg, int rc)
+{
+ struct spdk_iscsi_tgt_node *target = cb_arg;
+ iscsi_tgt_node_destruct_cb destruct_cb_fn = target->destruct_cb_fn;
+ void *destruct_cb_arg = target->destruct_cb_arg;
+
+ if (rc != 0) {
+ if (destruct_cb_fn) {
+ destruct_cb_fn(destruct_cb_arg, rc);
+ }
+ return;
+ }
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ iscsi_tgt_node_delete_all_pg_maps(target);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ pthread_mutex_destroy(&target->mutex);
+ free(target);
+
+ if (destruct_cb_fn) {
+ destruct_cb_fn(destruct_cb_arg, 0);
+ }
+}
+
+static int
+iscsi_tgt_node_check_active_conns(void *arg)
+{
+ struct spdk_iscsi_tgt_node *target = arg;
+
+ if (iscsi_get_active_conns(target) != 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ spdk_poller_unregister(&target->destruct_poller);
+
+ spdk_scsi_dev_destruct(target->dev, _iscsi_tgt_node_destruct, target);
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+iscsi_tgt_node_destruct(struct spdk_iscsi_tgt_node *target,
+ iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg)
+{
+ if (target == NULL) {
+ if (cb_fn) {
+ cb_fn(cb_arg, -ENOENT);
+ }
+ return;
+ }
+
+ if (target->destructed) {
+ SPDK_ERRLOG("Destructing %s is already started\n", target->name);
+ if (cb_fn) {
+ cb_fn(cb_arg, -EBUSY);
+ }
+ return;
+ }
+
+ target->destructed = true;
+ target->destruct_cb_fn = cb_fn;
+ target->destruct_cb_arg = cb_arg;
+
+ iscsi_conns_request_logout(target);
+
+ if (iscsi_get_active_conns(target) != 0) {
+ target->destruct_poller = SPDK_POLLER_REGISTER(iscsi_tgt_node_check_active_conns,
+ target, 10);
+ } else {
+ spdk_scsi_dev_destruct(target->dev, _iscsi_tgt_node_destruct, target);
+ }
+}
+
+static int
+iscsi_tgt_node_delete_pg_ig_map(struct spdk_iscsi_tgt_node *target,
+ int pg_tag, int ig_tag)
+{
+ struct spdk_iscsi_portal_grp *pg;
+ struct spdk_iscsi_init_grp *ig;
+ struct spdk_iscsi_pg_map *pg_map;
+ struct spdk_iscsi_ig_map *ig_map;
+
+ pg = iscsi_portal_grp_find_by_tag(pg_tag);
+ if (pg == NULL) {
+ SPDK_ERRLOG("%s: PortalGroup%d not found\n", target->name, pg_tag);
+ return -ENOENT;
+ }
+ ig = iscsi_init_grp_find_by_tag(ig_tag);
+ if (ig == NULL) {
+ SPDK_ERRLOG("%s: InitiatorGroup%d not found\n", target->name, ig_tag);
+ return -ENOENT;
+ }
+
+ pg_map = iscsi_tgt_node_find_pg_map(target, pg);
+ if (pg_map == NULL) {
+ SPDK_ERRLOG("%s: PortalGroup%d is not mapped\n", target->name, pg_tag);
+ return -ENOENT;
+ }
+ ig_map = iscsi_pg_map_find_ig_map(pg_map, ig);
+ if (ig_map == NULL) {
+ SPDK_ERRLOG("%s: InitiatorGroup%d is not mapped\n", target->name, pg_tag);
+ return -ENOENT;
+ }
+
+ _iscsi_pg_map_delete_ig_map(pg_map, ig_map);
+ if (pg_map->num_ig_maps == 0) {
+ _iscsi_tgt_node_delete_pg_map(target, pg_map);
+ }
+
+ return 0;
+}
+
+static int
+iscsi_tgt_node_add_pg_ig_map(struct spdk_iscsi_tgt_node *target,
+ int pg_tag, int ig_tag)
+{
+ struct spdk_iscsi_portal_grp *pg;
+ struct spdk_iscsi_pg_map *pg_map;
+ struct spdk_iscsi_init_grp *ig;
+ struct spdk_iscsi_ig_map *ig_map;
+ bool new_pg_map = false;
+
+ pg = iscsi_portal_grp_find_by_tag(pg_tag);
+ if (pg == NULL) {
+ SPDK_ERRLOG("%s: PortalGroup%d not found\n", target->name, pg_tag);
+ return -ENOENT;
+ }
+ ig = iscsi_init_grp_find_by_tag(ig_tag);
+ if (ig == NULL) {
+ SPDK_ERRLOG("%s: InitiatorGroup%d not found\n", target->name, ig_tag);
+ return -ENOENT;
+ }
+
+ /* get existing pg_map or create new pg_map and add it to target */
+ pg_map = iscsi_tgt_node_find_pg_map(target, pg);
+ if (pg_map == NULL) {
+ pg_map = iscsi_tgt_node_add_pg_map(target, pg);
+ if (pg_map == NULL) {
+ goto failed;
+ }
+ new_pg_map = true;
+ }
+
+ /* create new ig_map and add it to pg_map */
+ ig_map = iscsi_pg_map_add_ig_map(pg_map, ig);
+ if (ig_map == NULL) {
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ if (new_pg_map) {
+ _iscsi_tgt_node_delete_pg_map(target, pg_map);
+ }
+
+ return -1;
+}
+
+int
+iscsi_target_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target,
+ int *pg_tag_list, int *ig_tag_list, uint16_t num_maps)
+{
+ uint16_t i;
+ int rc;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ for (i = 0; i < num_maps; i++) {
+ rc = iscsi_tgt_node_add_pg_ig_map(target, pg_tag_list[i],
+ ig_tag_list[i]);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not add map to target\n");
+ goto invalid;
+ }
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return 0;
+
+invalid:
+ for (; i > 0; --i) {
+ iscsi_tgt_node_delete_pg_ig_map(target, pg_tag_list[i - 1],
+ ig_tag_list[i - 1]);
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return -1;
+}
+
+int
+iscsi_target_node_remove_pg_ig_maps(struct spdk_iscsi_tgt_node *target,
+ int *pg_tag_list, int *ig_tag_list, uint16_t num_maps)
+{
+ uint16_t i;
+ int rc;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ for (i = 0; i < num_maps; i++) {
+ rc = iscsi_tgt_node_delete_pg_ig_map(target, pg_tag_list[i],
+ ig_tag_list[i]);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not delete map from target\n");
+ goto invalid;
+ }
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return 0;
+
+invalid:
+ for (; i > 0; --i) {
+ rc = iscsi_tgt_node_add_pg_ig_map(target, pg_tag_list[i - 1],
+ ig_tag_list[i - 1]);
+ if (rc != 0) {
+ iscsi_tgt_node_delete_all_pg_maps(target);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+ return -1;
+}
+
+static int
+check_iscsi_name(const char *name)
+{
+ const unsigned char *up = (const unsigned char *) name;
+ size_t n;
+
+ /* valid iSCSI name no larger than 223 bytes */
+ if (strlen(name) > MAX_TARGET_NAME) {
+ return -1;
+ }
+
+ /* valid iSCSI name? */
+ for (n = 0; up[n] != 0; n++) {
+ if (up[n] > 0x00U && up[n] <= 0x2cU) {
+ return -1;
+ }
+ if (up[n] == 0x2fU) {
+ return -1;
+ }
+ if (up[n] >= 0x3bU && up[n] <= 0x40U) {
+ return -1;
+ }
+ if (up[n] >= 0x5bU && up[n] <= 0x60U) {
+ return -1;
+ }
+ if (up[n] >= 0x7bU && up[n] <= 0x7fU) {
+ return -1;
+ }
+ if (isspace(up[n])) {
+ return -1;
+ }
+ }
+ /* valid format? */
+ if (strncasecmp(name, "iqn.", 4) == 0) {
+ /* iqn.YYYY-MM.reversed.domain.name */
+ if (!isdigit(up[4]) || !isdigit(up[5]) || !isdigit(up[6])
+ || !isdigit(up[7]) || up[8] != '-' || !isdigit(up[9])
+ || !isdigit(up[10]) || up[11] != '.') {
+ SPDK_ERRLOG("invalid iqn format. "
+ "expect \"iqn.YYYY-MM.reversed.domain.name\"\n");
+ return -1;
+ }
+ } else if (strncasecmp(name, "eui.", 4) == 0) {
+ /* EUI-64 -> 16bytes */
+ /* XXX */
+ } else if (strncasecmp(name, "naa.", 4) == 0) {
+ /* 64bit -> 16bytes, 128bit -> 32bytes */
+ /* XXX */
+ }
+ /* OK */
+ return 0;
+}
+
+bool
+iscsi_check_chap_params(bool disable, bool require, bool mutual, int group)
+{
+ if (group < 0) {
+ SPDK_ERRLOG("Invalid auth group ID (%d)\n", group);
+ return false;
+ }
+ if ((!disable && !require && !mutual) || /* Auto */
+ (disable && !require && !mutual) || /* None */
+ (!disable && require && !mutual) || /* CHAP */
+ (!disable && require && mutual)) { /* CHAP Mutual */
+ return true;
+ }
+ SPDK_ERRLOG("Invalid combination of CHAP params (d=%d,r=%d,m=%d)\n",
+ disable, require, mutual);
+ return false;
+}
+
+struct spdk_iscsi_tgt_node *iscsi_tgt_node_construct(int target_index,
+ const char *name, const char *alias,
+ int *pg_tag_list, int *ig_tag_list, uint16_t num_maps,
+ const char *bdev_name_list[], int *lun_id_list, int num_luns,
+ int queue_depth,
+ bool disable_chap, bool require_chap, bool mutual_chap, int chap_group,
+ bool header_digest, bool data_digest)
+{
+ char fullname[MAX_TMPBUF];
+ struct spdk_iscsi_tgt_node *target;
+ int rc;
+
+ if (!iscsi_check_chap_params(disable_chap, require_chap,
+ mutual_chap, chap_group)) {
+ return NULL;
+ }
+
+ if (num_maps == 0) {
+ SPDK_ERRLOG("num_maps = 0\n");
+ return NULL;
+ }
+
+ if (name == NULL) {
+ SPDK_ERRLOG("TargetName not found\n");
+ return NULL;
+ }
+
+ if (strncasecmp(name, "iqn.", 4) != 0
+ && strncasecmp(name, "eui.", 4) != 0
+ && strncasecmp(name, "naa.", 4) != 0) {
+ snprintf(fullname, sizeof(fullname), "%s:%s", g_iscsi.nodebase, name);
+ } else {
+ snprintf(fullname, sizeof(fullname), "%s", name);
+ }
+
+ if (check_iscsi_name(fullname) != 0) {
+ SPDK_ERRLOG("TargetName %s contains an invalid character or format.\n",
+ name);
+ return NULL;
+ }
+
+ target = calloc(1, sizeof(*target));
+ if (!target) {
+ SPDK_ERRLOG("could not allocate target\n");
+ return NULL;
+ }
+
+ rc = pthread_mutex_init(&target->mutex, NULL);
+ if (rc != 0) {
+ SPDK_ERRLOG("tgt_node%d: mutex_init() failed\n", target->num);
+ iscsi_tgt_node_destruct(target, NULL, NULL);
+ return NULL;
+ }
+
+ target->num = target_index;
+
+ memcpy(target->name, fullname, strlen(fullname));
+
+ if (alias != NULL) {
+ if (strlen(alias) > MAX_TARGET_NAME) {
+ iscsi_tgt_node_destruct(target, NULL, NULL);
+ return NULL;
+ }
+ memcpy(target->alias, alias, strlen(alias));
+ }
+
+ target->dev = spdk_scsi_dev_construct(fullname, bdev_name_list, lun_id_list, num_luns,
+ SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI, NULL, NULL);
+ if (!target->dev) {
+ SPDK_ERRLOG("Could not construct SCSI device\n");
+ iscsi_tgt_node_destruct(target, NULL, NULL);
+ return NULL;
+ }
+
+ TAILQ_INIT(&target->pg_map_head);
+ rc = iscsi_target_node_add_pg_ig_maps(target, pg_tag_list,
+ ig_tag_list, num_maps);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not add map to target\n");
+ iscsi_tgt_node_destruct(target, NULL, NULL);
+ return NULL;
+ }
+
+ target->disable_chap = disable_chap;
+ target->require_chap = require_chap;
+ target->mutual_chap = mutual_chap;
+ target->chap_group = chap_group;
+ target->header_digest = header_digest;
+ target->data_digest = data_digest;
+
+ if (queue_depth > 0 && ((uint32_t)queue_depth <= g_iscsi.MaxQueueDepth)) {
+ target->queue_depth = queue_depth;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "QueueDepth %d is invalid and %d is used instead.\n",
+ queue_depth, g_iscsi.MaxQueueDepth);
+ target->queue_depth = g_iscsi.MaxQueueDepth;
+ }
+
+ rc = iscsi_tgt_node_register(target);
+ if (rc != 0) {
+ SPDK_ERRLOG("register target is failed\n");
+ iscsi_tgt_node_destruct(target, NULL, NULL);
+ return NULL;
+ }
+
+ return target;
+}
+
+static int
+iscsi_parse_tgt_node(struct spdk_conf_section *sp)
+{
+ char buf[MAX_TMPBUF];
+ struct spdk_iscsi_tgt_node *target;
+ int pg_tag_list[MAX_TARGET_MAP], ig_tag_list[MAX_TARGET_MAP];
+ int num_target_maps;
+ const char *alias, *pg_tag, *ig_tag;
+ const char *ag_tag;
+ const char *val, *name;
+ int target_num, chap_group, pg_tag_i, ig_tag_i;
+ bool header_digest, data_digest;
+ bool disable_chap, require_chap, mutual_chap;
+ int i;
+ int lun_id_list[SPDK_SCSI_DEV_MAX_LUN];
+ const char *bdev_name_list[SPDK_SCSI_DEV_MAX_LUN];
+ int num_luns, queue_depth;
+
+ target_num = spdk_conf_section_get_num(sp);
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add unit %d\n", target_num);
+
+ data_digest = false;
+ header_digest = false;
+
+ name = spdk_conf_section_get_val(sp, "TargetName");
+
+ if (name == NULL) {
+ SPDK_ERRLOG("tgt_node%d: TargetName not found\n", target_num);
+ return -1;
+ }
+
+ alias = spdk_conf_section_get_val(sp, "TargetAlias");
+
+ /* Setup initiator and portal group mapping */
+ val = spdk_conf_section_get_val(sp, "Mapping");
+ if (val == NULL) {
+ /* no map */
+ SPDK_ERRLOG("tgt_node%d: no Mapping\n", target_num);
+ return -1;
+ }
+
+ for (i = 0; i < MAX_TARGET_MAP; i++) {
+ val = spdk_conf_section_get_nmval(sp, "Mapping", i, 0);
+ if (val == NULL) {
+ break;
+ }
+ pg_tag = spdk_conf_section_get_nmval(sp, "Mapping", i, 0);
+ ig_tag = spdk_conf_section_get_nmval(sp, "Mapping", i, 1);
+ if (pg_tag == NULL || ig_tag == NULL) {
+ SPDK_ERRLOG("tgt_node%d: mapping error\n", target_num);
+ return -1;
+ }
+ if (strncasecmp(pg_tag, "PortalGroup",
+ strlen("PortalGroup")) != 0
+ || sscanf(pg_tag, "%*[^0-9]%d", &pg_tag_i) != 1) {
+ SPDK_ERRLOG("tgt_node%d: mapping portal error\n", target_num);
+ return -1;
+ }
+ if (strncasecmp(ig_tag, "InitiatorGroup",
+ strlen("InitiatorGroup")) != 0
+ || sscanf(ig_tag, "%*[^0-9]%d", &ig_tag_i) != 1) {
+ SPDK_ERRLOG("tgt_node%d: mapping initiator error\n", target_num);
+ return -1;
+ }
+ if (pg_tag_i < 1 || ig_tag_i < 1) {
+ SPDK_ERRLOG("tgt_node%d: invalid group tag\n", target_num);
+ return -1;
+ }
+ pg_tag_list[i] = pg_tag_i;
+ ig_tag_list[i] = ig_tag_i;
+ }
+
+ num_target_maps = i;
+
+ /* Setup AuthMethod */
+ val = spdk_conf_section_get_val(sp, "AuthMethod");
+ disable_chap = false;
+ require_chap = false;
+ mutual_chap = false;
+ if (val != NULL) {
+ for (i = 0; ; i++) {
+ val = spdk_conf_section_get_nmval(sp, "AuthMethod", 0, i);
+ if (val == NULL) {
+ break;
+ }
+ if (strcasecmp(val, "CHAP") == 0) {
+ require_chap = true;
+ } else if (strcasecmp(val, "Mutual") == 0) {
+ mutual_chap = true;
+ } else if (strcasecmp(val, "Auto") == 0) {
+ disable_chap = false;
+ require_chap = false;
+ mutual_chap = false;
+ } else if (strcasecmp(val, "None") == 0) {
+ disable_chap = true;
+ require_chap = false;
+ mutual_chap = false;
+ } else {
+ SPDK_ERRLOG("tgt_node%d: unknown auth\n", target_num);
+ return -1;
+ }
+ }
+ if (mutual_chap && !require_chap) {
+ SPDK_ERRLOG("tgt_node%d: Mutual but not CHAP\n", target_num);
+ return -1;
+ }
+ }
+ if (disable_chap) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod None\n");
+ } else if (!require_chap) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod Auto\n");
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod CHAP %s\n",
+ mutual_chap ? "Mutual" : "");
+ }
+
+ val = spdk_conf_section_get_val(sp, "AuthGroup");
+ if (val == NULL) {
+ chap_group = 0;
+ } else {
+ ag_tag = val;
+ if (strcasecmp(ag_tag, "None") == 0) {
+ chap_group = 0;
+ } else {
+ if (strncasecmp(ag_tag, "AuthGroup",
+ strlen("AuthGroup")) != 0
+ || sscanf(ag_tag, "%*[^0-9]%d", &chap_group) != 1) {
+ SPDK_ERRLOG("tgt_node%d: auth group error\n", target_num);
+ return -1;
+ }
+ if (chap_group == 0) {
+ SPDK_ERRLOG("tgt_node%d: invalid auth group 0\n", target_num);
+ return -1;
+ }
+ }
+ }
+ if (chap_group == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthGroup None\n");
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthGroup AuthGroup%d\n", chap_group);
+ }
+
+ val = spdk_conf_section_get_val(sp, "UseDigest");
+ if (val != NULL) {
+ for (i = 0; ; i++) {
+ val = spdk_conf_section_get_nmval(sp, "UseDigest", 0, i);
+ if (val == NULL) {
+ break;
+ }
+ if (strcasecmp(val, "Header") == 0) {
+ header_digest = true;
+ } else if (strcasecmp(val, "Data") == 0) {
+ data_digest = true;
+ } else if (strcasecmp(val, "Auto") == 0) {
+ header_digest = false;
+ data_digest = false;
+ } else {
+ SPDK_ERRLOG("tgt_node%d: unknown digest\n", target_num);
+ return -1;
+ }
+ }
+ }
+ if (!header_digest && !data_digest) {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "UseDigest Auto\n");
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "UseDigest %s %s\n",
+ header_digest ? "Header" : "",
+ data_digest ? "Data" : "");
+ }
+
+ val = spdk_conf_section_get_val(sp, "QueueDepth");
+ if (val == NULL) {
+ queue_depth = g_iscsi.MaxQueueDepth;
+ } else {
+ queue_depth = (int) strtol(val, NULL, 10);
+ }
+
+ num_luns = 0;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ snprintf(buf, sizeof(buf), "LUN%d", i);
+ val = spdk_conf_section_get_val(sp, buf);
+ if (val == NULL) {
+ continue;
+ }
+
+ bdev_name_list[num_luns] = val;
+ lun_id_list[num_luns] = i;
+ num_luns++;
+ }
+
+ if (num_luns == 0) {
+ SPDK_ERRLOG("tgt_node%d: No LUN specified for target %s.\n", target_num, name);
+ return -1;
+ }
+
+ target = iscsi_tgt_node_construct(target_num, name, alias,
+ pg_tag_list, ig_tag_list, num_target_maps,
+ bdev_name_list, lun_id_list, num_luns, queue_depth,
+ disable_chap, require_chap, mutual_chap, chap_group,
+ header_digest, data_digest);
+
+ if (target == NULL) {
+ SPDK_ERRLOG("tgt_node%d: add_iscsi_target_node error\n", target_num);
+ return -1;
+ }
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i);
+
+ if (lun) {
+ SPDK_INFOLOG(SPDK_LOG_ISCSI, "device %d: LUN%d %s\n",
+ spdk_scsi_dev_get_id(target->dev),
+ spdk_scsi_lun_get_id(lun),
+ spdk_scsi_lun_get_bdev_name(lun));
+ }
+ }
+
+ return 0;
+}
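+
+/*
+ * Illustrative sketch (values are examples only, not from the original
+ * source): iscsi_parse_tgt_node() above consumes a legacy INI-style config
+ * section of roughly this shape:
+ *
+ *   [TargetNode1]
+ *     TargetName disk1
+ *     TargetAlias "Data Disk1"
+ *     Mapping PortalGroup1 InitiatorGroup1
+ *     AuthMethod Auto
+ *     AuthGroup AuthGroup1
+ *     UseDigest Auto
+ *     QueueDepth 64
+ *     LUN0 Malloc0
+ */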
+
+int iscsi_parse_tgt_nodes(void)
+{
+ struct spdk_conf_section *sp;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "iscsi_parse_tgt_nodes\n");
+
+ sp = spdk_conf_first_section(NULL);
+ while (sp != NULL) {
+ if (spdk_conf_section_match_prefix(sp, "TargetNode")) {
+ int tag = spdk_conf_section_get_num(sp);
+
+ if (tag > SPDK_TN_TAG_MAX) {
+ SPDK_ERRLOG("tag %d is invalid\n", tag);
+ return -1;
+ }
+ rc = iscsi_parse_tgt_node(sp);
+ if (rc < 0) {
+ SPDK_ERRLOG("spdk_iscsi_parse_tgt_node() failed\n");
+ return -1;
+ }
+ }
+ sp = spdk_conf_next_section(sp);
+ }
+ return 0;
+}
+
+void
+iscsi_shutdown_tgt_nodes(void)
+{
+ struct spdk_iscsi_tgt_node *target;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ while (!TAILQ_EMPTY(&g_iscsi.target_head)) {
+ target = TAILQ_FIRST(&g_iscsi.target_head);
+ TAILQ_REMOVE(&g_iscsi.target_head, target, tailq);
+
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ iscsi_tgt_node_destruct(target, NULL, NULL);
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+}
+
+void
+iscsi_shutdown_tgt_node_by_name(const char *target_name,
+ iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg)
+{
+ struct spdk_iscsi_tgt_node *target;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ target = iscsi_find_tgt_node(target_name);
+ if (target != NULL) {
+ iscsi_tgt_node_unregister(target);
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ iscsi_tgt_node_destruct(target, cb_fn, cb_arg);
+
+ return;
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+
+ if (cb_fn) {
+ cb_fn(cb_arg, -ENOENT);
+ }
+}
+
+bool
+iscsi_tgt_node_is_destructed(struct spdk_iscsi_tgt_node *target)
+{
+ return target->destructed;
+}
+
+int
+iscsi_tgt_node_cleanup_luns(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_tgt_node *target)
+{
+ int i;
+ struct spdk_iscsi_task *task;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i);
+
+ if (!lun) {
+ continue;
+ }
+
+ /* Create a fake management task per LUN to clean it up. */
+ task = iscsi_task_get(conn, NULL, iscsi_task_mgmt_cpl);
+ if (!task) {
+ SPDK_ERRLOG("Unable to acquire task\n");
+ return -1;
+ }
+
+ task->scsi.target_port = conn->target_port;
+ task->scsi.initiator_port = conn->initiator_port;
+ task->scsi.lun = lun;
+
+ iscsi_op_abort_task_set(task, SPDK_SCSI_TASK_FUNC_LUN_RESET);
+ }
+
+ return 0;
+}
+
+void iscsi_tgt_node_delete_map(struct spdk_iscsi_portal_grp *portal_group,
+ struct spdk_iscsi_init_grp *initiator_group)
+{
+ struct spdk_iscsi_tgt_node *target;
+
+ pthread_mutex_lock(&g_iscsi.mutex);
+ TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) {
+ if (portal_group) {
+ iscsi_tgt_node_delete_pg_map(target, portal_group);
+ }
+ if (initiator_group) {
+ iscsi_tgt_node_delete_ig_maps(target, initiator_group);
+ }
+ }
+ pthread_mutex_unlock(&g_iscsi.mutex);
+}
+
+int
+iscsi_tgt_node_add_lun(struct spdk_iscsi_tgt_node *target,
+ const char *bdev_name, int lun_id)
+{
+ struct spdk_scsi_dev *dev;
+ int rc;
+
+ if (target->num_active_conns > 0) {
+ SPDK_ERRLOG("Target has active connections (count=%d)\n",
+ target->num_active_conns);
+ return -1;
+ }
+
+ if (lun_id < -1 || lun_id >= SPDK_SCSI_DEV_MAX_LUN) {
+ SPDK_ERRLOG("Specified LUN ID (%d) is invalid\n", lun_id);
+ return -1;
+ }
+
+ dev = target->dev;
+ if (dev == NULL) {
+ SPDK_ERRLOG("SCSI device is not found\n");
+ return -1;
+ }
+
+ rc = spdk_scsi_dev_add_lun(dev, bdev_name, lun_id, NULL, NULL);
+ if (rc != 0) {
+ SPDK_ERRLOG("spdk_scsi_dev_add_lun failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+iscsi_tgt_node_set_chap_params(struct spdk_iscsi_tgt_node *target,
+ bool disable_chap, bool require_chap,
+ bool mutual_chap, int32_t chap_group)
+{
+ if (!iscsi_check_chap_params(disable_chap, require_chap,
+ mutual_chap, chap_group)) {
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&target->mutex);
+ target->disable_chap = disable_chap;
+ target->require_chap = require_chap;
+ target->mutual_chap = mutual_chap;
+ target->chap_group = chap_group;
+ pthread_mutex_unlock(&target->mutex);
+
+ return 0;
+}
+
+static const char *target_nodes_section = \
+ "\n"
+ "# Users should change the TargetNode section(s) below to match the\n"
+ "# desired iSCSI target node configuration.\n"
+ "# TargetName, Mapping, LUN0 are minimum required\n";
+
+#define TARGET_NODE_TMPL \
+"[TargetNode%d]\n" \
+" Comment \"Target%d\"\n" \
+" TargetName %s\n" \
+" TargetAlias \"%s\"\n"
+
+#define TARGET_NODE_PGIG_MAPPING_TMPL \
+" Mapping PortalGroup%d InitiatorGroup%d\n"
+
+#define TARGET_NODE_AUTH_TMPL \
+" AuthMethod %s\n" \
+" AuthGroup %s\n" \
+" UseDigest %s\n"
+
+#define TARGET_NODE_QD_TMPL \
+" QueueDepth %d\n\n"
+
+#define TARGET_NODE_LUN_TMPL \
+" LUN%d %s\n"
+
+void
+iscsi_tgt_nodes_config_text(FILE *fp)
+{
+ int l = 0;
+ struct spdk_scsi_dev *dev = NULL;
+ struct spdk_iscsi_tgt_node *target = NULL;
+ struct spdk_iscsi_pg_map *pg_map;
+ struct spdk_iscsi_ig_map *ig_map;
+
+ /* Create target nodes section */
+ fprintf(fp, "%s", target_nodes_section);
+
+ TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) {
+ int idx;
+ const char *authmethod = "None";
+ char authgroup[32] = "None";
+ const char *usedigest = "Auto";
+
+ dev = target->dev;
+ if (NULL == dev) { continue; }
+
+ idx = target->num;
+ fprintf(fp, TARGET_NODE_TMPL, idx, idx, target->name, spdk_scsi_dev_get_name(dev));
+
+ TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) {
+ TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) {
+ fprintf(fp, TARGET_NODE_PGIG_MAPPING_TMPL,
+ pg_map->pg->tag,
+ ig_map->ig->tag);
+ }
+ }
+
+ if (target->disable_chap) {
+ authmethod = "None";
+ } else if (!target->require_chap) {
+ authmethod = "Auto";
+ } else if (target->mutual_chap) {
+ authmethod = "CHAP Mutual";
+ } else {
+ authmethod = "CHAP";
+ }
+
+ if (target->chap_group > 0) {
+ snprintf(authgroup, sizeof(authgroup), "AuthGroup%d", target->chap_group);
+ }
+
+ if (target->header_digest) {
+ usedigest = "Header";
+ } else if (target->data_digest) {
+ usedigest = "Data";
+ }
+
+ fprintf(fp, TARGET_NODE_AUTH_TMPL,
+ authmethod, authgroup, usedigest);
+
+ for (l = 0; l < SPDK_SCSI_DEV_MAX_LUN; l++) {
+ struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(dev, l);
+
+ if (!lun) {
+ continue;
+ }
+
+ fprintf(fp, TARGET_NODE_LUN_TMPL,
+ spdk_scsi_lun_get_id(lun),
+ spdk_scsi_lun_get_bdev_name(lun));
+ }
+
+ fprintf(fp, TARGET_NODE_QD_TMPL,
+ target->queue_depth);
+ }
+}
+
+static void
+iscsi_tgt_node_info_json(struct spdk_iscsi_tgt_node *target,
+ struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_pg_map *pg_map;
+ struct spdk_iscsi_ig_map *ig_map;
+ int i;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "name", target->name);
+
+ if (target->alias[0] != '\0') {
+ spdk_json_write_named_string(w, "alias_name", target->alias);
+ }
+
+ spdk_json_write_named_array_begin(w, "pg_ig_maps");
+ TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) {
+ TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_int32(w, "pg_tag", pg_map->pg->tag);
+ spdk_json_write_named_int32(w, "ig_tag", ig_map->ig->tag);
+ spdk_json_write_object_end(w);
+ }
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_named_array_begin(w, "luns");
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i);
+
+ if (lun) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun));
+ spdk_json_write_named_int32(w, "lun_id", spdk_scsi_lun_get_id(lun));
+ spdk_json_write_object_end(w);
+ }
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_named_int32(w, "queue_depth", target->queue_depth);
+
+ spdk_json_write_named_bool(w, "disable_chap", target->disable_chap);
+ spdk_json_write_named_bool(w, "require_chap", target->require_chap);
+ spdk_json_write_named_bool(w, "mutual_chap", target->mutual_chap);
+ spdk_json_write_named_int32(w, "chap_group", target->chap_group);
+
+ spdk_json_write_named_bool(w, "header_digest", target->header_digest);
+ spdk_json_write_named_bool(w, "data_digest", target->data_digest);
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+iscsi_tgt_node_config_json(struct spdk_iscsi_tgt_node *target,
+ struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "iscsi_create_target_node");
+
+ spdk_json_write_name(w, "params");
+ iscsi_tgt_node_info_json(target, w);
+
+ spdk_json_write_object_end(w);
+}
+
+void
+iscsi_tgt_nodes_info_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_tgt_node *target;
+
+ TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) {
+ iscsi_tgt_node_info_json(target, w);
+ }
+}
+
+void
+iscsi_tgt_nodes_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_iscsi_tgt_node *target;
+
+ TAILQ_FOREACH(target, &g_iscsi.target_head, tailq) {
+ iscsi_tgt_node_config_json(target, w);
+ }
+}
diff --git a/src/spdk/lib/iscsi/tgt_node.h b/src/spdk/lib/iscsi/tgt_node.h
new file mode 100644
index 000000000..2787fac91
--- /dev/null
+++ b/src/spdk/lib/iscsi/tgt_node.h
@@ -0,0 +1,147 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_ISCSI_TGT_NODE_H_
+#define SPDK_ISCSI_TGT_NODE_H_
+
+#include "spdk/stdinc.h"
+
+#include "iscsi/iscsi.h"
+
+struct spdk_iscsi_conn;
+struct spdk_iscsi_init_grp;
+struct spdk_iscsi_portal_grp;
+struct spdk_iscsi_portal;
+struct spdk_json_write_ctx;
+
+#define MAX_TARGET_MAP 256
+#define SPDK_TN_TAG_MAX 0x0000ffff
+
+typedef void (*iscsi_tgt_node_destruct_cb)(void *cb_arg, int rc);
+
+struct spdk_iscsi_ig_map {
+ struct spdk_iscsi_init_grp *ig;
+ TAILQ_ENTRY(spdk_iscsi_ig_map) tailq;
+};
+
+struct spdk_iscsi_pg_map {
+ struct spdk_iscsi_portal_grp *pg;
+ int num_ig_maps;
+ TAILQ_HEAD(, spdk_iscsi_ig_map) ig_map_head;
+ TAILQ_ENTRY(spdk_iscsi_pg_map) tailq;
+};
+
+struct spdk_iscsi_tgt_node {
+ int num;
+ char name[MAX_TARGET_NAME + 1];
+ char alias[MAX_TARGET_NAME + 1];
+
+ pthread_mutex_t mutex;
+
+ bool disable_chap;
+ bool require_chap;
+ bool mutual_chap;
+ int chap_group;
+ bool header_digest;
+ bool data_digest;
+ int queue_depth;
+
+ struct spdk_scsi_dev *dev;
+ /**
+ * Counts number of active iSCSI connections associated with this
+ * target node.
+ */
+ uint32_t num_active_conns;
+ struct spdk_iscsi_poll_group *pg;
+
+ int num_pg_maps;
+ TAILQ_HEAD(, spdk_iscsi_pg_map) pg_map_head;
+ TAILQ_ENTRY(spdk_iscsi_tgt_node) tailq;
+
+ bool destructed;
+ struct spdk_poller *destruct_poller;
+ iscsi_tgt_node_destruct_cb destruct_cb_fn;
+ void *destruct_cb_arg;
+};
+
+int iscsi_parse_tgt_nodes(void);
+
+void iscsi_shutdown_tgt_nodes(void);
+void iscsi_shutdown_tgt_node_by_name(const char *target_name,
+ iscsi_tgt_node_destruct_cb cb_fn, void *cb_arg);
+bool iscsi_tgt_node_is_destructed(struct spdk_iscsi_tgt_node *target);
+int iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn,
+ const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len,
+ int data_len);
+
+/*
+ * bdev_name_list and lun_id_list are equal sized arrays of size num_luns.
+ * bdev_name_list refers to the names of the bdevs that will be used for the LUNs on the
+ * new target node.
+ * lun_id_list refers to the LUN IDs that will be used for the LUNs on the target node.
+ */
+struct spdk_iscsi_tgt_node *iscsi_tgt_node_construct(int target_index,
+ const char *name, const char *alias,
+ int *pg_tag_list, int *ig_tag_list, uint16_t num_maps,
+ const char *bdev_name_list[], int *lun_id_list, int num_luns,
+ int queue_depth,
+ bool disable_chap, bool require_chap, bool mutual_chap, int chap_group,
+ bool header_digest, bool data_digest);
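+/*
+ * Illustrative sketch (argument values are hypothetical): constructing a
+ * single-LUN target node mapped to PortalGroup1/InitiatorGroup1 with the
+ * declaration above:
+ *
+ *   int pg_tags[] = { 1 }, ig_tags[] = { 1 }, lun_ids[] = { 0 };
+ *   const char *bdevs[] = { "Malloc0" };
+ *
+ *   struct spdk_iscsi_tgt_node *t =
+ *           iscsi_tgt_node_construct(1, "iqn.2016-06.io.spdk:disk1", "disk1",
+ *                                    pg_tags, ig_tags, 1, bdevs, lun_ids, 1,
+ *                                    64, false, false, false, 0, false, false);
+ */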
+
+bool iscsi_check_chap_params(bool disable, bool require, bool mutual, int group);
+
+int iscsi_target_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target,
+ int *pg_tag_list, int *ig_tag_list,
+ uint16_t num_maps);
+int iscsi_target_node_remove_pg_ig_maps(struct spdk_iscsi_tgt_node *target,
+ int *pg_tag_list, int *ig_tag_list,
+ uint16_t num_maps);
+
+bool iscsi_tgt_node_access(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_tgt_node *target, const char *iqn,
+ const char *addr);
+struct spdk_iscsi_tgt_node *iscsi_find_tgt_node(const char *target_name);
+int iscsi_tgt_node_cleanup_luns(struct spdk_iscsi_conn *conn,
+ struct spdk_iscsi_tgt_node *target);
+void iscsi_tgt_node_delete_map(struct spdk_iscsi_portal_grp *portal_group,
+ struct spdk_iscsi_init_grp *initiator_group);
+int iscsi_tgt_node_add_lun(struct spdk_iscsi_tgt_node *target,
+ const char *bdev_name, int lun_id);
+int iscsi_tgt_node_set_chap_params(struct spdk_iscsi_tgt_node *target,
+ bool disable_chap, bool require_chap,
+ bool mutual_chap, int32_t chap_group);
+void iscsi_tgt_nodes_config_text(FILE *fp);
+void iscsi_tgt_nodes_info_json(struct spdk_json_write_ctx *w);
+void iscsi_tgt_nodes_config_json(struct spdk_json_write_ctx *w);
+#endif /* SPDK_ISCSI_TGT_NODE_H_ */
diff --git a/src/spdk/lib/json/Makefile b/src/spdk/lib/json/Makefile
new file mode 100644
index 000000000..91cb8868f
--- /dev/null
+++ b/src/spdk/lib/json/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = json_parse.c json_util.c json_write.c
+LIBNAME = json
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_json.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/json/json_parse.c b/src/spdk/lib/json/json_parse.c
new file mode 100644
index 000000000..8639d5ff8
--- /dev/null
+++ b/src/spdk/lib/json/json_parse.c
@@ -0,0 +1,668 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/json.h"
+
+#include "spdk_internal/utf.h"
+
+#define SPDK_JSON_MAX_NESTING_DEPTH 64
+
+static int
+hex_value(uint8_t c)
+{
+#define V(x, y) [x] = y + 1
+ static const int8_t val[256] = {
+ V('0', 0), V('1', 1), V('2', 2), V('3', 3), V('4', 4),
+ V('5', 5), V('6', 6), V('7', 7), V('8', 8), V('9', 9),
+ V('A', 0xA), V('B', 0xB), V('C', 0xC), V('D', 0xD), V('E', 0xE), V('F', 0xF),
+ V('a', 0xA), V('b', 0xB), V('c', 0xC), V('d', 0xD), V('e', 0xE), V('f', 0xF),
+ };
+#undef V
+
+ return val[c] - 1;
+}
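+
+/*
+ * Note (illustrative): the V() macro stores each value plus one so that
+ * characters missing from the table decode to zero and the subtraction
+ * turns them into -1; e.g. hex_value('b') == 0xB, hex_value('g') == -1.
+ */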
+
+static int
+json_decode_string_escape_unicode(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
+{
+ uint8_t *str = *strp;
+ int v0, v1, v2, v3;
+ uint32_t val;
+ uint32_t surrogate_high = 0;
+ int rc;
+decode:
+ /* \uXXXX */
+ assert(buf_end > str);
+
+ if (*str++ != '\\') { return SPDK_JSON_PARSE_INVALID; }
+ if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
+
+ if (*str++ != 'u') { return SPDK_JSON_PARSE_INVALID; }
+ if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
+
+ if ((v3 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; }
+ if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
+
+ if ((v2 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; }
+ if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
+
+ if ((v1 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; }
+ if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
+
+ if ((v0 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; }
+ if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
+
+ val = v0 | (v1 << 4) | (v2 << 8) | (v3 << 12);
+
+ if (surrogate_high) {
+ /* We already parsed the high surrogate, so this should be the low part. */
+ if (!utf16_valid_surrogate_low(val)) {
+ return SPDK_JSON_PARSE_INVALID;
+ }
+
+ /* Convert the UTF-16 surrogate pair into a codepoint and fall through to the UTF-8 encoding below. */
+ val = utf16_decode_surrogate_pair(surrogate_high, val);
+ } else if (utf16_valid_surrogate_high(val)) {
+ surrogate_high = val;
+
+ /*
+ * We parsed a \uXXXX sequence that decoded to the first half of a
+ * UTF-16 surrogate pair, so it must be immediately followed by another
+ * \uXXXX escape.
+ *
+ * Loop around to get the low half of the surrogate pair.
+ */
+ if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; }
+ goto decode;
+ } else if (utf16_valid_surrogate_low(val)) {
+ /*
+ * We found the second half of surrogate pair without the first half;
+ * this is an invalid encoding.
+ */
+ return SPDK_JSON_PARSE_INVALID;
+ }
+
+ /*
+ * Convert Unicode escape (or surrogate pair) to UTF-8 in place.
+ *
+ * This is safe (will not write beyond the buffer) because the \uXXXX sequence is 6 bytes
+ * (or 12 bytes for surrogate pairs), and the longest possible UTF-8 encoding of a
+ * single codepoint is 4 bytes.
+ */
+ if (out) {
+ rc = utf8_encode_unsafe(out, val);
+ } else {
+ rc = utf8_codepoint_len(val);
+ }
+ if (rc < 0) {
+ return SPDK_JSON_PARSE_INVALID;
+ }
+
+ *strp = str; /* update input pointer */
+ return rc; /* return number of bytes decoded */
+}
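+
+/*
+ * Worked example (illustrative): for the input escape "\uD834\uDD1E"
+ * (U+1D11E, MUSICAL SYMBOL G CLEF), the first pass stores the high
+ * surrogate 0xD834, jumps back to the decode label for 0xDD1E, combines
+ * the pair into codepoint 0x1D11E and writes the 4-byte UTF-8 sequence
+ * F0 9D 84 9E, i.e. 12 input bytes become 4 output bytes.
+ */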
+
+static int
+json_decode_string_escape_twochar(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
+{
+ static const uint8_t escapes[256] = {
+ ['b'] = '\b',
+ ['f'] = '\f',
+ ['n'] = '\n',
+ ['r'] = '\r',
+ ['t'] = '\t',
+ ['/'] = '/',
+ ['"'] = '"',
+ ['\\'] = '\\',
+ };
+ uint8_t *str = *strp;
+ uint8_t c;
+
+ assert(buf_end > str);
+ if (buf_end - str < 2) {
+ return SPDK_JSON_PARSE_INCOMPLETE;
+ }
+
+ assert(str[0] == '\\');
+
+ c = escapes[str[1]];
+ if (c) {
+ if (out) {
+ *out = c;
+ }
+ *strp += 2; /* consumed two bytes */
+ return 1; /* produced one byte */
+ }
+
+ return SPDK_JSON_PARSE_INVALID;
+}
+
+/*
+ * Decode JSON string backslash escape.
+ * \param strp pointer to pointer to first character of escape (the backslash).
+ * *strp is also advanced to indicate how much input was consumed.
+ *
+ * \return Number of bytes appended to out
+ */
+static int
+json_decode_string_escape(uint8_t **strp, uint8_t *buf_end, uint8_t *out)
+{
+ int rc;
+
+ rc = json_decode_string_escape_twochar(strp, buf_end, out);
+ if (rc > 0) {
+ return rc;
+ }
+
+ return json_decode_string_escape_unicode(strp, buf_end, out);
+}
+
+/*
+ * Decode JSON string in place.
+ *
+ * \param str_start Pointer to the beginning of the string (the opening " character).
+ *
+ * \return Number of bytes in decoded string (beginning from start).
+ */
+static int
+json_decode_string(uint8_t *str_start, uint8_t *buf_end, uint8_t **str_end, uint32_t flags)
+{
+ uint8_t *str = str_start;
+ uint8_t *out = str_start + 1; /* Decode string in place (skip the initial quote) */
+ int rc;
+
+ if (buf_end - str_start < 2) {
+ /*
+ * Shortest valid string (the empty string) is two bytes (""),
+ * so this can't possibly be valid
+ */
+ *str_end = str;
+ return SPDK_JSON_PARSE_INCOMPLETE;
+ }
+
+ if (*str++ != '"') {
+ *str_end = str;
+ return SPDK_JSON_PARSE_INVALID;
+ }
+
+ while (str < buf_end) {
+ if (str[0] == '"') {
+ /*
+ * End of string.
+ * Update str_end to point at next input byte and return output length.
+ */
+ *str_end = str + 1;
+ return out - str_start - 1;
+ } else if (str[0] == '\\') {
+ rc = json_decode_string_escape(&str, buf_end,
+ flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE ? out : NULL);
+ assert(rc != 0);
+ if (rc < 0) {
+ *str_end = str;
+ return rc;
+ }
+ out += rc;
+ } else if (str[0] <= 0x1f) {
+ /* control characters must be escaped */
+ *str_end = str;
+ return SPDK_JSON_PARSE_INVALID;
+ } else {
+ rc = utf8_valid(str, buf_end);
+ if (rc == 0) {
+ *str_end = str;
+ return SPDK_JSON_PARSE_INCOMPLETE;
+ } else if (rc < 0) {
+ *str_end = str;
+ return SPDK_JSON_PARSE_INVALID;
+ }
+
+ if (out && out != str && (flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE)) {
+ memmove(out, str, rc);
+ }
+ out += rc;
+ str += rc;
+ }
+ }
+
+ /* If execution gets here, we ran out of buffer. */
+ *str_end = str;
+ return SPDK_JSON_PARSE_INCOMPLETE;
+}
+
+static int
+json_valid_number(uint8_t *start, uint8_t *buf_end)
+{
+ uint8_t *p = start;
+ uint8_t c;
+
+ if (p >= buf_end) { return -1; }
+
+ c = *p++;
+ if (c >= '1' && c <= '9') { goto num_int_digits; }
+ if (c == '0') { goto num_frac_or_exp; }
+ if (c == '-') { goto num_int_first_digit; }
+ p--;
+ goto done_invalid;
+
+num_int_first_digit:
+ if (spdk_likely(p != buf_end)) {
+ c = *p++;
+ if (c == '0') { goto num_frac_or_exp; }
+ if (c >= '1' && c <= '9') { goto num_int_digits; }
+ p--;
+ }
+ goto done_invalid;
+
+num_int_digits:
+ if (spdk_likely(p != buf_end)) {
+ c = *p++;
+ if (c >= '0' && c <= '9') { goto num_int_digits; }
+ if (c == '.') { goto num_frac_first_digit; }
+ if (c == 'e' || c == 'E') { goto num_exp_sign; }
+ p--;
+ }
+ goto done_valid;
+
+num_frac_or_exp:
+ if (spdk_likely(p != buf_end)) {
+ c = *p++;
+ if (c == '.') { goto num_frac_first_digit; }
+ if (c == 'e' || c == 'E') { goto num_exp_sign; }
+ p--;
+ }
+ goto done_valid;
+
+num_frac_first_digit:
+ if (spdk_likely(p != buf_end)) {
+ c = *p++;
+ if (c >= '0' && c <= '9') { goto num_frac_digits; }
+ p--;
+ }
+ goto done_invalid;
+
+num_frac_digits:
+ if (spdk_likely(p != buf_end)) {
+ c = *p++;
+ if (c >= '0' && c <= '9') { goto num_frac_digits; }
+ if (c == 'e' || c == 'E') { goto num_exp_sign; }
+ p--;
+ }
+ goto done_valid;
+
+num_exp_sign:
+ if (spdk_likely(p != buf_end)) {
+ c = *p++;
+ if (c >= '0' && c <= '9') { goto num_exp_digits; }
+ if (c == '-' || c == '+') { goto num_exp_first_digit; }
+ p--;
+ }
+ goto done_invalid;
+
+num_exp_first_digit:
+ if (spdk_likely(p != buf_end)) {
+ c = *p++;
+ if (c >= '0' && c <= '9') { goto num_exp_digits; }
+ p--;
+ }
+ goto done_invalid;
+
+num_exp_digits:
+ if (spdk_likely(p != buf_end)) {
+ c = *p++;
+ if (c >= '0' && c <= '9') { goto num_exp_digits; }
+ p--;
+ }
+ goto done_valid;
+
+done_valid:
+ /* Valid end state */
+ return p - start;
+
+done_invalid:
+ /* Invalid end state */
+ if (p == buf_end) {
+ /* Hit the end of the buffer - the stream is incomplete. */
+ return SPDK_JSON_PARSE_INCOMPLETE;
+ }
+
+ /* Found an invalid character in an invalid end state */
+ return SPDK_JSON_PARSE_INVALID;
+}
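+
+/*
+ * Note (illustrative): the state machine above accepts RFC 8259 numbers
+ * such as "0", "-12", "3.14" and "1e-6". A token cut off in a non-final
+ * state (e.g. a lone "-" or "1e" at the end of the buffer) is reported as
+ * SPDK_JSON_PARSE_INCOMPLETE, while an invalid leading character such as
+ * the "." in ".5" yields SPDK_JSON_PARSE_INVALID.
+ */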
+
+static int
+json_valid_comment(const uint8_t *start, const uint8_t *buf_end)
+{
+ const uint8_t *p = start;
+ bool multiline;
+
+ assert(buf_end > p);
+ if (buf_end - p < 2) {
+ return SPDK_JSON_PARSE_INCOMPLETE;
+ }
+
+ if (p[0] != '/') {
+ return SPDK_JSON_PARSE_INVALID;
+ }
+ if (p[1] == '*') {
+ multiline = true;
+ } else if (p[1] == '/') {
+ multiline = false;
+ } else {
+ return SPDK_JSON_PARSE_INVALID;
+ }
+ p += 2;
+
+ if (multiline) {
+ while (p != buf_end - 1) {
+ if (p[0] == '*' && p[1] == '/') {
+ /* Include the terminating star and slash in the comment */
+ return p - start + 2;
+ }
+ p++;
+ }
+ } else {
+ while (p != buf_end) {
+ if (*p == '\r' || *p == '\n') {
+ /* Do not include the line terminator in the comment */
+ return p - start;
+ }
+ p++;
+ }
+ }
+
+ return SPDK_JSON_PARSE_INCOMPLETE;
+}
+
+struct json_literal {
+ enum spdk_json_val_type type;
+ uint32_t len;
+ uint8_t str[8];
+};
+
+/*
+ * JSON only defines 3 possible literals; they can be uniquely identified by bits
+ * 3 and 4 of the first character:
+ * 'f' = 0b11[00]110
+ * 'n' = 0b11[01]110
+ * 't' = 0b11[10]100
+ * These two bits can be used as an index into the g_json_literals array.
+ */
+static const struct json_literal g_json_literals[] = {
+ {SPDK_JSON_VAL_FALSE, 5, "false"},
+ {SPDK_JSON_VAL_NULL, 4, "null"},
+ {SPDK_JSON_VAL_TRUE, 4, "true"},
+ {}
+};
+
+static int
+match_literal(const uint8_t *start, const uint8_t *end, const uint8_t *literal, size_t len)
+{
+ assert(end >= start);
+ if ((size_t)(end - start) < len) {
+ return SPDK_JSON_PARSE_INCOMPLETE;
+ }
+
+ if (memcmp(start, literal, len) != 0) {
+ return SPDK_JSON_PARSE_INVALID;
+ }
+
+ return len;
+}
+
+ssize_t
+spdk_json_parse(void *json, size_t size, struct spdk_json_val *values, size_t num_values,
+ void **end, uint32_t flags)
+{
+ uint8_t *json_end = json + size;
+ enum spdk_json_val_type containers[SPDK_JSON_MAX_NESTING_DEPTH];
+ size_t con_value[SPDK_JSON_MAX_NESTING_DEPTH];
+ enum spdk_json_val_type con_type = SPDK_JSON_VAL_INVALID;
+ bool trailing_comma = false;
+ size_t depth = 0; /* index into containers */
+ size_t cur_value = 0; /* index into values */
+ size_t con_start_value;
+ uint8_t *data = json;
+ uint8_t *new_data;
+ int rc = 0;
+ const struct json_literal *lit;
+ enum {
+ STATE_VALUE, /* initial state */
+ STATE_VALUE_SEPARATOR, /* value separator (comma) */
+ STATE_NAME, /* "name": value */
+ STATE_NAME_SEPARATOR, /* colon */
+ STATE_END, /* parsed the complete value, so only whitespace is valid */
+ } state = STATE_VALUE;
+
+#define ADD_VALUE(t, val_start_ptr, val_end_ptr) \
+ if (values && cur_value < num_values) { \
+ values[cur_value].type = t; \
+ values[cur_value].start = val_start_ptr; \
+ values[cur_value].len = val_end_ptr - val_start_ptr; \
+ } \
+ cur_value++
+
+ while (data < json_end) {
+ uint8_t c = *data;
+
+ switch (c) {
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\n':
+ /* Whitespace is allowed between any tokens. */
+ data++;
+ break;
+
+ case 't':
+ case 'f':
+ case 'n':
+ /* true, false, or null */
+ if (state != STATE_VALUE) { goto done_invalid; }
+ lit = &g_json_literals[(c >> 3) & 3]; /* See comment above g_json_literals[] */
+ assert(lit->str[0] == c);
+ rc = match_literal(data, json_end, lit->str, lit->len);
+ if (rc < 0) { goto done_rc; }
+ ADD_VALUE(lit->type, data, data + rc);
+ data += rc;
+ state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
+ trailing_comma = false;
+ break;
+
+ case '"':
+ if (state != STATE_VALUE && state != STATE_NAME) { goto done_invalid; }
+ rc = json_decode_string(data, json_end, &new_data, flags);
+ if (rc < 0) {
+ data = new_data;
+ goto done_rc;
+ }
+ /*
+ * Start is data + 1 to skip initial quote.
+ * Length is data + rc - 1 to skip both quotes.
+ */
+ ADD_VALUE(state == STATE_VALUE ? SPDK_JSON_VAL_STRING : SPDK_JSON_VAL_NAME,
+ data + 1, data + rc - 1);
+ data = new_data;
+ if (state == STATE_NAME) {
+ state = STATE_NAME_SEPARATOR;
+ } else {
+ state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
+ }
+ trailing_comma = false;
+ break;
+
+ case '-':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ if (state != STATE_VALUE) { goto done_invalid; }
+ rc = json_valid_number(data, json_end);
+ if (rc < 0) { goto done_rc; }
+ ADD_VALUE(SPDK_JSON_VAL_NUMBER, data, data + rc);
+ data += rc;
+ state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
+ trailing_comma = false;
+ break;
+
+ case '{':
+ case '[':
+ if (state != STATE_VALUE) { goto done_invalid; }
+ if (depth == SPDK_JSON_MAX_NESTING_DEPTH) {
+ rc = SPDK_JSON_PARSE_MAX_DEPTH_EXCEEDED;
+ goto done_rc;
+ }
+ if (c == '{') {
+ con_type = SPDK_JSON_VAL_OBJECT_BEGIN;
+ state = STATE_NAME;
+ } else {
+ con_type = SPDK_JSON_VAL_ARRAY_BEGIN;
+ state = STATE_VALUE;
+ }
+ con_value[depth] = cur_value;
+ containers[depth++] = con_type;
+ ADD_VALUE(con_type, data, data + 1);
+ data++;
+ trailing_comma = false;
+ break;
+
+ case '}':
+ case ']':
+ if (trailing_comma) { goto done_invalid; }
+ if (depth == 0) { goto done_invalid; }
+ con_type = containers[--depth];
+ con_start_value = con_value[depth];
+ if (values && con_start_value < num_values) {
+ values[con_start_value].len = cur_value - con_start_value - 1;
+ }
+ if (c == '}') {
+ if (state != STATE_NAME && state != STATE_VALUE_SEPARATOR) {
+ goto done_invalid;
+ }
+ if (con_type != SPDK_JSON_VAL_OBJECT_BEGIN) {
+ goto done_invalid;
+ }
+ ADD_VALUE(SPDK_JSON_VAL_OBJECT_END, data, data + 1);
+ } else {
+ if (state != STATE_VALUE && state != STATE_VALUE_SEPARATOR) {
+ goto done_invalid;
+ }
+ if (con_type != SPDK_JSON_VAL_ARRAY_BEGIN) {
+ goto done_invalid;
+ }
+ ADD_VALUE(SPDK_JSON_VAL_ARRAY_END, data, data + 1);
+ }
+ con_type = depth == 0 ? SPDK_JSON_VAL_INVALID : containers[depth - 1];
+ data++;
+ state = depth ? STATE_VALUE_SEPARATOR : STATE_END;
+ trailing_comma = false;
+ break;
+
+ case ',':
+ if (state != STATE_VALUE_SEPARATOR) { goto done_invalid; }
+ data++;
+ assert(con_type == SPDK_JSON_VAL_ARRAY_BEGIN ||
+ con_type == SPDK_JSON_VAL_OBJECT_BEGIN);
+ state = con_type == SPDK_JSON_VAL_ARRAY_BEGIN ? STATE_VALUE : STATE_NAME;
+ trailing_comma = true;
+ break;
+
+ case ':':
+ if (state != STATE_NAME_SEPARATOR) { goto done_invalid; }
+ data++;
+ state = STATE_VALUE;
+ break;
+
+ case '/':
+ if (!(flags & SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS)) {
+ goto done_invalid;
+ }
+ rc = json_valid_comment(data, json_end);
+ if (rc < 0) { goto done_rc; }
+ /* Skip over comment */
+ data += rc;
+ break;
+
+ default:
+ goto done_invalid;
+ }
+
+ if (state == STATE_END) {
+ break;
+ }
+ }
+
+ if (state == STATE_END) {
+ /* Skip trailing whitespace */
+ while (data < json_end) {
+ uint8_t c = *data;
+
+ if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
+ data++;
+ } else {
+ break;
+ }
+ }
+
+ /*
+ * These asserts are just for sanity checking - they are guaranteed by the allowed
+ * state transitions.
+ */
+ assert(depth == 0);
+ assert(trailing_comma == false);
+ assert(data <= json_end);
+ if (end) {
+ *end = data;
+ }
+ return cur_value;
+ }
+
+ /* Invalid end state - ran out of data */
+ rc = SPDK_JSON_PARSE_INCOMPLETE;
+
+done_rc:
+ assert(rc < 0);
+ if (end) {
+ *end = data;
+ }
+ return rc;
+
+done_invalid:
+ rc = SPDK_JSON_PARSE_INVALID;
+ goto done_rc;
+}
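+
+/*
+ * Usage sketch (illustrative, variable names are hypothetical): a common
+ * two-pass pattern with spdk_json_parse() is to count the values first and
+ * then parse into an appropriately sized array:
+ *
+ *   ssize_t n = spdk_json_parse(buf, size, NULL, 0, NULL, 0);
+ *   if (n > 0) {
+ *           struct spdk_json_val *vals = calloc(n, sizeof(*vals));
+ *           if (vals != NULL) {
+ *                   n = spdk_json_parse(buf, size, vals, n, NULL,
+ *                                       SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE);
+ *           }
+ *   }
+ */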
diff --git a/src/spdk/lib/json/json_util.c b/src/spdk/lib/json/json_util.c
new file mode 100644
index 000000000..18d751047
--- /dev/null
+++ b/src/spdk/lib/json/json_util.c
@@ -0,0 +1,653 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/json.h"
+
+#include "spdk_internal/utf.h"
+#include "spdk_internal/log.h"
+
+#define SPDK_JSON_DEBUG(...) SPDK_DEBUGLOG(SPDK_LOG_JSON, __VA_ARGS__)
+
+size_t
+spdk_json_val_len(const struct spdk_json_val *val)
+{
+ if (val == NULL) {
+ return 0;
+ }
+
+ if (val->type == SPDK_JSON_VAL_ARRAY_BEGIN || val->type == SPDK_JSON_VAL_OBJECT_BEGIN) {
+ return val->len + 2;
+ }
+
+ return 1;
+}
+
+bool
+spdk_json_strequal(const struct spdk_json_val *val, const char *str)
+{
+ size_t len;
+
+ if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NAME) {
+ return false;
+ }
+
+ len = strlen(str);
+ if (val->len != len) {
+ return false;
+ }
+
+ return memcmp(val->start, str, len) == 0;
+}
+
+char *
+spdk_json_strdup(const struct spdk_json_val *val)
+{
+ size_t len;
+ char *s;
+
+ if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NAME) {
+ return NULL;
+ }
+
+ len = val->len;
+
+ if (memchr(val->start, '\0', len)) {
+ /* String contains embedded NUL, so it is not a valid C string. */
+ return NULL;
+ }
+
+ s = malloc(len + 1);
+ if (s == NULL) {
+ return s;
+ }
+
+ memcpy(s, val->start, len);
+ s[len] = '\0';
+
+ return s;
+}
+
+struct spdk_json_num {
+ bool negative;
+ uint64_t significand;
+ int64_t exponent;
+};
+
+static int
+json_number_split(const struct spdk_json_val *val, struct spdk_json_num *num)
+{
+ const char *iter;
+ size_t remaining;
+ uint64_t *pval;
+ uint64_t frac_digits = 0;
+ uint64_t exponent_u64 = 0;
+ bool exponent_negative = false;
+ enum {
+ NUM_STATE_INT,
+ NUM_STATE_FRAC,
+ NUM_STATE_EXP,
+ } state;
+
+ memset(num, 0, sizeof(*num));
+
+ if (val->type != SPDK_JSON_VAL_NUMBER) {
+ return -EINVAL;
+ }
+
+ remaining = val->len;
+ if (remaining == 0) {
+ return -EINVAL;
+ }
+
+ iter = val->start;
+ if (*iter == '-') {
+ num->negative = true;
+ iter++;
+ remaining--;
+ }
+
+ state = NUM_STATE_INT;
+ pval = &num->significand;
+ while (remaining--) {
+ char c = *iter++;
+
+ if (c == '.') {
+ state = NUM_STATE_FRAC;
+ } else if (c == 'e' || c == 'E') {
+ state = NUM_STATE_EXP;
+ pval = &exponent_u64;
+ } else if (c == '-') {
+ assert(state == NUM_STATE_EXP);
+ exponent_negative = true;
+ } else if (c == '+') {
+ assert(state == NUM_STATE_EXP);
+ /* exponent_negative is already false by default */
+ } else {
+ uint64_t new_val;
+
+ assert(c >= '0' && c <= '9');
+ new_val = *pval * 10 + c - '0';
+ if (new_val < *pval) {
+ return -ERANGE;
+ }
+
+ if (state == NUM_STATE_FRAC) {
+ frac_digits++;
+ }
+
+ *pval = new_val;
+ }
+ }
+
+ if (exponent_negative) {
+ if (exponent_u64 > 9223372036854775808ULL) { /* abs(INT64_MIN) */
+ return -ERANGE;
+ }
+ num->exponent = (int64_t) - exponent_u64;
+ } else {
+ if (exponent_u64 > INT64_MAX) {
+ return -ERANGE;
+ }
+ num->exponent = exponent_u64;
+ }
+ num->exponent -= frac_digits;
+
+ /* Apply as much of the exponent as possible without overflow or truncation */
+ if (num->exponent < 0) {
+ while (num->exponent && num->significand >= 10 && num->significand % 10 == 0) {
+ num->significand /= 10;
+ num->exponent++;
+ }
+ } else { /* positive exponent */
+ while (num->exponent) {
+ uint64_t new_val = num->significand * 10;
+
+ if (new_val < num->significand) {
+ break;
+ }
+
+ num->significand = new_val;
+ num->exponent--;
+ }
+ }
+
+ return 0;
+}
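+
+/*
+ * Worked example (illustrative): for the token "1.5e3" the loop above
+ * accumulates significand = 15 with frac_digits = 1 and exponent = 3;
+ * subtracting frac_digits gives (significand 15, exponent 2), and the
+ * final normalization folds the positive exponent in, leaving
+ * (significand 1500, exponent 0).
+ */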
+
+int
+spdk_json_number_to_uint16(const struct spdk_json_val *val, uint16_t *num)
+{
+ struct spdk_json_num split_num;
+ int rc;
+
+ rc = json_number_split(val, &split_num);
+ if (rc) {
+ return rc;
+ }
+
+ if (split_num.exponent || split_num.negative) {
+ return -ERANGE;
+ }
+
+ if (split_num.significand > UINT16_MAX) {
+ return -ERANGE;
+ }
+ *num = (uint16_t)split_num.significand;
+ return 0;
+}
+
+int
+spdk_json_number_to_int32(const struct spdk_json_val *val, int32_t *num)
+{
+ struct spdk_json_num split_num;
+ int rc;
+
+ rc = json_number_split(val, &split_num);
+ if (rc) {
+ return rc;
+ }
+
+ if (split_num.exponent) {
+ return -ERANGE;
+ }
+
+ if (split_num.negative) {
+ if (split_num.significand > 2147483648) { /* abs(INT32_MIN) */
+ return -ERANGE;
+ }
+ *num = (int32_t) - (int64_t)split_num.significand;
+ return 0;
+ }
+
+ /* positive */
+ if (split_num.significand > INT32_MAX) {
+ return -ERANGE;
+ }
+ *num = (int32_t)split_num.significand;
+ return 0;
+}
+
+int
+spdk_json_number_to_uint32(const struct spdk_json_val *val, uint32_t *num)
+{
+ struct spdk_json_num split_num;
+ int rc;
+
+ rc = json_number_split(val, &split_num);
+ if (rc) {
+ return rc;
+ }
+
+ if (split_num.exponent || split_num.negative) {
+ return -ERANGE;
+ }
+
+ if (split_num.significand > UINT32_MAX) {
+ return -ERANGE;
+ }
+ *num = (uint32_t)split_num.significand;
+ return 0;
+}
+
+int
+spdk_json_number_to_uint64(const struct spdk_json_val *val, uint64_t *num)
+{
+ struct spdk_json_num split_num;
+ int rc;
+
+ rc = json_number_split(val, &split_num);
+ if (rc) {
+ return rc;
+ }
+
+ if (split_num.exponent || split_num.negative) {
+ return -ERANGE;
+ }
+
+ *num = split_num.significand;
+ return 0;
+}
+
+int
+spdk_json_decode_object(const struct spdk_json_val *values,
+ const struct spdk_json_object_decoder *decoders, size_t num_decoders, void *out)
+{
+ uint32_t i;
+ bool invalid = false;
+ size_t decidx;
+ bool *seen;
+
+ if (values == NULL || values->type != SPDK_JSON_VAL_OBJECT_BEGIN) {
+ return -1;
+ }
+
+ seen = calloc(num_decoders, sizeof(bool));
+ if (seen == NULL) {
+ return -1;
+ }
+
+ for (i = 0; i < values->len;) {
+ const struct spdk_json_val *name = &values[i + 1];
+ const struct spdk_json_val *v = &values[i + 2];
+ bool found = false;
+
+ for (decidx = 0; decidx < num_decoders; decidx++) {
+ const struct spdk_json_object_decoder *dec = &decoders[decidx];
+ if (spdk_json_strequal(name, dec->name)) {
+ void *field = (void *)((uintptr_t)out + dec->offset);
+
+ found = true;
+
+ if (seen[decidx]) {
+ /* duplicate field name */
+ invalid = true;
+ SPDK_JSON_DEBUG("Duplicate key '%s'\n", dec->name);
+ } else {
+ seen[decidx] = true;
+ if (dec->decode_func(v, field)) {
+ invalid = true;
+ SPDK_JSON_DEBUG("Decoder failed to decode key '%s'\n", dec->name);
+ /* keep going to fill out any other valid keys */
+ }
+ }
+ break;
+ }
+ }
+
+ if (!found) {
+ invalid = true;
+ SPDK_JSON_DEBUG("Decoder not found for key '%.*s'\n", name->len, (char *)name->start);
+ }
+
+ i += 1 + spdk_json_val_len(v);
+ }
+
+ for (decidx = 0; decidx < num_decoders; decidx++) {
+ if (!decoders[decidx].optional && !seen[decidx]) {
+ /* required field is missing */
+ invalid = true;
+ break;
+ }
+ }
+
+ free(seen);
+ return invalid ? -1 : 0;
+}
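+
+/*
+ * Usage sketch (illustrative; the struct, field and variable names are
+ * hypothetical): a typical decoder table for spdk_json_decode_object(),
+ * assuming SPDK_COUNTOF() from spdk/util.h:
+ *
+ *   struct rpc_example {
+ *           char *name;
+ *           uint32_t size;
+ *   };
+ *
+ *   static const struct spdk_json_object_decoder rpc_example_decoders[] = {
+ *           {"name", offsetof(struct rpc_example, name), spdk_json_decode_string},
+ *           {"size", offsetof(struct rpc_example, size), spdk_json_decode_uint32, true},
+ *   };
+ *
+ *   struct rpc_example req = {};
+ *   spdk_json_decode_object(params, rpc_example_decoders,
+ *                           SPDK_COUNTOF(rpc_example_decoders), &req);
+ */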
+
+int
+spdk_json_decode_array(const struct spdk_json_val *values, spdk_json_decode_fn decode_func,
+ void *out, size_t max_size, size_t *out_size, size_t stride)
+{
+ uint32_t i;
+ char *field;
+ char *out_end;
+
+ if (values == NULL || values->type != SPDK_JSON_VAL_ARRAY_BEGIN) {
+ return -1;
+ }
+
+ *out_size = 0;
+ field = out;
+ out_end = field + max_size * stride;
+ for (i = 0; i < values->len;) {
+ const struct spdk_json_val *v = &values[i + 1];
+
+ if (field == out_end) {
+ return -1;
+ }
+
+ if (decode_func(v, field)) {
+ return -1;
+ }
+
+ i += spdk_json_val_len(v);
+ field += stride;
+ (*out_size)++;
+ }
+
+ return 0;
+}
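+
+/*
+ * Usage sketch (illustrative, hypothetical variables): decoding a JSON
+ * array of up to 8 strings, one malloc'ed C string per element:
+ *
+ *   char *names[8] = {};
+ *   size_t count = 0;
+ *
+ *   rc = spdk_json_decode_array(val, spdk_json_decode_string, names,
+ *                               8, &count, sizeof(char *));
+ */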
+
+int
+spdk_json_decode_bool(const struct spdk_json_val *val, void *out)
+{
+ bool *f = out;
+
+ if (val->type != SPDK_JSON_VAL_TRUE && val->type != SPDK_JSON_VAL_FALSE) {
+ return -1;
+ }
+
+ *f = val->type == SPDK_JSON_VAL_TRUE;
+ return 0;
+}
+
+int
+spdk_json_decode_uint16(const struct spdk_json_val *val, void *out)
+{
+ uint16_t *i = out;
+
+ return spdk_json_number_to_uint16(val, i);
+}
+
+int
+spdk_json_decode_int32(const struct spdk_json_val *val, void *out)
+{
+ int32_t *i = out;
+
+ return spdk_json_number_to_int32(val, i);
+}
+
+int
+spdk_json_decode_uint32(const struct spdk_json_val *val, void *out)
+{
+ uint32_t *i = out;
+
+ return spdk_json_number_to_uint32(val, i);
+}
+
+int
+spdk_json_decode_uint64(const struct spdk_json_val *val, void *out)
+{
+ uint64_t *i = out;
+
+ return spdk_json_number_to_uint64(val, i);
+}
+
+int
+spdk_json_decode_string(const struct spdk_json_val *val, void *out)
+{
+ char **s = out;
+
+ free(*s);
+
+ *s = spdk_json_strdup(val);
+
+ if (*s) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+static struct spdk_json_val *
+json_first(struct spdk_json_val *object, enum spdk_json_val_type type)
+{
+ /* 'object' must be a JSON object or array; 'type' may be a combination of the two. */
+ assert((type & (SPDK_JSON_VAL_ARRAY_BEGIN | SPDK_JSON_VAL_OBJECT_BEGIN)) != 0);
+
+ assert(object != NULL);
+
+ if ((object->type & type) == 0) {
+ return NULL;
+ }
+
+ object++;
+ if (object->len == 0) {
+ return NULL;
+ }
+
+ return object;
+}
+
+static struct spdk_json_val *
+json_value(struct spdk_json_val *key)
+{
+ return key->type == SPDK_JSON_VAL_NAME ? key + 1 : NULL;
+}
+
+int
+spdk_json_find(struct spdk_json_val *object, const char *key_name, struct spdk_json_val **key,
+ struct spdk_json_val **val, enum spdk_json_val_type type)
+{
+ struct spdk_json_val *_key = NULL;
+ struct spdk_json_val *_val = NULL;
+ struct spdk_json_val *it;
+
+ assert(object != NULL);
+
+ for (it = json_first(object, SPDK_JSON_VAL_ARRAY_BEGIN | SPDK_JSON_VAL_OBJECT_BEGIN);
+ it != NULL;
+ it = spdk_json_next(it)) {
+ if (it->type != SPDK_JSON_VAL_NAME) {
+ continue;
+ }
+
+ if (spdk_json_strequal(it, key_name) != true) {
+ continue;
+ }
+
+ if (_key) {
+ SPDK_JSON_DEBUG("Duplicate key '%s'", key_name);
+ return -EINVAL;
+ }
+
+ _key = it;
+ _val = json_value(_key);
+
+ if (type != SPDK_JSON_VAL_INVALID && (_val->type & type) == 0) {
+ SPDK_JSON_DEBUG("key '%s' type is %#x but expected one of %#x\n", key_name, _val->type, type);
+ return -EDOM;
+ }
+ }
+
+ if (key) {
+ *key = _key;
+ }
+
+ if (val) {
+ *val = _val;
+ }
+
+ return _val ? 0 : -ENOENT;
+}
+
+int
+spdk_json_find_string(struct spdk_json_val *object, const char *key_name,
+ struct spdk_json_val **key, struct spdk_json_val **val)
+{
+ return spdk_json_find(object, key_name, key, val, SPDK_JSON_VAL_STRING);
+}
+
+int
+spdk_json_find_array(struct spdk_json_val *object, const char *key_name,
+ struct spdk_json_val **key, struct spdk_json_val **val)
+{
+ return spdk_json_find(object, key_name, key, val, SPDK_JSON_VAL_ARRAY_BEGIN);
+}
+
+struct spdk_json_val *
+spdk_json_object_first(struct spdk_json_val *object)
+{
+ struct spdk_json_val *first = json_first(object, SPDK_JSON_VAL_OBJECT_BEGIN);
+
+ /* Empty object? */
+ return first && first->type != SPDK_JSON_VAL_OBJECT_END ? first : NULL;
+}
+
+struct spdk_json_val *
+spdk_json_array_first(struct spdk_json_val *array_begin)
+{
+ struct spdk_json_val *first = json_first(array_begin, SPDK_JSON_VAL_ARRAY_BEGIN);
+
+ /* Empty array? */
+ return first && first->type != SPDK_JSON_VAL_ARRAY_END ? first : NULL;
+}
+
+static struct spdk_json_val *
+json_skip_object_or_array(struct spdk_json_val *val)
+{
+ unsigned lvl;
+ enum spdk_json_val_type end_type;
+ struct spdk_json_val *it;
+
+ if (val->type == SPDK_JSON_VAL_OBJECT_BEGIN) {
+ end_type = SPDK_JSON_VAL_OBJECT_END;
+ } else if (val->type == SPDK_JSON_VAL_ARRAY_BEGIN) {
+ end_type = SPDK_JSON_VAL_ARRAY_END;
+ } else {
+ SPDK_JSON_DEBUG("Expected JSON object (%#x) or array (%#x) but got %#x\n",
+ SPDK_JSON_VAL_OBJECT_BEGIN, SPDK_JSON_VAL_ARRAY_BEGIN, val->type);
+ return NULL;
+ }
+
+ lvl = 1;
+ for (it = val + 1; it->type != SPDK_JSON_VAL_INVALID && lvl != 0; it++) {
+ if (it->type == val->type) {
+ lvl++;
+ } else if (it->type == end_type) {
+ lvl--;
+ }
+ }
+
+ /* If lvl != 0, the JSON object is invalid. */
+ if (lvl != 0) {
+ SPDK_JSON_DEBUG("Can't find end of object (type: %#x): lvl (%u) != 0\n", val->type, lvl);
+ it = NULL;
+ }
+
+ return it;
+}
+
+struct spdk_json_val *
+spdk_json_next(struct spdk_json_val *it)
+{
+ struct spdk_json_val *val, *next;
+
+ switch (it->type) {
+ case SPDK_JSON_VAL_NAME:
+ val = json_value(it);
+ next = spdk_json_next(val);
+ break;
+
+ /* We are in the middle of an array - get to next entry */
+ case SPDK_JSON_VAL_NULL:
+ case SPDK_JSON_VAL_TRUE:
+ case SPDK_JSON_VAL_FALSE:
+ case SPDK_JSON_VAL_NUMBER:
+ case SPDK_JSON_VAL_STRING:
+ val = it + 1;
+ return val;
+
+ case SPDK_JSON_VAL_ARRAY_BEGIN:
+ case SPDK_JSON_VAL_OBJECT_BEGIN:
+ next = json_skip_object_or_array(it);
+ break;
+
+ /* Can't go to the next object if started from the end of array or object */
+ case SPDK_JSON_VAL_ARRAY_END:
+ case SPDK_JSON_VAL_OBJECT_END:
+ case SPDK_JSON_VAL_INVALID:
+ return NULL;
+ default:
+ assert(false);
+ return NULL;
+
+ }
+
+ /* EOF ? */
+ if (next == NULL) {
+ return NULL;
+ }
+
+ switch (next->type) {
+ case SPDK_JSON_VAL_ARRAY_END:
+ case SPDK_JSON_VAL_OBJECT_END:
+ case SPDK_JSON_VAL_INVALID:
+ return NULL;
+ default:
+ /* Next value */
+ return next;
+ }
+}
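+
+/*
+ * Usage sketch (illustrative; 'object' is a hypothetical pointer to a parsed
+ * OBJECT_BEGIN value): iterating over the immediate children of an object
+ * with the helpers above; 'it' points at each NAME and 'it + 1' is the
+ * corresponding value:
+ *
+ *   struct spdk_json_val *it;
+ *
+ *   for (it = spdk_json_object_first(object); it != NULL;
+ *        it = spdk_json_next(it)) {
+ *           ...
+ *   }
+ */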
+
+SPDK_LOG_REGISTER_COMPONENT("json_util", SPDK_LOG_JSON)
diff --git a/src/spdk/lib/json/json_write.c b/src/spdk/lib/json/json_write.c
new file mode 100644
index 000000000..7e9fbb5c3
--- /dev/null
+++ b/src/spdk/lib/json/json_write.c
@@ -0,0 +1,687 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/json.h"
+
+#include "spdk_internal/utf.h"
+
+struct spdk_json_write_ctx {
+ spdk_json_write_cb write_cb;
+ void *cb_ctx;
+ uint32_t flags;
+ uint32_t indent;
+ bool new_indent;
+ bool first_value;
+ bool failed;
+ size_t buf_filled;
+ uint8_t buf[4096];
+};
+
+static int emit_buf_full(struct spdk_json_write_ctx *w, const void *data, size_t size);
+
+static int
+fail(struct spdk_json_write_ctx *w)
+{
+ w->failed = true;
+ return -1;
+}
+
+static int
+flush_buf(struct spdk_json_write_ctx *w)
+{
+ int rc;
+
+ rc = w->write_cb(w->cb_ctx, w->buf, w->buf_filled);
+ if (rc != 0) {
+ return fail(w);
+ }
+
+ w->buf_filled = 0;
+
+ return 0;
+}
+
+struct spdk_json_write_ctx *
+spdk_json_write_begin(spdk_json_write_cb write_cb, void *cb_ctx, uint32_t flags)
+{
+ struct spdk_json_write_ctx *w;
+
+ w = calloc(1, sizeof(*w));
+ if (w == NULL) {
+ return w;
+ }
+
+ w->write_cb = write_cb;
+ w->cb_ctx = cb_ctx;
+ w->flags = flags;
+ w->indent = 0;
+ w->new_indent = false;
+ w->first_value = true;
+ w->failed = false;
+ w->buf_filled = 0;
+
+ return w;
+}
+
+int
+spdk_json_write_end(struct spdk_json_write_ctx *w)
+{
+ bool failed;
+ int rc;
+
+ if (w == NULL) {
+ return 0;
+ }
+
+ failed = w->failed;
+
+ rc = flush_buf(w);
+ if (rc != 0) {
+ failed = true;
+ }
+
+ free(w);
+
+ return failed ? -1 : 0;
+}
+
+static inline int
+emit(struct spdk_json_write_ctx *w, const void *data, size_t size)
+{
+ size_t buf_remain = sizeof(w->buf) - w->buf_filled;
+
+ if (spdk_unlikely(size > buf_remain)) {
+ /* Not enough space in buffer for the new data. */
+ return emit_buf_full(w, data, size);
+ }
+
+ /* Copy the new data into buf. */
+ memcpy(w->buf + w->buf_filled, data, size);
+ w->buf_filled += size;
+ return 0;
+}
+
+static int
+emit_buf_full(struct spdk_json_write_ctx *w, const void *data, size_t size)
+{
+ size_t buf_remain = sizeof(w->buf) - w->buf_filled;
+ int rc;
+
+ assert(size > buf_remain);
+
+ /* Copy as much of the new data as possible into the buffer and flush it. */
+ memcpy(w->buf + w->buf_filled, data, buf_remain);
+ w->buf_filled += buf_remain;
+
+ rc = flush_buf(w);
+ if (rc != 0) {
+ return fail(w);
+ }
+
+ /* Recurse to emit the rest of the data. */
+ return emit(w, data + buf_remain, size - buf_remain);
+}
+
+static int
+emit_fmt(struct spdk_json_write_ctx *w, const void *data, size_t size)
+{
+ if (w->flags & SPDK_JSON_WRITE_FLAG_FORMATTED) {
+ return emit(w, data, size);
+ }
+ return 0;
+}
+
+static int
+emit_indent(struct spdk_json_write_ctx *w)
+{
+ uint32_t i;
+
+ if (w->flags & SPDK_JSON_WRITE_FLAG_FORMATTED) {
+ for (i = 0; i < w->indent; i++) {
+ if (emit(w, " ", 2)) { return fail(w); }
+ }
+ }
+ return 0;
+}
+
+static int
+begin_value(struct spdk_json_write_ctx *w)
+{
+ /* TODO: check for value state */
+ if (w->new_indent) {
+ if (emit_fmt(w, "\n", 1)) { return fail(w); }
+ if (emit_indent(w)) { return fail(w); }
+ }
+ if (!w->first_value) {
+ if (emit(w, ",", 1)) { return fail(w); }
+ if (emit_fmt(w, "\n", 1)) { return fail(w); }
+ if (emit_indent(w)) { return fail(w); }
+ }
+ w->first_value = false;
+ w->new_indent = false;
+ return 0;
+}
+
+int
+spdk_json_write_val_raw(struct spdk_json_write_ctx *w, const void *data, size_t len)
+{
+ if (begin_value(w)) { return fail(w); }
+ return emit(w, data, len);
+}
+
+int
+spdk_json_write_null(struct spdk_json_write_ctx *w)
+{
+ if (begin_value(w)) { return fail(w); }
+ return emit(w, "null", 4);
+}
+
+int
+spdk_json_write_bool(struct spdk_json_write_ctx *w, bool val)
+{
+ if (begin_value(w)) { return fail(w); }
+ if (val) {
+ return emit(w, "true", 4);
+ } else {
+ return emit(w, "false", 5);
+ }
+}
+
+int
+spdk_json_write_int32(struct spdk_json_write_ctx *w, int32_t val)
+{
+ char buf[32];
+ int count;
+
+ if (begin_value(w)) { return fail(w); }
+ count = snprintf(buf, sizeof(buf), "%" PRId32, val);
+ if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); }
+ return emit(w, buf, count);
+}
+
+int
+spdk_json_write_uint32(struct spdk_json_write_ctx *w, uint32_t val)
+{
+ char buf[32];
+ int count;
+
+ if (begin_value(w)) { return fail(w); }
+ count = snprintf(buf, sizeof(buf), "%" PRIu32, val);
+ if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); }
+ return emit(w, buf, count);
+}
+
+int
+spdk_json_write_int64(struct spdk_json_write_ctx *w, int64_t val)
+{
+ char buf[32];
+ int count;
+
+ if (begin_value(w)) { return fail(w); }
+ count = snprintf(buf, sizeof(buf), "%" PRId64, val);
+ if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); }
+ return emit(w, buf, count);
+}
+
+int
+spdk_json_write_uint64(struct spdk_json_write_ctx *w, uint64_t val)
+{
+ char buf[32];
+ int count;
+
+ if (begin_value(w)) { return fail(w); }
+ count = snprintf(buf, sizeof(buf), "%" PRIu64, val);
+ if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); }
+ return emit(w, buf, count);
+}
+
+static void
+write_hex_4(void *dest, uint16_t val)
+{
+ uint8_t *p = dest;
+ char hex[] = "0123456789ABCDEF";
+
+ p[0] = hex[(val >> 12)];
+ p[1] = hex[(val >> 8) & 0xF];
+ p[2] = hex[(val >> 4) & 0xF];
+ p[3] = hex[val & 0xF];
+}
+
+static inline int
+write_codepoint(struct spdk_json_write_ctx *w, uint32_t codepoint)
+{
+ static const uint8_t escapes[] = {
+ ['\b'] = 'b',
+ ['\f'] = 'f',
+ ['\n'] = 'n',
+ ['\r'] = 'r',
+ ['\t'] = 't',
+ ['"'] = '"',
+ ['\\'] = '\\',
+ /*
+ * Forward slash (/) is intentionally not converted to an escape
+ * (it is valid unescaped).
+ */
+ };
+ uint16_t high, low;
+ char out[13];
+ size_t out_len;
+
+ if (codepoint < sizeof(escapes) && escapes[codepoint]) {
+ out[0] = '\\';
+ out[1] = escapes[codepoint];
+ out_len = 2;
+ } else if (codepoint >= 0x20 && codepoint < 0x7F) {
+ /*
+ * Encode plain ASCII directly (except 0x7F, since it is really
+ * a control character, despite the JSON spec not considering it one).
+ */
+ out[0] = (uint8_t)codepoint;
+ out_len = 1;
+ } else if (codepoint < 0x10000) {
+ out[0] = '\\';
+ out[1] = 'u';
+ write_hex_4(&out[2], (uint16_t)codepoint);
+ out_len = 6;
+ } else {
+ utf16_encode_surrogate_pair(codepoint, &high, &low);
+ out[0] = '\\';
+ out[1] = 'u';
+ write_hex_4(&out[2], high);
+ out[6] = '\\';
+ out[7] = 'u';
+ write_hex_4(&out[8], low);
+ out_len = 12;
+ }
+
+ return emit(w, out, out_len);
+}
+
+static int
+write_string_or_name(struct spdk_json_write_ctx *w, const char *val, size_t len)
+{
+ const uint8_t *p = val;
+ const uint8_t *end = val + len;
+
+ if (emit(w, "\"", 1)) { return fail(w); }
+
+ while (p != end) {
+ int codepoint_len;
+ uint32_t codepoint;
+
+ codepoint_len = utf8_valid(p, end);
+ switch (codepoint_len) {
+ case 1:
+ codepoint = utf8_decode_unsafe_1(p);
+ break;
+ case 2:
+ codepoint = utf8_decode_unsafe_2(p);
+ break;
+ case 3:
+ codepoint = utf8_decode_unsafe_3(p);
+ break;
+ case 4:
+ codepoint = utf8_decode_unsafe_4(p);
+ break;
+ default:
+ return fail(w);
+ }
+
+ if (write_codepoint(w, codepoint)) { return fail(w); }
+ p += codepoint_len;
+ }
+
+ return emit(w, "\"", 1);
+}
+
+static int
+write_string_or_name_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len)
+{
+ const uint16_t *p = val;
+ const uint16_t *end = val + len;
+
+ if (emit(w, "\"", 1)) { return fail(w); }
+
+ while (p != end) {
+ int codepoint_len;
+ uint32_t codepoint;
+
+ codepoint_len = utf16le_valid(p, end);
+ switch (codepoint_len) {
+ case 1:
+ codepoint = from_le16(&p[0]);
+ break;
+ case 2:
+ codepoint = utf16_decode_surrogate_pair(from_le16(&p[0]), from_le16(&p[1]));
+ break;
+ default:
+ return fail(w);
+ }
+
+ if (write_codepoint(w, codepoint)) { return fail(w); }
+ p += codepoint_len;
+ }
+
+ return emit(w, "\"", 1);
+}
+
+int
+spdk_json_write_string_raw(struct spdk_json_write_ctx *w, const char *val, size_t len)
+{
+ if (begin_value(w)) { return fail(w); }
+ return write_string_or_name(w, val, len);
+}
+
+int
+spdk_json_write_string(struct spdk_json_write_ctx *w, const char *val)
+{
+ return spdk_json_write_string_raw(w, val, strlen(val));
+}
+
+int
+spdk_json_write_string_utf16le_raw(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len)
+{
+ if (begin_value(w)) { return fail(w); }
+ return write_string_or_name_utf16le(w, val, len);
+}
+
+int
+spdk_json_write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val)
+{
+ const uint16_t *p;
+ size_t len;
+
+ for (len = 0, p = val; *p; p++) {
+ len++;
+ }
+
+ return spdk_json_write_string_utf16le_raw(w, val, len);
+}
+
+int
+spdk_json_write_string_fmt(struct spdk_json_write_ctx *w, const char *fmt, ...)
+{
+ va_list args;
+ int rc;
+
+ va_start(args, fmt);
+ rc = spdk_json_write_string_fmt_v(w, fmt, args);
+ va_end(args);
+
+ return rc;
+}
+
+int
+spdk_json_write_string_fmt_v(struct spdk_json_write_ctx *w, const char *fmt, va_list args)
+{
+ char *s;
+ int rc;
+
+ s = spdk_vsprintf_alloc(fmt, args);
+ if (s == NULL) {
+ return -1;
+ }
+
+ rc = spdk_json_write_string(w, s);
+ free(s);
+ return rc;
+}
+
+int
+spdk_json_write_array_begin(struct spdk_json_write_ctx *w)
+{
+ if (begin_value(w)) { return fail(w); }
+ w->first_value = true;
+ w->new_indent = true;
+ w->indent++;
+ if (emit(w, "[", 1)) { return fail(w); }
+ return 0;
+}
+
+int
+spdk_json_write_array_end(struct spdk_json_write_ctx *w)
+{
+ w->first_value = false;
+ if (w->indent == 0) { return fail(w); }
+ w->indent--;
+ if (!w->new_indent) {
+ if (emit_fmt(w, "\n", 1)) { return fail(w); }
+ if (emit_indent(w)) { return fail(w); }
+ }
+ w->new_indent = false;
+ return emit(w, "]", 1);
+}
+
+int
+spdk_json_write_object_begin(struct spdk_json_write_ctx *w)
+{
+ if (begin_value(w)) { return fail(w); }
+ w->first_value = true;
+ w->new_indent = true;
+ w->indent++;
+ if (emit(w, "{", 1)) { return fail(w); }
+ return 0;
+}
+
+int
+spdk_json_write_object_end(struct spdk_json_write_ctx *w)
+{
+ w->first_value = false;
+ w->indent--;
+ if (!w->new_indent) {
+ if (emit_fmt(w, "\n", 1)) { return fail(w); }
+ if (emit_indent(w)) { return fail(w); }
+ }
+ w->new_indent = false;
+ return emit(w, "}", 1);
+}
+
+int
+spdk_json_write_name_raw(struct spdk_json_write_ctx *w, const char *name, size_t len)
+{
+ /* TODO: check that container is an object */
+ if (begin_value(w)) { return fail(w); }
+ if (write_string_or_name(w, name, len)) { return fail(w); }
+ w->first_value = true;
+ if (emit(w, ":", 1)) { return fail(w); }
+ return emit_fmt(w, " ", 1);
+}
+
+int
+spdk_json_write_name(struct spdk_json_write_ctx *w, const char *name)
+{
+ return spdk_json_write_name_raw(w, name, strlen(name));
+}
+
+int
+spdk_json_write_val(struct spdk_json_write_ctx *w, const struct spdk_json_val *val)
+{
+ size_t num_values, i;
+
+ switch (val->type) {
+ case SPDK_JSON_VAL_NUMBER:
+ return spdk_json_write_val_raw(w, val->start, val->len);
+
+ case SPDK_JSON_VAL_STRING:
+ return spdk_json_write_string_raw(w, val->start, val->len);
+
+ case SPDK_JSON_VAL_NAME:
+ return spdk_json_write_name_raw(w, val->start, val->len);
+
+ case SPDK_JSON_VAL_TRUE:
+ return spdk_json_write_bool(w, true);
+
+ case SPDK_JSON_VAL_FALSE:
+ return spdk_json_write_bool(w, false);
+
+ case SPDK_JSON_VAL_NULL:
+ return spdk_json_write_null(w);
+
+ case SPDK_JSON_VAL_ARRAY_BEGIN:
+ case SPDK_JSON_VAL_OBJECT_BEGIN:
+ num_values = val[0].len;
+
+ if (val[0].type == SPDK_JSON_VAL_OBJECT_BEGIN) {
+ if (spdk_json_write_object_begin(w)) {
+ return fail(w);
+ }
+ } else {
+ if (spdk_json_write_array_begin(w)) {
+ return fail(w);
+ }
+ }
+
+ /* Loop up to and including the _END value */
+ for (i = 0; i < num_values + 1;) {
+ if (spdk_json_write_val(w, &val[i + 1])) {
+ return fail(w);
+ }
+ if (val[i + 1].type == SPDK_JSON_VAL_ARRAY_BEGIN ||
+ val[i + 1].type == SPDK_JSON_VAL_OBJECT_BEGIN) {
+ i += val[i + 1].len + 2;
+ } else {
+ i++;
+ }
+ }
+ return 0;
+
+ case SPDK_JSON_VAL_ARRAY_END:
+ return spdk_json_write_array_end(w);
+
+ case SPDK_JSON_VAL_OBJECT_END:
+ return spdk_json_write_object_end(w);
+
+ case SPDK_JSON_VAL_INVALID:
+ /* Handle INVALID to make the compiler happy (and catch other unhandled types) */
+ return fail(w);
+ }
+
+ return fail(w);
+}
+
+int spdk_json_write_named_null(struct spdk_json_write_ctx *w, const char *name)
+{
+ int rc = spdk_json_write_name(w, name);
+ return rc ? rc : spdk_json_write_null(w);
+}
+
+int spdk_json_write_named_bool(struct spdk_json_write_ctx *w, const char *name, bool val)
+{
+ int rc = spdk_json_write_name(w, name);
+
+ return rc ? rc : spdk_json_write_bool(w, val);
+}
+
+int spdk_json_write_named_int32(struct spdk_json_write_ctx *w, const char *name, int32_t val)
+{
+ int rc = spdk_json_write_name(w, name);
+
+ return rc ? rc : spdk_json_write_int32(w, val);
+}
+
+int spdk_json_write_named_uint32(struct spdk_json_write_ctx *w, const char *name, uint32_t val)
+{
+ int rc = spdk_json_write_name(w, name);
+
+ return rc ? rc : spdk_json_write_uint32(w, val);
+}
+
+int spdk_json_write_named_uint64(struct spdk_json_write_ctx *w, const char *name, uint64_t val)
+{
+ int rc = spdk_json_write_name(w, name);
+
+ return rc ? rc : spdk_json_write_uint64(w, val);
+}
+
+int spdk_json_write_named_int64(struct spdk_json_write_ctx *w, const char *name, int64_t val)
+{
+ int rc = spdk_json_write_name(w, name);
+
+ return rc ? rc : spdk_json_write_int64(w, val);
+}
+
+int spdk_json_write_named_string(struct spdk_json_write_ctx *w, const char *name, const char *val)
+{
+ int rc = spdk_json_write_name(w, name);
+
+ return rc ? rc : spdk_json_write_string(w, val);
+}
+
+int spdk_json_write_named_string_fmt(struct spdk_json_write_ctx *w, const char *name,
+ const char *fmt, ...)
+{
+ va_list args;
+ int rc;
+
+ va_start(args, fmt);
+ rc = spdk_json_write_named_string_fmt_v(w, name, fmt, args);
+ va_end(args);
+
+ return rc;
+}
+
+int spdk_json_write_named_string_fmt_v(struct spdk_json_write_ctx *w, const char *name,
+ const char *fmt, va_list args)
+{
+ char *s;
+ int rc;
+
+ rc = spdk_json_write_name(w, name);
+ if (rc) {
+ return rc;
+ }
+
+ s = spdk_vsprintf_alloc(fmt, args);
+
+ if (s == NULL) {
+ return -1;
+ }
+
+ rc = spdk_json_write_string(w, s);
+ free(s);
+ return rc;
+}
+
+int spdk_json_write_named_array_begin(struct spdk_json_write_ctx *w, const char *name)
+{
+ int rc = spdk_json_write_name(w, name);
+
+ return rc ? rc : spdk_json_write_array_begin(w);
+}
+
+int spdk_json_write_named_object_begin(struct spdk_json_write_ctx *w, const char *name)
+{
+ int rc = spdk_json_write_name(w, name);
+
+ return rc ? rc : spdk_json_write_object_begin(w);
+}
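A minimal usage sketch of the writer API above, assuming only the functions and the SPDK_JSON_WRITE_FLAG_FORMATTED flag defined in this file; stdout_write_cb and write_example are illustrative names, not part of the library. The callback follows the write_cb(cb_ctx, data, size) shape invoked by flush_buf().

#include <stdio.h>
#include "spdk/json.h"

/* Illustrative callback: forward each flushed chunk to stdout. */
static int
stdout_write_cb(void *cb_ctx, const void *data, size_t size)
{
	return fwrite(data, 1, size, stdout) == size ? 0 : -1;
}

/* Illustrative helper: emit {"name": "bdev0", "block_size": 4096} with indentation. */
static void
write_example(void)
{
	struct spdk_json_write_ctx *w;

	w = spdk_json_write_begin(stdout_write_cb, NULL, SPDK_JSON_WRITE_FLAG_FORMATTED);
	if (w == NULL) {
		return;
	}

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "name", "bdev0");
	spdk_json_write_named_uint32(w, "block_size", 4096);
	spdk_json_write_object_end(w);

	/* Flushes the internal 4 KiB buffer and frees the context. */
	spdk_json_write_end(w);
}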
diff --git a/src/spdk/lib/json/spdk_json.map b/src/spdk/lib/json/spdk_json.map
new file mode 100644
index 000000000..0699feaad
--- /dev/null
+++ b/src/spdk/lib/json/spdk_json.map
@@ -0,0 +1,67 @@
+{
+ global:
+
+ # public functions
+ spdk_json_parse;
+ spdk_json_decode_object;
+ spdk_json_decode_array;
+ spdk_json_decode_bool;
+ spdk_json_decode_uint16;
+ spdk_json_decode_int32;
+ spdk_json_decode_uint32;
+ spdk_json_decode_uint64;
+ spdk_json_decode_string;
+
+ spdk_json_val_len;
+ spdk_json_strequal;
+ spdk_json_strdup;
+
+ spdk_json_number_to_uint16;
+ spdk_json_number_to_int32;
+ spdk_json_number_to_uint32;
+ spdk_json_number_to_uint64;
+
+ spdk_json_write_begin;
+ spdk_json_write_end;
+ spdk_json_write_null;
+ spdk_json_write_bool;
+ spdk_json_write_int32;
+ spdk_json_write_uint32;
+ spdk_json_write_int64;
+ spdk_json_write_uint64;
+ spdk_json_write_string;
+ spdk_json_write_string_raw;
+ spdk_json_write_string_utf16le;
+ spdk_json_write_string_utf16le_raw;
+ spdk_json_write_string_fmt;
+ spdk_json_write_string_fmt_v;
+ spdk_json_write_array_begin;
+ spdk_json_write_array_end;
+ spdk_json_write_object_begin;
+ spdk_json_write_object_end;
+ spdk_json_write_name;
+ spdk_json_write_name_raw;
+ spdk_json_write_val;
+ spdk_json_write_val_raw;
+
+ spdk_json_write_named_null;
+ spdk_json_write_named_bool;
+ spdk_json_write_named_int32;
+ spdk_json_write_named_uint32;
+ spdk_json_write_named_uint64;
+ spdk_json_write_named_int64;
+ spdk_json_write_named_string;
+ spdk_json_write_named_string_fmt;
+ spdk_json_write_named_string_fmt_v;
+ spdk_json_write_named_array_begin;
+ spdk_json_write_named_object_begin;
+
+ spdk_json_find;
+ spdk_json_find_string;
+ spdk_json_find_array;
+ spdk_json_object_first;
+ spdk_json_array_first;
+ spdk_json_next;
+
+ local: *;
+};
diff --git a/src/spdk/lib/jsonrpc/Makefile b/src/spdk/lib/jsonrpc/Makefile
new file mode 100644
index 000000000..7eb8dd683
--- /dev/null
+++ b/src/spdk/lib/jsonrpc/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+LIBNAME = jsonrpc
+C_SRCS = jsonrpc_server.c jsonrpc_server_tcp.c
+C_SRCS += jsonrpc_client.c jsonrpc_client_tcp.c
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_jsonrpc.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/jsonrpc/jsonrpc_client.c b/src/spdk/lib/jsonrpc/jsonrpc_client.c
new file mode 100644
index 000000000..e3940a4d4
--- /dev/null
+++ b/src/spdk/lib/jsonrpc/jsonrpc_client.c
@@ -0,0 +1,227 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/util.h"
+#include "jsonrpc_internal.h"
+
+static int
+capture_version(const struct spdk_json_val *val, void *out)
+{
+ const struct spdk_json_val **vptr = out;
+
+ if (spdk_json_strequal(val, "2.0") != true) {
+ return SPDK_JSON_PARSE_INVALID;
+ }
+
+ *vptr = val;
+ return 0;
+}
+
+static int
+capture_id(const struct spdk_json_val *val, void *out)
+{
+ const struct spdk_json_val **vptr = out;
+
+ if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NUMBER) {
+ return -EINVAL;
+ }
+
+ *vptr = val;
+ return 0;
+}
+
+static int
+capture_any(const struct spdk_json_val *val, void *out)
+{
+ const struct spdk_json_val **vptr = out;
+
+ *vptr = val;
+ return 0;
+}
+
+static const struct spdk_json_object_decoder jsonrpc_response_decoders[] = {
+ {"jsonrpc", offsetof(struct spdk_jsonrpc_client_response, version), capture_version},
+ {"id", offsetof(struct spdk_jsonrpc_client_response, id), capture_id, true},
+ {"result", offsetof(struct spdk_jsonrpc_client_response, result), capture_any, true},
+ {"error", offsetof(struct spdk_jsonrpc_client_response, error), capture_any, true},
+};
+
+int
+jsonrpc_parse_response(struct spdk_jsonrpc_client *client)
+{
+ struct spdk_jsonrpc_client_response_internal *r;
+ ssize_t rc;
+ size_t buf_len;
+ size_t values_cnt;
+ void *end = NULL;
+
+ /* Check to see if we have received a full JSON value. */
+ rc = spdk_json_parse(client->recv_buf, client->recv_offset, NULL, 0, &end, 0);
+ if (rc == SPDK_JSON_PARSE_INCOMPLETE) {
+ return 0;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_RPC_CLIENT, "JSON string is:\n%s\n", client->recv_buf);
+ if (rc < 0 || rc > SPDK_JSONRPC_CLIENT_MAX_VALUES) {
+ SPDK_ERRLOG("JSON parse error (rc: %zd)\n", rc);
+ /*
+ * Can't recover from parse error (no guaranteed resync point in streaming JSON).
+ * Return an error to indicate that the connection should be closed.
+ */
+ return -EINVAL;
+ }
+
+ values_cnt = rc;
+
+ r = calloc(1, sizeof(*r) + sizeof(struct spdk_json_val) * (values_cnt + 1));
+ if (!r) {
+ return -errno;
+ }
+
+ if (client->resp) {
+ free(r);
+ return -ENOSPC;
+ }
+
+ client->resp = r;
+
+ r->buf = client->recv_buf;
+ buf_len = client->recv_offset;
+ r->values_cnt = values_cnt;
+
+ client->recv_buf_size = 0;
+ client->recv_offset = 0;
+ client->recv_buf = NULL;
+
+ /* Decode a second time now that there is a full JSON value available. */
+ rc = spdk_json_parse(r->buf, buf_len, r->values, values_cnt, &end,
+ SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE);
+ if (rc != (ssize_t)values_cnt) {
+ SPDK_ERRLOG("JSON parse error on second pass (rc: %zd, expected: %zu)\n", rc, values_cnt);
+ goto err;
+ }
+
+ assert(end != NULL);
+
+ if (r->values[0].type != SPDK_JSON_VAL_OBJECT_BEGIN) {
+ SPDK_ERRLOG("top-level JSON value was not object\n");
+ goto err;
+ }
+
+ if (spdk_json_decode_object(r->values, jsonrpc_response_decoders,
+ SPDK_COUNTOF(jsonrpc_response_decoders), &r->jsonrpc)) {
+ goto err;
+ }
+
+ r->ready = 1;
+ return 1;
+
+err:
+ client->resp = NULL;
+ spdk_jsonrpc_client_free_response(&r->jsonrpc);
+ return -EINVAL;
+}
+
+static int
+jsonrpc_client_write_cb(void *cb_ctx, const void *data, size_t size)
+{
+ struct spdk_jsonrpc_client_request *request = cb_ctx;
+ size_t new_size = request->send_buf_size;
+
+ while (new_size - request->send_len < size) {
+ if (new_size >= SPDK_JSONRPC_SEND_BUF_SIZE_MAX) {
+ SPDK_ERRLOG("Send buf exceeded maximum size (%zu)\n",
+ (size_t)SPDK_JSONRPC_SEND_BUF_SIZE_MAX);
+ return -ENOSPC;
+ }
+
+ new_size *= 2;
+ }
+
+ if (new_size != request->send_buf_size) {
+ uint8_t *new_buf;
+
+ new_buf = realloc(request->send_buf, new_size);
+ if (new_buf == NULL) {
+ SPDK_ERRLOG("Resizing send_buf failed (current size %zu, new size %zu)\n",
+ request->send_buf_size, new_size);
+ return -ENOMEM;
+ }
+
+ request->send_buf = new_buf;
+ request->send_buf_size = new_size;
+ }
+
+ memcpy(request->send_buf + request->send_len, data, size);
+ request->send_len += size;
+
+ return 0;
+}
+
+struct spdk_json_write_ctx *
+spdk_jsonrpc_begin_request(struct spdk_jsonrpc_client_request *request, int32_t id,
+ const char *method)
+{
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_json_write_begin(jsonrpc_client_write_cb, request, 0);
+ if (w == NULL) {
+ return NULL;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "jsonrpc", "2.0");
+
+ if (id >= 0) {
+ spdk_json_write_named_int32(w, "id", id);
+ }
+
+ if (method) {
+ spdk_json_write_named_string(w, "method", method);
+ }
+
+ return w;
+}
+
+void
+spdk_jsonrpc_end_request(struct spdk_jsonrpc_client_request *request, struct spdk_json_write_ctx *w)
+{
+ assert(w != NULL);
+
+ spdk_json_write_object_end(w);
+ spdk_json_write_end(w);
+ jsonrpc_client_write_cb(request, "\n", 1);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("rpc_client", SPDK_LOG_RPC_CLIENT)
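A hedged sketch of how a caller is expected to build and queue a request with spdk_jsonrpc_begin_request()/spdk_jsonrpc_end_request() above; send_example_request and the method name "get_version" are illustrative, and spdk_jsonrpc_client_create_request(), spdk_jsonrpc_client_free_request() and spdk_jsonrpc_client_send_request() are the public helpers added later in this diff (jsonrpc_client_tcp.c).

#include <errno.h>
#include "spdk/jsonrpc.h"

/* Illustrative helper: queue {"jsonrpc": "2.0", "id": 1, "method": "get_version"}. */
static int
send_example_request(struct spdk_jsonrpc_client *client)
{
	struct spdk_jsonrpc_client_request *request;
	struct spdk_json_write_ctx *w;

	request = spdk_jsonrpc_client_create_request();
	if (request == NULL) {
		return -ENOMEM;
	}

	w = spdk_jsonrpc_begin_request(request, 1, "get_version");
	if (w == NULL) {
		spdk_jsonrpc_client_free_request(request);
		return -ENOMEM;
	}

	/* A "params" member could be written here before closing the request. */
	spdk_jsonrpc_end_request(request, w);

	/* On success the client owns the request until it has been fully sent. */
	return spdk_jsonrpc_client_send_request(client, request);
}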
diff --git a/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c b/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c
new file mode 100644
index 000000000..512f6261c
--- /dev/null
+++ b/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c
@@ -0,0 +1,431 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "spdk/string.h"
+#include "jsonrpc_internal.h"
+#include "spdk/util.h"
+
+#define RPC_DEFAULT_PORT "5260"
+
+static int
+jsonrpc_client_send_request(struct spdk_jsonrpc_client *client)
+{
+ ssize_t rc;
+ struct spdk_jsonrpc_client_request *request = client->request;
+
+ if (!request) {
+ return 0;
+ }
+
+ if (request->send_len > 0) {
+ rc = send(client->sockfd, request->send_buf + request->send_offset,
+ request->send_len, 0);
+ if (rc < 0) {
+ /* For EINTR we pretend that nothing was sent. */
+ if (errno == EINTR) {
+ rc = 0;
+ } else {
+ rc = -errno;
+ SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno));
+ }
+
+ return rc;
+ }
+
+ request->send_offset += rc;
+ request->send_len -= rc;
+ }
+
+ if (request->send_len == 0) {
+ client->request = NULL;
+ spdk_jsonrpc_client_free_request(request);
+ }
+
+ return 0;
+}
+
+static int
+recv_buf_expand(struct spdk_jsonrpc_client *client)
+{
+ uint8_t *new_buf;
+
+ if (client->recv_buf_size * 2 > SPDK_JSONRPC_SEND_BUF_SIZE_MAX) {
+ return -ENOSPC;
+ }
+
+ new_buf = realloc(client->recv_buf, client->recv_buf_size * 2);
+ if (new_buf == NULL) {
+ SPDK_ERRLOG("Resizing recv_buf failed (current size %zu, new size %zu)\n",
+ client->recv_buf_size, client->recv_buf_size * 2);
+ return -ENOMEM;
+ }
+
+ client->recv_buf = new_buf;
+ client->recv_buf_size *= 2;
+
+ return 0;
+}
+
+static int
+jsonrpc_client_resp_ready_count(struct spdk_jsonrpc_client *client)
+{
+ return client->resp != NULL && client->resp->ready ? 1 : 0;
+}
+
+static int
+jsonrpc_client_recv(struct spdk_jsonrpc_client *client)
+{
+ ssize_t rc;
+
+ if (client->recv_buf == NULL) {
+ client->recv_buf = malloc(SPDK_JSONRPC_SEND_BUF_SIZE_INIT);
+ if (!client->recv_buf) {
+ rc = errno;
+ SPDK_ERRLOG("malloc() failed (%d): %s\n", (int)rc, spdk_strerror(rc));
+ return -rc;
+ }
+ client->recv_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT;
+ client->recv_offset = 0;
+ } else if (client->recv_offset == client->recv_buf_size - 1) {
+ rc = recv_buf_expand(client);
+ if (rc) {
+ return rc;
+ }
+ }
+
+ rc = recv(client->sockfd, client->recv_buf + client->recv_offset,
+ client->recv_buf_size - client->recv_offset - 1, 0);
+ if (rc < 0) {
+ /* For EINTR we pretend that nothing was received. */
+ if (errno == EINTR) {
+ return 0;
+ } else {
+ rc = -errno;
+ SPDK_ERRLOG("recv() failed (%d): %s\n", errno, spdk_strerror(errno));
+ return rc;
+ }
+ } else if (rc == 0) {
+ return -EIO;
+ }
+
+ client->recv_offset += rc;
+ client->recv_buf[client->recv_offset] = '\0';
+
+ /* Check to see if we have received a full JSON value. */
+ return jsonrpc_parse_response(client);
+}
+
+static int
+jsonrpc_client_poll(struct spdk_jsonrpc_client *client, int timeout)
+{
+ int rc;
+ struct pollfd pfd = { .fd = client->sockfd, .events = POLLIN | POLLOUT };
+
+ rc = poll(&pfd, 1, timeout);
+ if (rc == -1) {
+ if (errno == EINTR) {
+ /* For EINTR we pretend that nothing was received nor sent. */
+ rc = 0;
+ } else {
+ rc = -errno;
+ SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno));
+ }
+ } else if (rc > 0) {
+ rc = 0;
+
+ if (pfd.revents & POLLOUT) {
+ rc = jsonrpc_client_send_request(client);
+ }
+
+ if (rc == 0 && (pfd.revents & POLLIN)) {
+ rc = jsonrpc_client_recv(client);
+ /* Incomplete message in buffer isn't an error. */
+ if (rc == -EAGAIN) {
+ rc = 0;
+ }
+ }
+ }
+
+ return rc ? rc : jsonrpc_client_resp_ready_count(client);
+}
+
+static int
+jsonrpc_client_poll_connecting(struct spdk_jsonrpc_client *client, int timeout)
+{
+ socklen_t rc_len;
+ int rc;
+
+ struct pollfd pfd = {
+ .fd = client->sockfd,
+ .events = POLLOUT
+ };
+
+ rc = poll(&pfd, 1, timeout);
+ if (rc == 0) {
+ return -ENOTCONN;
+ } else if (rc == -1) {
+ if (errno != EINTR) {
+ SPDK_ERRLOG("poll() failed (%d): %s\n", errno, spdk_strerror(errno));
+ goto err;
+ }
+
+ /* We are still not connected. Caller will have to call us again. */
+ return -ENOTCONN;
+ } else if (pfd.revents & ~POLLOUT) {
+ /* We only poll for POLLOUT */
+ goto err;
+ } else if ((pfd.revents & POLLOUT) == 0) {
+ /* Is it even possible to get here? */
+ return -ENOTCONN;
+ }
+
+ rc_len = sizeof(int);
+ /* The connection might have failed, so check SO_ERROR. */
+ if (getsockopt(client->sockfd, SOL_SOCKET, SO_ERROR, &rc, &rc_len) == -1) {
+ goto err;
+ }
+
+ if (rc == 0) {
+ client->connected = true;
+ return 0;
+ }
+
+err:
+ return -EIO;
+}
+
+static int
+jsonrpc_client_connect(struct spdk_jsonrpc_client *client, int domain, int protocol,
+ struct sockaddr *server_addr, socklen_t addrlen)
+{
+ int rc, flags;
+
+ client->sockfd = socket(domain, SOCK_STREAM, protocol);
+ if (client->sockfd < 0) {
+ rc = errno;
+ SPDK_ERRLOG("socket() failed\n");
+ return -rc;
+ }
+
+ flags = fcntl(client->sockfd, F_GETFL);
+ if (flags < 0 || fcntl(client->sockfd, F_SETFL, flags | O_NONBLOCK) < 0) {
+ rc = errno;
+ SPDK_ERRLOG("fcntl(): can't set nonblocking mode for socket (%d): %s\n",
+ errno, spdk_strerror(errno));
+ goto err;
+ }
+
+ rc = connect(client->sockfd, server_addr, addrlen);
+ if (rc != 0) {
+ rc = errno;
+ if (rc != EINPROGRESS) {
+ SPDK_ERRLOG("could not connect to JSON-RPC server: %s\n", spdk_strerror(errno));
+ goto err;
+ }
+ } else {
+ client->connected = true;
+ }
+
+ return -rc;
+err:
+ close(client->sockfd);
+ client->sockfd = -1;
+ return -rc;
+}
+
+struct spdk_jsonrpc_client *
+spdk_jsonrpc_client_connect(const char *addr, int addr_family)
+{
+ struct spdk_jsonrpc_client *client = calloc(1, sizeof(struct spdk_jsonrpc_client));
+ /* Unix Domain Socket */
+ struct sockaddr_un addr_un = {};
+ char *add_in = NULL;
+ int rc;
+
+ if (client == NULL) {
+ SPDK_ERRLOG("%s\n", spdk_strerror(errno));
+ return NULL;
+ }
+
+ if (addr_family == AF_UNIX) {
+ addr_un.sun_family = AF_UNIX;
+ rc = snprintf(addr_un.sun_path, sizeof(addr_un.sun_path), "%s", addr);
+ if (rc < 0 || (size_t)rc >= sizeof(addr_un.sun_path)) {
+ rc = -EINVAL;
+ SPDK_ERRLOG("RPC Listen address Unix socket path too long\n");
+ goto err;
+ }
+
+ rc = jsonrpc_client_connect(client, AF_UNIX, 0, (struct sockaddr *)&addr_un, sizeof(addr_un));
+ } else {
+ /* TCP/IP socket */
+ struct addrinfo hints;
+ struct addrinfo *res;
+ char *host, *port;
+
+ add_in = strdup(addr);
+ if (!add_in) {
+ rc = -errno;
+ SPDK_ERRLOG("%s\n", spdk_strerror(errno));
+ goto err;
+ }
+
+ rc = spdk_parse_ip_addr(add_in, &host, &port);
+ if (rc) {
+ SPDK_ERRLOG("Invalid listen address '%s'\n", addr);
+ goto err;
+ }
+
+ if (port == NULL) {
+ port = RPC_DEFAULT_PORT;
+ }
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_protocol = IPPROTO_TCP;
+
+ rc = getaddrinfo(host, port, &hints, &res);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to look up RPC connnect address '%s' (%d): %s\n", addr, rc, gai_strerror(rc));
+ rc = -EINVAL;
+ goto err;
+ }
+
+ rc = jsonrpc_client_connect(client, res->ai_family, res->ai_protocol, res->ai_addr,
+ res->ai_addrlen);
+ freeaddrinfo(res);
+ }
+
+err:
+ if (rc != 0 && rc != -EINPROGRESS) {
+ free(client);
+ client = NULL;
+ errno = -rc;
+ }
+
+ free(add_in);
+ return client;
+}
+
+void
+spdk_jsonrpc_client_close(struct spdk_jsonrpc_client *client)
+{
+ if (client->sockfd >= 0) {
+ close(client->sockfd);
+ }
+
+ free(client->recv_buf);
+ if (client->resp) {
+ spdk_jsonrpc_client_free_response(&client->resp->jsonrpc);
+ }
+
+ free(client);
+}
+
+struct spdk_jsonrpc_client_request *
+spdk_jsonrpc_client_create_request(void)
+{
+ struct spdk_jsonrpc_client_request *request;
+
+ request = calloc(1, sizeof(*request));
+ if (request == NULL) {
+ return NULL;
+ }
+
+ /* Allocate the initial send buffer. */
+ request->send_buf = malloc(SPDK_JSONRPC_SEND_BUF_SIZE_INIT);
+ if (!request->send_buf) {
+ SPDK_ERRLOG("Failed to allocate the request send buffer\n");
+ free(request);
+ return NULL;
+ }
+ request->send_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT;
+
+ return request;
+}
+
+void
+spdk_jsonrpc_client_free_request(struct spdk_jsonrpc_client_request *req)
+{
+ free(req->send_buf);
+ free(req);
+}
+
+int
+spdk_jsonrpc_client_poll(struct spdk_jsonrpc_client *client, int timeout)
+{
+ if (client->connected) {
+ return jsonrpc_client_poll(client, timeout);
+ } else {
+ return jsonrpc_client_poll_connecting(client, timeout);
+ }
+}
+
+int spdk_jsonrpc_client_send_request(struct spdk_jsonrpc_client *client,
+ struct spdk_jsonrpc_client_request *req)
+{
+ if (client->request != NULL) {
+ return -ENOSPC;
+ }
+
+ client->request = req;
+ return 0;
+}
+
+struct spdk_jsonrpc_client_response *
+spdk_jsonrpc_client_get_response(struct spdk_jsonrpc_client *client)
+{
+ struct spdk_jsonrpc_client_response_internal *r;
+
+ r = client->resp;
+ if (r == NULL || r->ready == false) {
+ return NULL;
+ }
+
+ client->resp = NULL;
+ return &r->jsonrpc;
+}
+
+void
+spdk_jsonrpc_client_free_response(struct spdk_jsonrpc_client_response *resp)
+{
+ struct spdk_jsonrpc_client_response_internal *r;
+
+ if (!resp) {
+ return;
+ }
+
+ r = SPDK_CONTAINEROF(resp, struct spdk_jsonrpc_client_response_internal, jsonrpc);
+ free(r->buf);
+ free(r);
+}
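A hedged sketch of the end-to-end client flow built from the functions above (connect, poll, consume the response); client_poll_example is an illustrative name, the 100 ms timeout is arbitrary, and error handling is abbreviated.

#include <sys/socket.h>
#include "spdk/jsonrpc.h"

/* Illustrative flow: connect over a Unix socket, wait for one response, then clean up. */
static void
client_poll_example(const char *rpc_sock_path)
{
	struct spdk_jsonrpc_client *client;
	struct spdk_jsonrpc_client_response *resp;
	int rc;

	client = spdk_jsonrpc_client_connect(rpc_sock_path, AF_UNIX);
	if (client == NULL) {
		return;
	}

	/* ... queue a request here, e.g. with spdk_jsonrpc_client_send_request() ... */

	do {
		/* Returns > 0 once a parsed response is ready, < 0 on error. */
		rc = spdk_jsonrpc_client_poll(client, 100 /* ms */);
	} while (rc == 0);

	if (rc > 0) {
		resp = spdk_jsonrpc_client_get_response(client);
		if (resp != NULL) {
			/* resp->result or resp->error hold the decoded JSON values. */
			spdk_jsonrpc_client_free_response(resp);
		}
	}

	spdk_jsonrpc_client_close(client);
}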
diff --git a/src/spdk/lib/jsonrpc/jsonrpc_internal.h b/src/spdk/lib/jsonrpc/jsonrpc_internal.h
new file mode 100644
index 000000000..f51bedf62
--- /dev/null
+++ b/src/spdk/lib/jsonrpc/jsonrpc_internal.h
@@ -0,0 +1,166 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_JSONRPC_INTERNAL_H_
+#define SPDK_JSONRPC_INTERNAL_H_
+
+#include "spdk/stdinc.h"
+
+#include "spdk/jsonrpc.h"
+
+#include "spdk_internal/log.h"
+
+#define SPDK_JSONRPC_RECV_BUF_SIZE (32 * 1024)
+#define SPDK_JSONRPC_SEND_BUF_SIZE_INIT (32 * 1024)
+#define SPDK_JSONRPC_SEND_BUF_SIZE_MAX (32 * 1024 * 1024)
+#define SPDK_JSONRPC_ID_MAX_LEN 128
+#define SPDK_JSONRPC_MAX_CONNS 64
+#define SPDK_JSONRPC_MAX_VALUES 1024
+#define SPDK_JSONRPC_CLIENT_MAX_VALUES 8192
+
+struct spdk_jsonrpc_request {
+ struct spdk_jsonrpc_server_conn *conn;
+
+ /* Copy of request id value */
+ const struct spdk_json_val *id;
+
+ /* Total space allocated for send_buf */
+ size_t send_buf_size;
+
+ /* Number of bytes used in send_buf (<= send_buf_size) */
+ size_t send_len;
+
+ size_t send_offset;
+
+ uint8_t *recv_buffer;
+ struct spdk_json_val *values;
+ size_t values_cnt;
+
+ uint8_t *send_buf;
+
+ struct spdk_json_write_ctx *response;
+
+ STAILQ_ENTRY(spdk_jsonrpc_request) link;
+};
+
+struct spdk_jsonrpc_server_conn {
+ struct spdk_jsonrpc_server *server;
+ int sockfd;
+ bool closed;
+ size_t recv_len;
+ uint8_t recv_buf[SPDK_JSONRPC_RECV_BUF_SIZE];
+ uint32_t outstanding_requests;
+
+ pthread_spinlock_t queue_lock;
+ STAILQ_HEAD(, spdk_jsonrpc_request) send_queue;
+
+ struct spdk_jsonrpc_request *send_request;
+
+ spdk_jsonrpc_conn_closed_fn close_cb;
+ void *close_cb_ctx;
+
+ TAILQ_ENTRY(spdk_jsonrpc_server_conn) link;
+};
+
+struct spdk_jsonrpc_server {
+ int sockfd;
+ spdk_jsonrpc_handle_request_fn handle_request;
+
+ TAILQ_HEAD(, spdk_jsonrpc_server_conn) free_conns;
+ TAILQ_HEAD(, spdk_jsonrpc_server_conn) conns;
+
+ struct spdk_jsonrpc_server_conn conns_array[SPDK_JSONRPC_MAX_CONNS];
+};
+
+struct spdk_jsonrpc_client_request {
+ /* Total space allocated for send_buf */
+ size_t send_buf_size;
+
+ /* Number of bytes used in send_buf (<= send_buf_size) */
+ size_t send_len;
+
+ size_t send_offset;
+
+ uint8_t *send_buf;
+};
+
+struct spdk_jsonrpc_client_response_internal {
+ struct spdk_jsonrpc_client_response jsonrpc;
+ bool ready;
+ uint8_t *buf;
+ size_t values_cnt;
+ struct spdk_json_val values[];
+};
+
+struct spdk_jsonrpc_client {
+ int sockfd;
+ bool connected;
+
+ size_t recv_buf_size;
+ size_t recv_offset;
+ char *recv_buf;
+
+ /* Parsed response */
+ struct spdk_jsonrpc_client_response_internal *resp;
+ struct spdk_jsonrpc_client_request *request;
+};
+
+/* jsonrpc_server_tcp */
+void jsonrpc_server_handle_request(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *method,
+ const struct spdk_json_val *params);
+void jsonrpc_server_handle_error(struct spdk_jsonrpc_request *request, int error);
+
+/* Might be called from any thread */
+void jsonrpc_server_send_response(struct spdk_jsonrpc_request *request);
+
+/* jsonrpc_server */
+int jsonrpc_parse_request(struct spdk_jsonrpc_server_conn *conn, const void *json,
+ size_t size);
+
+/* Must be called only from server poll thread */
+void jsonrpc_free_request(struct spdk_jsonrpc_request *request);
+
+/*
+ * Parse JSON data as RPC command response.
+ *
+ * \param client Pointer to the JSON-RPC client structure.
+ *
+ * \return 1 if a complete response was parsed and is ready to be consumed,
+ * 0 if the received data does not yet form a complete JSON value
+ * (SPDK_JSON_PARSE_INCOMPLETE), or a negative error code:
+ * -EINVAL - The data has invalid JSON syntax and can't be parsed (SPDK_JSON_PARSE_INVALID).
+ * -ENOSPC - No space left to store the parsed response.
+ */
+int jsonrpc_parse_response(struct spdk_jsonrpc_client *client);
+
+#endif
diff --git a/src/spdk/lib/jsonrpc/jsonrpc_server.c b/src/spdk/lib/jsonrpc/jsonrpc_server.c
new file mode 100644
index 000000000..774612b25
--- /dev/null
+++ b/src/spdk/lib/jsonrpc/jsonrpc_server.c
@@ -0,0 +1,361 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "jsonrpc_internal.h"
+
+#include "spdk/util.h"
+
+struct jsonrpc_request {
+ const struct spdk_json_val *version;
+ const struct spdk_json_val *method;
+ const struct spdk_json_val *params;
+ const struct spdk_json_val *id;
+};
+
+static int
+capture_val(const struct spdk_json_val *val, void *out)
+{
+ const struct spdk_json_val **vptr = out;
+
+ *vptr = val;
+ return 0;
+}
+
+static const struct spdk_json_object_decoder jsonrpc_request_decoders[] = {
+ {"jsonrpc", offsetof(struct jsonrpc_request, version), capture_val, true},
+ {"method", offsetof(struct jsonrpc_request, method), capture_val},
+ {"params", offsetof(struct jsonrpc_request, params), capture_val, true},
+ {"id", offsetof(struct jsonrpc_request, id), capture_val, true},
+};
+
+static void
+parse_single_request(struct spdk_jsonrpc_request *request, struct spdk_json_val *values)
+{
+ struct jsonrpc_request req = {};
+ const struct spdk_json_val *params = NULL;
+
+ if (spdk_json_decode_object(values, jsonrpc_request_decoders,
+ SPDK_COUNTOF(jsonrpc_request_decoders),
+ &req)) {
+ goto invalid;
+ }
+
+ if (req.version && (req.version->type != SPDK_JSON_VAL_STRING ||
+ !spdk_json_strequal(req.version, "2.0"))) {
+ goto invalid;
+ }
+
+ if (!req.method || req.method->type != SPDK_JSON_VAL_STRING) {
+ goto invalid;
+ }
+
+ if (req.id) {
+ if (req.id->type == SPDK_JSON_VAL_STRING ||
+ req.id->type == SPDK_JSON_VAL_NUMBER ||
+ req.id->type == SPDK_JSON_VAL_NULL) {
+ request->id = req.id;
+ } else {
+ goto invalid;
+ }
+ }
+
+ if (req.params) {
+ /* null json value is as if there were no parameters */
+ if (req.params->type != SPDK_JSON_VAL_NULL) {
+ if (req.params->type != SPDK_JSON_VAL_ARRAY_BEGIN &&
+ req.params->type != SPDK_JSON_VAL_OBJECT_BEGIN) {
+ goto invalid;
+ }
+ params = req.params;
+ }
+ }
+
+ jsonrpc_server_handle_request(request, req.method, params);
+ return;
+
+invalid:
+ jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST);
+}
+
+static int
+jsonrpc_server_write_cb(void *cb_ctx, const void *data, size_t size)
+{
+ struct spdk_jsonrpc_request *request = cb_ctx;
+ size_t new_size = request->send_buf_size;
+
+ while (new_size - request->send_len < size) {
+ if (new_size >= SPDK_JSONRPC_SEND_BUF_SIZE_MAX) {
+ SPDK_ERRLOG("Send buf exceeded maximum size (%zu)\n",
+ (size_t)SPDK_JSONRPC_SEND_BUF_SIZE_MAX);
+ return -1;
+ }
+
+ new_size *= 2;
+ }
+
+ if (new_size != request->send_buf_size) {
+ uint8_t *new_buf;
+
+ new_buf = realloc(request->send_buf, new_size);
+ if (new_buf == NULL) {
+ SPDK_ERRLOG("Resizing send_buf failed (current size %zu, new size %zu)\n",
+ request->send_buf_size, new_size);
+ return -1;
+ }
+
+ request->send_buf = new_buf;
+ request->send_buf_size = new_size;
+ }
+
+ memcpy(request->send_buf + request->send_len, data, size);
+ request->send_len += size;
+
+ return 0;
+}
+
+int
+jsonrpc_parse_request(struct spdk_jsonrpc_server_conn *conn, const void *json, size_t size)
+{
+ struct spdk_jsonrpc_request *request;
+ ssize_t rc;
+ size_t len;
+ void *end = NULL;
+
+ /* Check to see if we have received a full JSON value. It is safe to cast away const
+ * as we don't decode in place. */
+ rc = spdk_json_parse((void *)json, size, NULL, 0, &end, 0);
+ if (rc == SPDK_JSON_PARSE_INCOMPLETE) {
+ return 0;
+ }
+
+ request = calloc(1, sizeof(*request));
+ if (request == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_RPC, "Out of memory allocating request\n");
+ return -1;
+ }
+
+ conn->outstanding_requests++;
+
+ request->conn = conn;
+
+ len = end - json;
+ request->recv_buffer = malloc(len + 1);
+ if (request->recv_buffer == NULL) {
+ SPDK_ERRLOG("Failed to allocate buffer to copy request (%zu bytes)\n", len + 1);
+ jsonrpc_free_request(request);
+ return -1;
+ }
+
+ memcpy(request->recv_buffer, json, len);
+ request->recv_buffer[len] = '\0';
+
+ if (rc > 0 && rc <= SPDK_JSONRPC_MAX_VALUES) {
+ request->values_cnt = rc;
+ request->values = malloc(request->values_cnt * sizeof(request->values[0]));
+ if (request->values == NULL) {
+ SPDK_ERRLOG("Failed to allocate buffer for JSON values (%zu bytes)\n",
+ request->values_cnt * sizeof(request->values[0]));
+ jsonrpc_free_request(request);
+ return -1;
+ }
+ }
+
+ request->send_offset = 0;
+ request->send_len = 0;
+ request->send_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT;
+ request->send_buf = malloc(request->send_buf_size);
+ if (request->send_buf == NULL) {
+ SPDK_ERRLOG("Failed to allocate send_buf (%zu bytes)\n", request->send_buf_size);
+ jsonrpc_free_request(request);
+ return -1;
+ }
+
+ request->response = spdk_json_write_begin(jsonrpc_server_write_cb, request, 0);
+ if (request->response == NULL) {
+ SPDK_ERRLOG("Failed to allocate response JSON write context.\n");
+ jsonrpc_free_request(request);
+ return -1;
+ }
+
+ if (rc <= 0 || rc > SPDK_JSONRPC_MAX_VALUES) {
+ SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON parse error\n");
+ jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_PARSE_ERROR);
+
+ /*
+ * Can't recover from parse error (no guaranteed resync point in streaming JSON).
+ * Return an error to indicate that the connection should be closed.
+ */
+ return -1;
+ }
+
+ /* Decode a second time now that there is a full JSON value available. */
+ rc = spdk_json_parse(request->recv_buffer, size, request->values, request->values_cnt, &end,
+ SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE);
+ if (rc < 0 || rc > SPDK_JSONRPC_MAX_VALUES) {
+ SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON parse error on second pass\n");
+ jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_PARSE_ERROR);
+ return -1;
+ }
+
+ assert(end != NULL);
+
+ if (request->values[0].type == SPDK_JSON_VAL_OBJECT_BEGIN) {
+ parse_single_request(request, request->values);
+ } else if (request->values[0].type == SPDK_JSON_VAL_ARRAY_BEGIN) {
+ SPDK_DEBUGLOG(SPDK_LOG_RPC, "Got batch array (not currently supported)\n");
+ jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_RPC, "top-level JSON value was not array or object\n");
+ jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST);
+ }
+
+ return len;
+}
+
+struct spdk_jsonrpc_server_conn *
+spdk_jsonrpc_get_conn(struct spdk_jsonrpc_request *request)
+{
+ return request->conn;
+}
+
+/* Never return NULL */
+static struct spdk_json_write_ctx *
+begin_response(struct spdk_jsonrpc_request *request)
+{
+ struct spdk_json_write_ctx *w = request->response;
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "jsonrpc", "2.0");
+
+ spdk_json_write_name(w, "id");
+ if (request->id) {
+ spdk_json_write_val(w, request->id);
+ } else {
+ spdk_json_write_null(w);
+ }
+
+ return w;
+}
+
+static void
+skip_response(struct spdk_jsonrpc_request *request)
+{
+ request->send_len = 0;
+ spdk_json_write_end(request->response);
+ request->response = NULL;
+ jsonrpc_server_send_response(request);
+}
+
+static void
+end_response(struct spdk_jsonrpc_request *request)
+{
+ spdk_json_write_object_end(request->response);
+ spdk_json_write_end(request->response);
+ request->response = NULL;
+
+ jsonrpc_server_write_cb(request, "\n", 1);
+ jsonrpc_server_send_response(request);
+}
+
+void
+jsonrpc_free_request(struct spdk_jsonrpc_request *request)
+{
+ if (!request) {
+ return;
+ }
+
+ /* We must send or skip response explicitly */
+ assert(request->response == NULL);
+
+ request->conn->outstanding_requests--;
+ free(request->recv_buffer);
+ free(request->values);
+ free(request->send_buf);
+ free(request);
+}
+
+struct spdk_json_write_ctx *
+spdk_jsonrpc_begin_result(struct spdk_jsonrpc_request *request)
+{
+ struct spdk_json_write_ctx *w = begin_response(request);
+
+ spdk_json_write_name(w, "result");
+ return w;
+}
+
+void
+spdk_jsonrpc_end_result(struct spdk_jsonrpc_request *request, struct spdk_json_write_ctx *w)
+{
+ assert(w != NULL);
+ assert(w == request->response);
+
+ /* If there was no ID in request we skip response. */
+ if (request->id && request->id->type != SPDK_JSON_VAL_NULL) {
+ end_response(request);
+ } else {
+ skip_response(request);
+ }
+}
+
+void
+spdk_jsonrpc_send_error_response(struct spdk_jsonrpc_request *request,
+ int error_code, const char *msg)
+{
+ struct spdk_json_write_ctx *w = begin_response(request);
+
+ spdk_json_write_named_object_begin(w, "error");
+ spdk_json_write_named_int32(w, "code", error_code);
+ spdk_json_write_named_string(w, "message", msg);
+ spdk_json_write_object_end(w);
+
+ end_response(request);
+}
+
+void
+spdk_jsonrpc_send_error_response_fmt(struct spdk_jsonrpc_request *request,
+ int error_code, const char *fmt, ...)
+{
+ struct spdk_json_write_ctx *w = begin_response(request);
+ va_list args;
+
+ spdk_json_write_named_object_begin(w, "error");
+ spdk_json_write_named_int32(w, "code", error_code);
+ va_start(args, fmt);
+ spdk_json_write_named_string_fmt_v(w, "message", fmt, args);
+ va_end(args);
+ spdk_json_write_object_end(w);
+
+ end_response(request);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("rpc", SPDK_LOG_RPC)
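A hedged sketch of a request handler built on the response helpers above; example_rpc_method is an illustrative name, and the function matches the shape of spdk_jsonrpc_handle_request_fn as invoked by jsonrpc_server_handle_request().

#include "spdk/jsonrpc.h"

/* Illustrative handler: answer {"status": "ok"}, or an error if params were supplied. */
static void
example_rpc_method(struct spdk_jsonrpc_request *request,
		   const struct spdk_json_val *method, const struct spdk_json_val *params)
{
	struct spdk_json_write_ctx *w;

	(void)method; /* A real dispatcher would route on the method name. */

	if (params != NULL) {
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
						 "this example method takes no parameters");
		return;
	}

	w = spdk_jsonrpc_begin_result(request);
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "status", "ok");
	spdk_json_write_object_end(w);
	spdk_jsonrpc_end_result(request, w);
}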
diff --git a/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c b/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c
new file mode 100644
index 000000000..1e38f713f
--- /dev/null
+++ b/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c
@@ -0,0 +1,441 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "jsonrpc_internal.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+struct spdk_jsonrpc_server *
+spdk_jsonrpc_server_listen(int domain, int protocol,
+ struct sockaddr *listen_addr, socklen_t addrlen,
+ spdk_jsonrpc_handle_request_fn handle_request)
+{
+ struct spdk_jsonrpc_server *server;
+ int rc, val, flag, i;
+
+ server = calloc(1, sizeof(struct spdk_jsonrpc_server));
+ if (server == NULL) {
+ return NULL;
+ }
+
+ TAILQ_INIT(&server->free_conns);
+ TAILQ_INIT(&server->conns);
+
+ for (i = 0; i < SPDK_JSONRPC_MAX_CONNS; i++) {
+ TAILQ_INSERT_TAIL(&server->free_conns, &server->conns_array[i], link);
+ }
+
+ server->handle_request = handle_request;
+
+ server->sockfd = socket(domain, SOCK_STREAM, protocol);
+ if (server->sockfd < 0) {
+ SPDK_ERRLOG("socket() failed\n");
+ free(server);
+ return NULL;
+ }
+
+ val = 1;
+ setsockopt(server->sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
+
+ flag = fcntl(server->sockfd, F_GETFL);
+ if (fcntl(server->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
+ server->sockfd, spdk_strerror(errno));
+ close(server->sockfd);
+ free(server);
+ return NULL;
+ }
+
+ rc = bind(server->sockfd, listen_addr, addrlen);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not bind JSON-RPC server: %s\n", spdk_strerror(errno));
+ close(server->sockfd);
+ free(server);
+ return NULL;
+ }
+
+ rc = listen(server->sockfd, 512);
+ if (rc != 0) {
+ SPDK_ERRLOG("listen() failed, errno = %d\n", errno);
+ close(server->sockfd);
+ free(server);
+ return NULL;
+ }
+
+ return server;
+}
+
+static struct spdk_jsonrpc_request *
+jsonrpc_server_dequeue_request(struct spdk_jsonrpc_server_conn *conn)
+{
+ struct spdk_jsonrpc_request *request = NULL;
+
+ pthread_spin_lock(&conn->queue_lock);
+ request = STAILQ_FIRST(&conn->send_queue);
+ if (request) {
+ STAILQ_REMOVE_HEAD(&conn->send_queue, link);
+ }
+ pthread_spin_unlock(&conn->queue_lock);
+ return request;
+}
+
+static void
+jsonrpc_server_free_conn_request(struct spdk_jsonrpc_server_conn *conn)
+{
+ struct spdk_jsonrpc_request *request;
+
+ jsonrpc_free_request(conn->send_request);
+ conn->send_request = NULL;
+ while ((request = jsonrpc_server_dequeue_request(conn)) != NULL) {
+ jsonrpc_free_request(request);
+ }
+}
+
+static void
+jsonrpc_server_conn_close(struct spdk_jsonrpc_server_conn *conn)
+{
+ conn->closed = true;
+
+ if (conn->sockfd >= 0) {
+ jsonrpc_server_free_conn_request(conn);
+ close(conn->sockfd);
+ conn->sockfd = -1;
+
+ if (conn->close_cb) {
+ conn->close_cb(conn, conn->close_cb_ctx);
+ }
+ }
+}
+
+void
+spdk_jsonrpc_server_shutdown(struct spdk_jsonrpc_server *server)
+{
+ struct spdk_jsonrpc_server_conn *conn;
+
+ close(server->sockfd);
+
+ TAILQ_FOREACH(conn, &server->conns, link) {
+ jsonrpc_server_conn_close(conn);
+ }
+
+ free(server);
+}
+
+static void
+jsonrpc_server_conn_remove(struct spdk_jsonrpc_server_conn *conn)
+{
+ struct spdk_jsonrpc_server *server = conn->server;
+
+ jsonrpc_server_conn_close(conn);
+
+ pthread_spin_destroy(&conn->queue_lock);
+ assert(STAILQ_EMPTY(&conn->send_queue));
+
+ TAILQ_REMOVE(&server->conns, conn, link);
+ TAILQ_INSERT_HEAD(&server->free_conns, conn, link);
+}
+
+int
+spdk_jsonrpc_conn_add_close_cb(struct spdk_jsonrpc_server_conn *conn,
+ spdk_jsonrpc_conn_closed_fn cb, void *ctx)
+{
+ int rc = 0;
+
+ pthread_spin_lock(&conn->queue_lock);
+ if (conn->close_cb == NULL) {
+ conn->close_cb = cb;
+ conn->close_cb_ctx = ctx;
+ } else {
+ rc = conn->close_cb == cb && conn->close_cb_ctx == ctx ? -EEXIST : -ENOSPC;
+ }
+ pthread_spin_unlock(&conn->queue_lock);
+
+ return rc;
+}
+
+int
+spdk_jsonrpc_conn_del_close_cb(struct spdk_jsonrpc_server_conn *conn,
+ spdk_jsonrpc_conn_closed_fn cb, void *ctx)
+{
+ int rc = 0;
+
+ pthread_spin_lock(&conn->queue_lock);
+ if (conn->close_cb == NULL || conn->close_cb != cb || conn->close_cb_ctx != ctx) {
+ rc = -ENOENT;
+ } else {
+ conn->close_cb = NULL;
+ }
+ pthread_spin_unlock(&conn->queue_lock);
+
+ return rc;
+}
+
+static int
+jsonrpc_server_accept(struct spdk_jsonrpc_server *server)
+{
+ struct spdk_jsonrpc_server_conn *conn;
+ int rc, flag;
+
+ rc = accept(server->sockfd, NULL, NULL);
+ if (rc >= 0) {
+ conn = TAILQ_FIRST(&server->free_conns);
+ assert(conn != NULL);
+
+ conn->server = server;
+ conn->sockfd = rc;
+ conn->closed = false;
+ conn->recv_len = 0;
+ conn->outstanding_requests = 0;
+ STAILQ_INIT(&conn->send_queue);
+ conn->send_request = NULL;
+
+ if (pthread_spin_init(&conn->queue_lock, PTHREAD_PROCESS_PRIVATE)) {
+ SPDK_ERRLOG("Unable to create queue lock for socket: %d", conn->sockfd);
+ close(conn->sockfd);
+ return -1;
+ }
+
+ flag = fcntl(conn->sockfd, F_GETFL);
+ if (fcntl(conn->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
+ conn->sockfd, spdk_strerror(errno));
+ close(conn->sockfd);
+ pthread_spin_destroy(&conn->queue_lock);
+ return -1;
+ }
+
+ TAILQ_REMOVE(&server->free_conns, conn, link);
+ TAILQ_INSERT_TAIL(&server->conns, conn, link);
+ return 0;
+ }
+
+ if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) {
+ return 0;
+ }
+
+ return -1;
+}
+
+void
+jsonrpc_server_handle_request(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *method, const struct spdk_json_val *params)
+{
+ request->conn->server->handle_request(request, method, params);
+}
+
+void
+jsonrpc_server_handle_error(struct spdk_jsonrpc_request *request, int error)
+{
+ const char *msg;
+
+ switch (error) {
+ case SPDK_JSONRPC_ERROR_PARSE_ERROR:
+ msg = "Parse error";
+ break;
+
+ case SPDK_JSONRPC_ERROR_INVALID_REQUEST:
+ msg = "Invalid request";
+ break;
+
+ case SPDK_JSONRPC_ERROR_METHOD_NOT_FOUND:
+ msg = "Method not found";
+ break;
+
+ case SPDK_JSONRPC_ERROR_INVALID_PARAMS:
+ msg = "Invalid parameters";
+ break;
+
+ case SPDK_JSONRPC_ERROR_INTERNAL_ERROR:
+ msg = "Internal error";
+ break;
+
+ default:
+ msg = "Error";
+ break;
+ }
+
+ spdk_jsonrpc_send_error_response(request, error, msg);
+}
+
+static int
+jsonrpc_server_conn_recv(struct spdk_jsonrpc_server_conn *conn)
+{
+ ssize_t rc, offset;
+ size_t recv_avail = SPDK_JSONRPC_RECV_BUF_SIZE - conn->recv_len;
+
+ rc = recv(conn->sockfd, conn->recv_buf + conn->recv_len, recv_avail, 0);
+ if (rc == -1) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) {
+ return 0;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_RPC, "recv() failed: %s\n", spdk_strerror(errno));
+ return -1;
+ }
+
+ if (rc == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_RPC, "remote closed connection\n");
+ conn->closed = true;
+ return 0;
+ }
+
+ conn->recv_len += rc;
+
+ offset = 0;
+ do {
+ rc = jsonrpc_parse_request(conn, conn->recv_buf + offset, conn->recv_len - offset);
+ if (rc < 0) {
+ SPDK_ERRLOG("jsonrpc parse request failed\n");
+ return -1;
+ }
+
+ offset += rc;
+ } while (rc > 0);
+
+ if (offset > 0) {
+	if (offset > 0) {
+		/*
+		 * Successfully parsed one or more requests - move any data past the
+		 * end of the parsed requests down to the beginning of the buffer.
+		 */
+ assert((size_t)offset <= conn->recv_len);
+ memmove(conn->recv_buf, conn->recv_buf + offset, conn->recv_len - offset);
+ conn->recv_len -= offset;
+ }
+
+ return 0;
+}
+
+void
+jsonrpc_server_send_response(struct spdk_jsonrpc_request *request)
+{
+ struct spdk_jsonrpc_server_conn *conn = request->conn;
+
+ /* Queue the response to be sent */
+ pthread_spin_lock(&conn->queue_lock);
+ STAILQ_INSERT_TAIL(&conn->send_queue, request, link);
+ pthread_spin_unlock(&conn->queue_lock);
+}
+
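+/*
+ * Drains the per-connection send queue that jsonrpc_server_send_response()
+ * fills under queue_lock. Writes happen from whichever thread calls
+ * spdk_jsonrpc_server_poll(); the spinlock only guards the queue itself,
+ * which suggests responses may be queued from a different thread than the
+ * one doing the polling.
+ */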
+static int
+jsonrpc_server_conn_send(struct spdk_jsonrpc_server_conn *conn)
+{
+ struct spdk_jsonrpc_request *request;
+ ssize_t rc;
+
+more:
+ if (conn->outstanding_requests == 0) {
+ return 0;
+ }
+
+ if (conn->send_request == NULL) {
+ conn->send_request = jsonrpc_server_dequeue_request(conn);
+ }
+
+ request = conn->send_request;
+ if (request == NULL) {
+ /* Nothing to send right now */
+ return 0;
+ }
+
+ if (request->send_len > 0) {
+ rc = send(conn->sockfd, request->send_buf + request->send_offset,
+ request->send_len, 0);
+ if (rc < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) {
+ return 0;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_RPC, "send() failed: %s\n", spdk_strerror(errno));
+ return -1;
+ }
+
+ request->send_offset += rc;
+ request->send_len -= rc;
+ }
+
+ if (request->send_len == 0) {
+ /*
+ * Full response has been sent.
+ * Free it and set send_request to NULL to move on to the next queued response.
+ */
+ conn->send_request = NULL;
+ jsonrpc_free_request(request);
+ goto more;
+ }
+
+ return 0;
+}
+
+int
+spdk_jsonrpc_server_poll(struct spdk_jsonrpc_server *server)
+{
+ int rc;
+ struct spdk_jsonrpc_server_conn *conn, *conn_tmp;
+
+ TAILQ_FOREACH_SAFE(conn, &server->conns, link, conn_tmp) {
+		/* If we can't receive and there are no outstanding requests, close the connection. */
+ if (conn->closed == true && conn->outstanding_requests == 0) {
+ jsonrpc_server_conn_close(conn);
+ }
+
+ if (conn->sockfd == -1 && conn->outstanding_requests == 0) {
+ jsonrpc_server_conn_remove(conn);
+ }
+ }
+
+ /* Check listen socket */
+ if (!TAILQ_EMPTY(&server->free_conns)) {
+ jsonrpc_server_accept(server);
+ }
+
+ TAILQ_FOREACH(conn, &server->conns, link) {
+ if (conn->sockfd == -1) {
+ continue;
+ }
+
+ rc = jsonrpc_server_conn_send(conn);
+ if (rc != 0) {
+ jsonrpc_server_conn_close(conn);
+ continue;
+ }
+
+ if (!conn->closed) {
+ rc = jsonrpc_server_conn_recv(conn);
+ if (rc != 0) {
+ jsonrpc_server_conn_close(conn);
+ }
+ }
+ }
+
+ return 0;
+}
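+
+/*
+ * Minimal usage sketch, illustrative only: the server is expected to be
+ * polled periodically from a single thread, for example from an SPDK
+ * poller. The function and variable names below are hypothetical.
+ *
+ *   static int rpc_poller_fn(void *arg)
+ *   {
+ *           struct spdk_jsonrpc_server *server = arg;
+ *
+ *           spdk_jsonrpc_server_poll(server);
+ *           return 0;
+ *   }
+ */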
diff --git a/src/spdk/lib/jsonrpc/spdk_jsonrpc.map b/src/spdk/lib/jsonrpc/spdk_jsonrpc.map
new file mode 100644
index 000000000..461fd0766
--- /dev/null
+++ b/src/spdk/lib/jsonrpc/spdk_jsonrpc.map
@@ -0,0 +1,28 @@
+{
+ global:
+
+ # public functions
+ spdk_jsonrpc_server_listen;
+ spdk_jsonrpc_server_poll;
+ spdk_jsonrpc_server_shutdown;
+ spdk_jsonrpc_get_conn;
+ spdk_jsonrpc_conn_add_close_cb;
+ spdk_jsonrpc_conn_del_close_cb;
+ spdk_jsonrpc_begin_result;
+ spdk_jsonrpc_end_result;
+ spdk_jsonrpc_send_error_response;
+ spdk_jsonrpc_send_error_response_fmt;
+ spdk_jsonrpc_begin_request;
+ spdk_jsonrpc_end_request;
+ spdk_jsonrpc_client_connect;
+ spdk_jsonrpc_client_close;
+ spdk_jsonrpc_client_create_request;
+ spdk_jsonrpc_client_free_request;
+ spdk_jsonrpc_client_send_request;
+ spdk_jsonrpc_client_poll;
+ spdk_jsonrpc_client_get_response;
+ spdk_jsonrpc_client_free_response;
+
+
+ local: *;
+};
diff --git a/src/spdk/lib/log/Makefile b/src/spdk/lib/log/Makefile
new file mode 100644
index 000000000..4e7c25758
--- /dev/null
+++ b/src/spdk/lib/log/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+SO_SUFFIX := $(SO_VER).$(SO_MINOR)
+
+C_SRCS = log.c log_flags.c
+LIBNAME = log
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_log.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/log/log.c b/src/spdk/lib/log/log.c
new file mode 100644
index 000000000..0ab50d69c
--- /dev/null
+++ b/src/spdk/lib/log/log.c
@@ -0,0 +1,203 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/log.h"
+
+static const char *const spdk_level_names[] = {
+ [SPDK_LOG_ERROR] = "ERROR",
+ [SPDK_LOG_WARN] = "WARNING",
+ [SPDK_LOG_NOTICE] = "NOTICE",
+ [SPDK_LOG_INFO] = "INFO",
+ [SPDK_LOG_DEBUG] = "DEBUG",
+};
+
+#define MAX_TMPBUF 1024
+
+static logfunc *g_log = NULL;
+
+void
+spdk_log_open(logfunc *logf)
+{
+ if (logf) {
+ g_log = logf;
+ } else {
+ openlog("spdk", LOG_PID, LOG_LOCAL7);
+ }
+}
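+
+/*
+ * Illustrative only: instead of syslog, a custom hook can be installed via
+ * spdk_log_open(). The hook below is hypothetical; its parameters mirror the
+ * g_log() invocation in spdk_vlog() further down (the exact prototype is the
+ * logfunc typedef from the log header).
+ *
+ *   static void my_log_hook(int level, const char *file, const int line,
+ *                           const char *func, const char *format, va_list args)
+ *   {
+ *           vfprintf(stdout, format, args);
+ *   }
+ *
+ *   spdk_log_open(my_log_hook);
+ *   ...
+ *   spdk_log_close();
+ */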
+
+void
+spdk_log_close(void)
+{
+ if (!g_log) {
+ closelog();
+ }
+}
+
+static void
+get_timestamp_prefix(char *buf, int buf_size)
+{
+ struct tm *info;
+ char date[24];
+ struct timespec ts;
+ long usec;
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+ info = localtime(&ts.tv_sec);
+ usec = ts.tv_nsec / 1000;
+ if (info == NULL) {
+ snprintf(buf, buf_size, "[%s.%06ld] ", "unknown date", usec);
+ return;
+ }
+
+ strftime(date, sizeof(date), "%Y-%m-%d %H:%M:%S", info);
+ snprintf(buf, buf_size, "[%s.%06ld] ", date, usec);
+}
+
+void
+spdk_log(enum spdk_log_level level, const char *file, const int line, const char *func,
+ const char *format, ...)
+{
+ va_list ap;
+
+ va_start(ap, format);
+ spdk_vlog(level, file, line, func, format, ap);
+ va_end(ap);
+}
+
+void
+spdk_vlog(enum spdk_log_level level, const char *file, const int line, const char *func,
+ const char *format, va_list ap)
+{
+ int severity = LOG_INFO;
+ char buf[MAX_TMPBUF];
+ char timestamp[64];
+
+ if (g_log) {
+ g_log(level, file, line, func, format, ap);
+ return;
+ }
+
+ if (level > g_spdk_log_print_level && level > g_spdk_log_level) {
+ return;
+ }
+
+ switch (level) {
+ case SPDK_LOG_ERROR:
+ severity = LOG_ERR;
+ break;
+ case SPDK_LOG_WARN:
+ severity = LOG_WARNING;
+ break;
+ case SPDK_LOG_NOTICE:
+ severity = LOG_NOTICE;
+ break;
+ case SPDK_LOG_INFO:
+ case SPDK_LOG_DEBUG:
+ severity = LOG_INFO;
+ break;
+ case SPDK_LOG_DISABLED:
+ return;
+ }
+
+ vsnprintf(buf, sizeof(buf), format, ap);
+
+ if (level <= g_spdk_log_print_level) {
+ get_timestamp_prefix(timestamp, sizeof(timestamp));
+ if (file) {
+ fprintf(stderr, "%s%s:%4d:%s: *%s*: %s", timestamp, file, line, func, spdk_level_names[level], buf);
+ } else {
+ fprintf(stderr, "%s%s", timestamp, buf);
+ }
+ }
+
+ if (level <= g_spdk_log_level) {
+ if (file) {
+ syslog(severity, "%s:%4d:%s: *%s*: %s", file, line, func, spdk_level_names[level], buf);
+ } else {
+ syslog(severity, "%s", buf);
+ }
+ }
+}
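+
+/*
+ * Note: in the default (no custom hook) path above, g_spdk_log_print_level
+ * gates what reaches stderr and g_spdk_log_level gates what is forwarded to
+ * syslog; a message below both thresholds is dropped early.
+ */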
+
+static void
+fdump(FILE *fp, const char *label, const uint8_t *buf, size_t len)
+{
+ char tmpbuf[MAX_TMPBUF];
+ char buf16[16 + 1];
+ size_t total;
+ unsigned int idx;
+
+ fprintf(fp, "%s\n", label);
+
+ memset(buf16, 0, sizeof buf16);
+ total = 0;
+ for (idx = 0; idx < len; idx++) {
+ if (idx != 0 && idx % 16 == 0) {
+ snprintf(tmpbuf + total, sizeof tmpbuf - total,
+ " %s", buf16);
+ memset(buf16, 0, sizeof buf16);
+ fprintf(fp, "%s\n", tmpbuf);
+ total = 0;
+ }
+ if (idx % 16 == 0) {
+ total += snprintf(tmpbuf + total, sizeof tmpbuf - total,
+ "%08x ", idx);
+ }
+ if (idx % 8 == 0) {
+ total += snprintf(tmpbuf + total, sizeof tmpbuf - total,
+ "%s", " ");
+ }
+ total += snprintf(tmpbuf + total, sizeof tmpbuf - total,
+ "%2.2x ", buf[idx] & 0xff);
+ buf16[idx % 16] = isprint(buf[idx]) ? buf[idx] : '.';
+ }
+ for (; idx % 16 != 0; idx++) {
+ if (idx == 8) {
+ total += snprintf(tmpbuf + total, sizeof tmpbuf - total,
+ " ");
+ }
+
+ total += snprintf(tmpbuf + total, sizeof tmpbuf - total, " ");
+ }
+ snprintf(tmpbuf + total, sizeof tmpbuf - total, " %s", buf16);
+ fprintf(fp, "%s\n", tmpbuf);
+ fflush(fp);
+}
+
+void
+spdk_log_dump(FILE *fp, const char *label, const void *buf, size_t len)
+{
+ fdump(fp, label, buf, len);
+}
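+
+/*
+ * For reference, fdump() produces a conventional hex dump: an offset column,
+ * up to sixteen hex bytes split into two groups of eight, and a
+ * printable-ASCII column. Roughly (spacing illustrative):
+ *
+ *   label
+ *   00000000  48 65 6c 6c 6f 2c 20 53  50 44 4b 21                Hello, SPDK!
+ */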
diff --git a/src/spdk/lib/log/log_flags.c b/src/spdk/lib/log/log_flags.c
new file mode 100644
index 000000000..c767a3786
--- /dev/null
+++ b/src/spdk/lib/log/log_flags.c
@@ -0,0 +1,188 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/log.h"
+
+static TAILQ_HEAD(, spdk_log_flag) g_log_flags = TAILQ_HEAD_INITIALIZER(g_log_flags);
+
+enum spdk_log_level g_spdk_log_level = SPDK_LOG_NOTICE;
+enum spdk_log_level g_spdk_log_print_level = SPDK_LOG_NOTICE;
+
+SPDK_LOG_REGISTER_COMPONENT("log", SPDK_LOG_LOG)
+
+#define MAX_TMPBUF 1024
+
+void
+spdk_log_set_level(enum spdk_log_level level)
+{
+ assert(level >= SPDK_LOG_DISABLED);
+ assert(level <= SPDK_LOG_DEBUG);
+ g_spdk_log_level = level;
+}
+
+enum spdk_log_level
+spdk_log_get_level(void) {
+ return g_spdk_log_level;
+}
+
+void
+spdk_log_set_print_level(enum spdk_log_level level)
+{
+ assert(level >= SPDK_LOG_DISABLED);
+ assert(level <= SPDK_LOG_DEBUG);
+ g_spdk_log_print_level = level;
+}
+
+enum spdk_log_level
+spdk_log_get_print_level(void) {
+ return g_spdk_log_print_level;
+}
+
+static struct spdk_log_flag *
+get_log_flag(const char *name)
+{
+ struct spdk_log_flag *flag;
+
+ TAILQ_FOREACH(flag, &g_log_flags, tailq) {
+ if (strcasecmp(name, flag->name) == 0) {
+ return flag;
+ }
+ }
+
+ return NULL;
+}
+
+void
+spdk_log_register_flag(const char *name, struct spdk_log_flag *flag)
+{
+ struct spdk_log_flag *iter;
+
+ if (name == NULL || flag == NULL) {
+ SPDK_ERRLOG("missing spdk_log_flag parameters\n");
+ assert(false);
+ return;
+ }
+
+ if (get_log_flag(name)) {
+ SPDK_ERRLOG("duplicate spdk_log_flag '%s'\n", name);
+ assert(false);
+ return;
+ }
+
+ TAILQ_FOREACH(iter, &g_log_flags, tailq) {
+ if (strcasecmp(iter->name, flag->name) > 0) {
+ TAILQ_INSERT_BEFORE(iter, flag, tailq);
+ return;
+ }
+ }
+
+ TAILQ_INSERT_TAIL(&g_log_flags, flag, tailq);
+}
+
+bool
+spdk_log_get_flag(const char *name)
+{
+ struct spdk_log_flag *flag = get_log_flag(name);
+
+ if (flag && flag->enabled) {
+ return true;
+ }
+
+ return false;
+}
+
+static int
+log_set_flag(const char *name, bool value)
+{
+ struct spdk_log_flag *flag;
+
+ if (strcasecmp(name, "all") == 0) {
+ TAILQ_FOREACH(flag, &g_log_flags, tailq) {
+ flag->enabled = value;
+ }
+ return 0;
+ }
+
+ flag = get_log_flag(name);
+ if (flag == NULL) {
+ return -1;
+ }
+
+ flag->enabled = value;
+
+ return 0;
+}
+
+int
+spdk_log_set_flag(const char *name)
+{
+ return log_set_flag(name, true);
+}
+
+int
+spdk_log_clear_flag(const char *name)
+{
+ return log_set_flag(name, false);
+}
+
+struct spdk_log_flag *
+spdk_log_get_first_flag(void)
+{
+ return TAILQ_FIRST(&g_log_flags);
+}
+
+struct spdk_log_flag *
+spdk_log_get_next_flag(struct spdk_log_flag *flag)
+{
+ return TAILQ_NEXT(flag, tailq);
+}
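+
+/*
+ * Illustrative iteration over all registered flags using the two accessors
+ * above (this mirrors what the log_get_flags RPC does):
+ *
+ *   struct spdk_log_flag *flag;
+ *
+ *   for (flag = spdk_log_get_first_flag(); flag != NULL;
+ *        flag = spdk_log_get_next_flag(flag)) {
+ *           printf("%s=%s\n", flag->name, flag->enabled ? "on" : "off");
+ *   }
+ */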
+
+void
+spdk_log_usage(FILE *f, const char *log_arg)
+{
+#ifdef DEBUG
+ struct spdk_log_flag *flag;
+ fprintf(f, " %s, --logflag <flag> enable debug log flag (all", log_arg);
+
+ TAILQ_FOREACH(flag, &g_log_flags, tailq) {
+ fprintf(f, ", %s", flag->name);
+ }
+
+ fprintf(f, ")\n");
+#else
+ fprintf(f, " %s, --logflag <flag> enable debug log flag (not supported"
+ " - must reconfigure with --enable-debug)\n", log_arg);
+#endif
+}
diff --git a/src/spdk/lib/log/spdk_log.map b/src/spdk/lib/log/spdk_log.map
new file mode 100644
index 000000000..84629d555
--- /dev/null
+++ b/src/spdk/lib/log/spdk_log.map
@@ -0,0 +1,25 @@
+{
+ global:
+
+ # public functions
+ spdk_log_open;
+ spdk_log_close;
+ spdk_log_set_level;
+ spdk_log_get_level;
+ spdk_log_set_print_level;
+ spdk_log_get_print_level;
+ spdk_log;
+ spdk_vlog;
+ spdk_log_dump;
+ spdk_log_get_flag;
+ spdk_log_set_flag;
+ spdk_log_clear_flag;
+ spdk_log_usage;
+
+ # functions used by other SPDK libraries
+ spdk_log_register_flag;
+ spdk_log_get_first_flag;
+ spdk_log_get_next_flag;
+
+ local: *;
+};
diff --git a/src/spdk/lib/log_rpc/Makefile b/src/spdk/lib/log_rpc/Makefile
new file mode 100644
index 000000000..2c7a78deb
--- /dev/null
+++ b/src/spdk/lib/log_rpc/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = log_rpc.c
+LIBNAME = log_rpc
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_log_rpc.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/log_rpc/log_rpc.c b/src/spdk/lib/log_rpc/log_rpc.c
new file mode 100644
index 000000000..78b74c1f5
--- /dev/null
+++ b/src/spdk/lib/log_rpc/log_rpc.c
@@ -0,0 +1,340 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+struct rpc_log_flag {
+ char *flag;
+};
+
+struct rpc_log_level {
+ char *level;
+};
+
+static void
+free_rpc_log_flag(struct rpc_log_flag *p)
+{
+ free(p->flag);
+}
+
+static void
+free_rpc_log_level(struct rpc_log_level *p)
+{
+ free(p->level);
+}
+
+static const struct spdk_json_object_decoder rpc_log_flag_decoders[] = {
+ {"flag", offsetof(struct rpc_log_flag, flag), spdk_json_decode_string},
+};
+
+static const struct spdk_json_object_decoder rpc_log_level_decoders[] = {
+ {"level", offsetof(struct rpc_log_level, level), spdk_json_decode_string},
+};
+
+static int
+_parse_log_level(char *level)
+{
+ if (!strcasecmp(level, "ERROR")) {
+ return SPDK_LOG_ERROR;
+ } else if (!strcasecmp(level, "WARNING")) {
+ return SPDK_LOG_WARN;
+ } else if (!strcasecmp(level, "NOTICE")) {
+ return SPDK_LOG_NOTICE;
+ } else if (!strcasecmp(level, "INFO")) {
+ return SPDK_LOG_INFO;
+ } else if (!strcasecmp(level, "DEBUG")) {
+ return SPDK_LOG_DEBUG;
+ }
+ return -1;
+}
+
+static const char *
+_log_get_level_name(int level)
+{
+ if (level == SPDK_LOG_ERROR) {
+ return "ERROR";
+ } else if (level == SPDK_LOG_WARN) {
+ return "WARNING";
+ } else if (level == SPDK_LOG_NOTICE) {
+ return "NOTICE";
+ } else if (level == SPDK_LOG_INFO) {
+ return "INFO";
+ } else if (level == SPDK_LOG_DEBUG) {
+ return "DEBUG";
+ }
+ return NULL;
+}
+
+static void
+rpc_log_set_print_level(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_log_level req = {};
+ int level;
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_log_level_decoders,
+ SPDK_COUNTOF(rpc_log_level_decoders), &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto invalid;
+ }
+
+ level = _parse_log_level(req.level);
+ if (level == -1) {
+ SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "tried to set invalid log level\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "invalid log level");
+ goto invalid;
+ }
+
+ spdk_log_set_print_level(level);
+ free_rpc_log_level(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_log_level(&req);
+}
+SPDK_RPC_REGISTER("log_set_print_level", rpc_log_set_print_level,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_print_level, set_log_print_level)
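+
+/*
+ * Example request for the RPC registered above (illustrative):
+ *
+ *   {"jsonrpc": "2.0", "id": 1, "method": "log_set_print_level",
+ *    "params": {"level": "DEBUG"}}
+ *
+ * On success the handler replies with a boolean "true" result.
+ */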
+
+static void
+rpc_log_get_print_level(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ int level;
+ const char *name;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "log_get_print_level requires no parameters");
+ return;
+ }
+
+ level = spdk_log_get_print_level();
+ name = _log_get_level_name(level);
+ if (name == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "invalid log level");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, name);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("log_get_print_level", rpc_log_get_print_level,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_print_level, get_log_print_level)
+
+static void
+rpc_log_set_level(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_log_level req = {};
+ int level;
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_log_level_decoders,
+ SPDK_COUNTOF(rpc_log_level_decoders), &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto invalid;
+ }
+
+ level = _parse_log_level(req.level);
+ if (level == -1) {
+ SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "tried to set invalid log level\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "invalid log level");
+ goto invalid;
+ }
+
+ spdk_log_set_level(level);
+ free_rpc_log_level(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_log_level(&req);
+}
+SPDK_RPC_REGISTER("log_set_level", rpc_log_set_level, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_level, set_log_level)
+
+static void
+rpc_log_get_level(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ int level;
+ const char *name;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "log_get_level requires no parameters");
+ return;
+ }
+
+ level = spdk_log_get_level();
+ name = _log_get_level_name(level);
+ if (name == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "invalid log level");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, name);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("log_get_level", rpc_log_get_level, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_level, get_log_level)
+
+static void
+rpc_log_set_flag(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_log_flag req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_log_flag_decoders,
+ SPDK_COUNTOF(rpc_log_flag_decoders), &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto invalid;
+ }
+
+ if (req.flag == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "invalid flag 0\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "invalid flag 0");
+ goto invalid;
+ }
+
+ spdk_log_set_flag(req.flag);
+ free_rpc_log_flag(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_log_flag(&req);
+}
+SPDK_RPC_REGISTER("log_set_flag", rpc_log_set_flag, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_set_flag, set_log_flag)
+
+static void
+rpc_log_clear_flag(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_log_flag req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_log_flag_decoders,
+ SPDK_COUNTOF(rpc_log_flag_decoders), &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto invalid;
+ }
+
+ if (req.flag == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_LOG_RPC, "Invalid flag 0\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "invalid flag 0");
+ goto invalid;
+ }
+
+ spdk_log_clear_flag(req.flag);
+ free_rpc_log_flag(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_log_flag(&req);
+}
+SPDK_RPC_REGISTER("log_clear_flag", rpc_log_clear_flag,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_clear_flag, clear_log_flag)
+
+static void
+rpc_log_get_flags(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_log_flag *flag;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "log_get_flags requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_object_begin(w);
+ flag = spdk_log_get_first_flag();
+ while (flag) {
+ spdk_json_write_name(w, flag->name);
+ spdk_json_write_bool(w, flag->enabled);
+ flag = spdk_log_get_next_flag(flag);
+ }
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("log_get_flags", rpc_log_get_flags, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(log_get_flags, get_log_flags)
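+
+/*
+ * log_get_flags returns an object mapping each registered flag name to its
+ * enabled state, e.g. (illustrative):
+ *
+ *   {"log_rpc": false, "lvol": true}
+ */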
+
+SPDK_LOG_REGISTER_COMPONENT("log_rpc", SPDK_LOG_LOG_RPC)
diff --git a/src/spdk/lib/log_rpc/spdk_log_rpc.map b/src/spdk/lib/log_rpc/spdk_log_rpc.map
new file mode 100644
index 000000000..8bee6cdd3
--- /dev/null
+++ b/src/spdk/lib/log_rpc/spdk_log_rpc.map
@@ -0,0 +1,3 @@
+{
+ local: *;
+};
diff --git a/src/spdk/lib/lvol/Makefile b/src/spdk/lib/lvol/Makefile
new file mode 100644
index 000000000..c370a19a5
--- /dev/null
+++ b/src/spdk/lib/lvol/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = lvol.c
+LIBNAME = lvol
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_lvol.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/lvol/lvol.c b/src/spdk/lib/lvol/lvol.c
new file mode 100644
index 000000000..50b42d7b0
--- /dev/null
+++ b/src/spdk/lib/lvol/lvol.c
@@ -0,0 +1,1509 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk_internal/lvolstore.h"
+#include "spdk_internal/log.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/blob_bdev.h"
+#include "spdk/util.h"
+
+/* Default blob channel opts for lvol */
+#define SPDK_LVOL_BLOB_OPTS_CHANNEL_OPS 512
+
+#define LVOL_NAME "name"
+
+SPDK_LOG_REGISTER_COMPONENT("lvol", SPDK_LOG_LVOL)
+
+static TAILQ_HEAD(, spdk_lvol_store) g_lvol_stores = TAILQ_HEAD_INITIALIZER(g_lvol_stores);
+static pthread_mutex_t g_lvol_stores_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static int
+add_lvs_to_list(struct spdk_lvol_store *lvs)
+{
+ struct spdk_lvol_store *tmp;
+ bool name_conflict = false;
+
+ pthread_mutex_lock(&g_lvol_stores_mutex);
+ TAILQ_FOREACH(tmp, &g_lvol_stores, link) {
+ if (!strncmp(lvs->name, tmp->name, SPDK_LVS_NAME_MAX)) {
+ name_conflict = true;
+ break;
+ }
+ }
+ if (!name_conflict) {
+ lvs->on_list = true;
+ TAILQ_INSERT_TAIL(&g_lvol_stores, lvs, link);
+ }
+ pthread_mutex_unlock(&g_lvol_stores_mutex);
+
+ return name_conflict ? -1 : 0;
+}
+
+static void
+lvs_free(struct spdk_lvol_store *lvs)
+{
+ pthread_mutex_lock(&g_lvol_stores_mutex);
+ if (lvs->on_list) {
+ TAILQ_REMOVE(&g_lvol_stores, lvs, link);
+ }
+ pthread_mutex_unlock(&g_lvol_stores_mutex);
+
+ free(lvs);
+}
+
+static void
+lvol_free(struct spdk_lvol *lvol)
+{
+ free(lvol);
+}
+
+static void
+lvol_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno)
+{
+ struct spdk_lvol_with_handle_req *req = cb_arg;
+ struct spdk_lvol *lvol = req->lvol;
+
+ if (lvolerrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Failed to open lvol %s\n", lvol->unique_id);
+ goto end;
+ }
+
+ lvol->ref_count++;
+ lvol->blob = blob;
+end:
+ req->cb_fn(req->cb_arg, lvol, lvolerrno);
+ free(req);
+}
+
+void
+spdk_lvol_open(struct spdk_lvol *lvol, spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_with_handle_req *req;
+ struct spdk_blob_open_opts opts;
+
+ assert(cb_fn != NULL);
+
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ cb_fn(cb_arg, NULL, -ENODEV);
+ return;
+ }
+
+ if (lvol->action_in_progress == true) {
+ SPDK_ERRLOG("Cannot open lvol - operations on lvol pending\n");
+ cb_fn(cb_arg, lvol, -EBUSY);
+ return;
+ }
+
+ if (lvol->ref_count > 0) {
+ lvol->ref_count++;
+ cb_fn(cb_arg, lvol, 0);
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot alloc memory for request structure\n");
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->lvol = lvol;
+
+ spdk_blob_open_opts_init(&opts);
+ opts.clear_method = lvol->clear_method;
+
+ spdk_bs_open_blob_ext(lvol->lvol_store->blobstore, lvol->blob_id, &opts, lvol_open_cb, req);
+}
+
+static void
+bs_unload_with_error_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg;
+
+ req->cb_fn(req->cb_arg, NULL, req->lvserrno);
+ free(req);
+}
+
+static void
+load_next_lvol(void *cb_arg, struct spdk_blob *blob, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+ struct spdk_blob_store *bs = lvs->blobstore;
+ struct spdk_lvol *lvol, *tmp;
+ spdk_blob_id blob_id;
+ const char *attr;
+ size_t value_len;
+ int rc;
+
+ if (lvolerrno == -ENOENT) {
+ /* Finished iterating */
+ req->cb_fn(req->cb_arg, lvs, 0);
+ free(req);
+ return;
+ } else if (lvolerrno < 0) {
+ SPDK_ERRLOG("Failed to fetch blobs list\n");
+ req->lvserrno = lvolerrno;
+ goto invalid;
+ }
+
+ blob_id = spdk_blob_get_id(blob);
+
+ if (blob_id == lvs->super_blob_id) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "found superblob %"PRIu64"\n", (uint64_t)blob_id);
+ spdk_bs_iter_next(bs, blob, load_next_lvol, req);
+ return;
+ }
+
+ lvol = calloc(1, sizeof(*lvol));
+ if (!lvol) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n");
+ req->lvserrno = -ENOMEM;
+ goto invalid;
+ }
+
+ lvol->blob = blob;
+ lvol->blob_id = blob_id;
+ lvol->lvol_store = lvs;
+ lvol->thin_provision = spdk_blob_is_thin_provisioned(blob);
+
+ rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len);
+ if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || attr[SPDK_UUID_STRING_LEN - 1] != '\0' ||
+ spdk_uuid_parse(&lvol->uuid, attr) != 0) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Missing or corrupt lvol uuid\n");
+ memset(&lvol->uuid, 0, sizeof(lvol->uuid));
+ }
+ spdk_uuid_fmt_lower(lvol->uuid_str, sizeof(lvol->uuid_str), &lvol->uuid);
+
+ if (!spdk_mem_all_zero(&lvol->uuid, sizeof(lvol->uuid))) {
+ snprintf(lvol->unique_id, sizeof(lvol->unique_id), "%s", lvol->uuid_str);
+ } else {
+ spdk_uuid_fmt_lower(lvol->unique_id, sizeof(lvol->unique_id), &lvol->lvol_store->uuid);
+ value_len = strlen(lvol->unique_id);
+ snprintf(lvol->unique_id + value_len, sizeof(lvol->unique_id) - value_len, "_%"PRIu64,
+ (uint64_t)blob_id);
+ }
+
+ rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len);
+ if (rc != 0 || value_len > SPDK_LVOL_NAME_MAX) {
+ SPDK_ERRLOG("Cannot assign lvol name\n");
+ lvol_free(lvol);
+ req->lvserrno = -EINVAL;
+ goto invalid;
+ }
+
+ snprintf(lvol->name, sizeof(lvol->name), "%s", attr);
+
+ TAILQ_INSERT_TAIL(&lvs->lvols, lvol, link);
+
+ lvs->lvol_count++;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "added lvol %s (%s)\n", lvol->unique_id, lvol->uuid_str);
+
+ spdk_bs_iter_next(bs, blob, load_next_lvol, req);
+
+ return;
+
+invalid:
+ TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) {
+ TAILQ_REMOVE(&lvs->lvols, lvol, link);
+ free(lvol);
+ }
+
+ lvs_free(lvs);
+ spdk_bs_unload(bs, bs_unload_with_error_cb, req);
+}
+
+static void
+close_super_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+ struct spdk_blob_store *bs = lvs->blobstore;
+
+ if (lvolerrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Could not close super blob\n");
+ lvs_free(lvs);
+ req->lvserrno = -ENODEV;
+ spdk_bs_unload(bs, bs_unload_with_error_cb, req);
+ return;
+ }
+
+ /* Start loading lvols */
+ spdk_bs_iter_first(lvs->blobstore, load_next_lvol, req);
+}
+
+static void
+close_super_blob_with_error_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+ struct spdk_blob_store *bs = lvs->blobstore;
+
+ lvs_free(lvs);
+
+ spdk_bs_unload(bs, bs_unload_with_error_cb, req);
+}
+
+static void
+lvs_read_uuid(void *cb_arg, struct spdk_blob *blob, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+ struct spdk_blob_store *bs = lvs->blobstore;
+ const char *attr;
+ size_t value_len;
+ int rc;
+
+ if (lvolerrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Could not open super blob\n");
+ lvs_free(lvs);
+ req->lvserrno = -ENODEV;
+ spdk_bs_unload(bs, bs_unload_with_error_cb, req);
+ return;
+ }
+
+ rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len);
+ if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || attr[SPDK_UUID_STRING_LEN - 1] != '\0') {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "missing or incorrect UUID\n");
+ req->lvserrno = -EINVAL;
+ spdk_blob_close(blob, close_super_blob_with_error_cb, req);
+ return;
+ }
+
+ if (spdk_uuid_parse(&lvs->uuid, attr)) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "incorrect UUID '%s'\n", attr);
+ req->lvserrno = -EINVAL;
+ spdk_blob_close(blob, close_super_blob_with_error_cb, req);
+ return;
+ }
+
+ rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len);
+ if (rc != 0 || value_len > SPDK_LVS_NAME_MAX) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "missing or invalid name\n");
+ req->lvserrno = -EINVAL;
+ spdk_blob_close(blob, close_super_blob_with_error_cb, req);
+ return;
+ }
+
+ snprintf(lvs->name, sizeof(lvs->name), "%s", attr);
+
+ rc = add_lvs_to_list(lvs);
+ if (rc) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "lvolstore with name %s already exists\n", lvs->name);
+ req->lvserrno = -EEXIST;
+ spdk_blob_close(blob, close_super_blob_with_error_cb, req);
+ return;
+ }
+
+ lvs->super_blob_id = spdk_blob_get_id(blob);
+
+ spdk_blob_close(blob, close_super_cb, req);
+}
+
+static void
+lvs_open_super(void *cb_arg, spdk_blob_id blobid, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+ struct spdk_blob_store *bs = lvs->blobstore;
+
+ if (lvolerrno != 0) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Super blob not found\n");
+ lvs_free(lvs);
+ req->lvserrno = -ENODEV;
+ spdk_bs_unload(bs, bs_unload_with_error_cb, req);
+ return;
+ }
+
+ spdk_bs_open_blob(bs, blobid, lvs_read_uuid, req);
+}
+
+static void
+lvs_load_cb(void *cb_arg, struct spdk_blob_store *bs, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg;
+ struct spdk_lvol_store *lvs;
+
+ if (lvolerrno != 0) {
+ req->cb_fn(req->cb_arg, NULL, lvolerrno);
+ free(req);
+ return;
+ }
+
+ lvs = calloc(1, sizeof(*lvs));
+ if (lvs == NULL) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol store\n");
+ spdk_bs_unload(bs, bs_unload_with_error_cb, req);
+ return;
+ }
+
+ lvs->blobstore = bs;
+ lvs->bs_dev = req->bs_dev;
+ TAILQ_INIT(&lvs->lvols);
+ TAILQ_INIT(&lvs->pending_lvols);
+
+ req->lvol_store = lvs;
+
+ spdk_bs_get_super(bs, lvs_open_super, req);
+}
+
+static void
+lvs_bs_opts_init(struct spdk_bs_opts *opts)
+{
+ spdk_bs_opts_init(opts);
+ opts->max_channel_ops = SPDK_LVOL_BLOB_OPTS_CHANNEL_OPS;
+}
+
+void
+spdk_lvs_load(struct spdk_bs_dev *bs_dev, spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvs_with_handle_req *req;
+ struct spdk_bs_opts opts = {};
+
+ assert(cb_fn != NULL);
+
+ if (bs_dev == NULL) {
+ SPDK_ERRLOG("Blobstore device does not exist\n");
+ cb_fn(cb_arg, NULL, -ENODEV);
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ SPDK_ERRLOG("Cannot alloc memory for request structure\n");
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->bs_dev = bs_dev;
+
+ lvs_bs_opts_init(&opts);
+ snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "LVOLSTORE");
+
+ spdk_bs_load(bs_dev, &opts, lvs_load_cb, req);
+}
+
+static void
+remove_bs_on_error_cb(void *cb_arg, int bserrno)
+{
+}
+
+static void
+super_create_close_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+
+ if (lvolerrno < 0) {
+ SPDK_ERRLOG("Lvol store init failed: could not close super blob\n");
+ req->cb_fn(req->cb_arg, NULL, lvolerrno);
+ spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL);
+ lvs_free(lvs);
+ free(req);
+ return;
+ }
+
+ req->cb_fn(req->cb_arg, lvs, lvolerrno);
+ free(req);
+}
+
+static void
+super_blob_set_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+ struct spdk_blob *blob = lvs->super_blob;
+
+ if (lvolerrno < 0) {
+ req->cb_fn(req->cb_arg, NULL, lvolerrno);
+ SPDK_ERRLOG("Lvol store init failed: could not set uuid for super blob\n");
+ spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL);
+ lvs_free(lvs);
+ free(req);
+ return;
+ }
+
+ spdk_blob_close(blob, super_create_close_cb, req);
+}
+
+static void
+super_blob_init_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+ struct spdk_blob *blob = lvs->super_blob;
+ char uuid[SPDK_UUID_STRING_LEN];
+
+ if (lvolerrno < 0) {
+ req->cb_fn(req->cb_arg, NULL, lvolerrno);
+ SPDK_ERRLOG("Lvol store init failed: could not set super blob\n");
+ spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL);
+ lvs_free(lvs);
+ free(req);
+ return;
+ }
+
+ spdk_uuid_fmt_lower(uuid, sizeof(uuid), &lvs->uuid);
+
+ spdk_blob_set_xattr(blob, "uuid", uuid, sizeof(uuid));
+ spdk_blob_set_xattr(blob, "name", lvs->name, strnlen(lvs->name, SPDK_LVS_NAME_MAX) + 1);
+ spdk_blob_sync_md(blob, super_blob_set_cb, req);
+}
+
+static void
+super_blob_create_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+
+ if (lvolerrno < 0) {
+ req->cb_fn(req->cb_arg, NULL, lvolerrno);
+ SPDK_ERRLOG("Lvol store init failed: could not open super blob\n");
+ spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL);
+ lvs_free(lvs);
+ free(req);
+ return;
+ }
+
+ lvs->super_blob = blob;
+ lvs->super_blob_id = spdk_blob_get_id(blob);
+
+ spdk_bs_set_super(lvs->blobstore, lvs->super_blob_id, super_blob_init_cb, req);
+}
+
+static void
+super_blob_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno)
+{
+ struct spdk_lvs_with_handle_req *req = cb_arg;
+ struct spdk_lvol_store *lvs = req->lvol_store;
+ struct spdk_blob_store *bs;
+
+ if (lvolerrno < 0) {
+ req->cb_fn(req->cb_arg, NULL, lvolerrno);
+ SPDK_ERRLOG("Lvol store init failed: could not create super blob\n");
+ spdk_bs_destroy(lvs->blobstore, remove_bs_on_error_cb, NULL);
+ lvs_free(lvs);
+ free(req);
+ return;
+ }
+
+ bs = req->lvol_store->blobstore;
+
+ spdk_bs_open_blob(bs, blobid, super_blob_create_open_cb, req);
+}
+
+static void
+lvs_init_cb(void *cb_arg, struct spdk_blob_store *bs, int lvserrno)
+{
+ struct spdk_lvs_with_handle_req *lvs_req = cb_arg;
+ struct spdk_lvol_store *lvs = lvs_req->lvol_store;
+
+ if (lvserrno != 0) {
+ assert(bs == NULL);
+ lvs_req->cb_fn(lvs_req->cb_arg, NULL, lvserrno);
+ SPDK_ERRLOG("Lvol store init failed: could not initialize blobstore\n");
+ lvs_free(lvs);
+ free(lvs_req);
+ return;
+ }
+
+ assert(bs != NULL);
+ lvs->blobstore = bs;
+ TAILQ_INIT(&lvs->lvols);
+ TAILQ_INIT(&lvs->pending_lvols);
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store initialized\n");
+
+ /* create super blob */
+ spdk_bs_create_blob(lvs->blobstore, super_blob_create_cb, lvs_req);
+}
+
+void
+spdk_lvs_opts_init(struct spdk_lvs_opts *o)
+{
+ o->cluster_sz = SPDK_LVS_OPTS_CLUSTER_SZ;
+ o->clear_method = LVS_CLEAR_WITH_UNMAP;
+ memset(o->name, 0, sizeof(o->name));
+}
+
+static void
+setup_lvs_opts(struct spdk_bs_opts *bs_opts, struct spdk_lvs_opts *o)
+{
+ assert(o != NULL);
+ lvs_bs_opts_init(bs_opts);
+ bs_opts->cluster_sz = o->cluster_sz;
+ bs_opts->clear_method = (enum bs_clear_method)o->clear_method;
+}
+
+int
+spdk_lvs_init(struct spdk_bs_dev *bs_dev, struct spdk_lvs_opts *o,
+ spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_store *lvs;
+ struct spdk_lvs_with_handle_req *lvs_req;
+ struct spdk_bs_opts opts = {};
+ int rc;
+
+ if (bs_dev == NULL) {
+ SPDK_ERRLOG("Blobstore device does not exist\n");
+ return -ENODEV;
+ }
+
+ if (o == NULL) {
+ SPDK_ERRLOG("spdk_lvs_opts not specified\n");
+ return -EINVAL;
+ }
+
+ setup_lvs_opts(&opts, o);
+
+ if (strnlen(o->name, SPDK_LVS_NAME_MAX) == SPDK_LVS_NAME_MAX) {
+ SPDK_ERRLOG("Name has no null terminator.\n");
+ return -EINVAL;
+ }
+
+ if (strnlen(o->name, SPDK_LVS_NAME_MAX) == 0) {
+ SPDK_ERRLOG("No name specified.\n");
+ return -EINVAL;
+ }
+
+ lvs = calloc(1, sizeof(*lvs));
+ if (!lvs) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol store base pointer\n");
+ return -ENOMEM;
+ }
+
+ spdk_uuid_generate(&lvs->uuid);
+ snprintf(lvs->name, sizeof(lvs->name), "%s", o->name);
+
+ rc = add_lvs_to_list(lvs);
+ if (rc) {
+ SPDK_ERRLOG("lvolstore with name %s already exists\n", lvs->name);
+ lvs_free(lvs);
+ return -EEXIST;
+ }
+
+ lvs_req = calloc(1, sizeof(*lvs_req));
+ if (!lvs_req) {
+ lvs_free(lvs);
+ SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n");
+ return -ENOMEM;
+ }
+
+ assert(cb_fn != NULL);
+ lvs_req->cb_fn = cb_fn;
+ lvs_req->cb_arg = cb_arg;
+ lvs_req->lvol_store = lvs;
+ lvs->bs_dev = bs_dev;
+ lvs->destruct = false;
+
+ snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "LVOLSTORE");
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Initializing lvol store\n");
+ spdk_bs_init(bs_dev, &opts, lvs_init_cb, lvs_req);
+
+ return 0;
+}
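+
+/*
+ * Illustrative call sequence for creating a new lvolstore on a blobstore
+ * device (names hypothetical; the bs_dev would typically come from the
+ * blob_bdev library, e.g. spdk_bdev_create_bs_dev()):
+ *
+ *   struct spdk_lvs_opts opts;
+ *
+ *   spdk_lvs_opts_init(&opts);
+ *   snprintf(opts.name, sizeof(opts.name), "lvs0");
+ *   rc = spdk_lvs_init(bs_dev, &opts, lvs_ready_cb, ctx);
+ */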
+
+static void
+lvs_rename_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvs_req *req = cb_arg;
+
+ if (lvolerrno != 0) {
+ req->lvserrno = lvolerrno;
+ }
+ if (req->lvserrno != 0) {
+ SPDK_ERRLOG("Lvol store rename operation failed\n");
+		/* Lvs renaming failed, so we should 'clear' new_name.
+		 * Otherwise it could cause a failure on the next attempt to change the name to 'new_name'. */
+ snprintf(req->lvol_store->new_name,
+ sizeof(req->lvol_store->new_name),
+ "%s", req->lvol_store->name);
+ } else {
+ /* Update lvs name with new_name */
+ snprintf(req->lvol_store->name,
+ sizeof(req->lvol_store->name),
+ "%s", req->lvol_store->new_name);
+ }
+
+ req->cb_fn(req->cb_arg, req->lvserrno);
+ free(req);
+}
+
+static void
+lvs_rename_sync_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvs_req *req = cb_arg;
+ struct spdk_blob *blob = req->lvol_store->super_blob;
+
+ if (lvolerrno < 0) {
+ req->lvserrno = lvolerrno;
+ }
+
+ spdk_blob_close(blob, lvs_rename_cb, req);
+}
+
+static void
+lvs_rename_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno)
+{
+ struct spdk_lvs_req *req = cb_arg;
+ int rc;
+
+ if (lvolerrno < 0) {
+ lvs_rename_cb(cb_arg, lvolerrno);
+ return;
+ }
+
+ rc = spdk_blob_set_xattr(blob, "name", req->lvol_store->new_name,
+ strlen(req->lvol_store->new_name) + 1);
+ if (rc < 0) {
+ req->lvserrno = rc;
+ lvs_rename_sync_cb(req, rc);
+ return;
+ }
+
+ req->lvol_store->super_blob = blob;
+
+ spdk_blob_sync_md(blob, lvs_rename_sync_cb, req);
+}
+
+void
+spdk_lvs_rename(struct spdk_lvol_store *lvs, const char *new_name,
+ spdk_lvs_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvs_req *req;
+ struct spdk_lvol_store *tmp;
+
+ /* Check if new name is current lvs name.
+ * If so, return success immediately */
+ if (strncmp(lvs->name, new_name, SPDK_LVS_NAME_MAX) == 0) {
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+	/* Check if new_name is already used by another lvs */
+ pthread_mutex_lock(&g_lvol_stores_mutex);
+ TAILQ_FOREACH(tmp, &g_lvol_stores, link) {
+ if (!strncmp(new_name, tmp->name, SPDK_LVS_NAME_MAX) ||
+ !strncmp(new_name, tmp->new_name, SPDK_LVS_NAME_MAX)) {
+ pthread_mutex_unlock(&g_lvol_stores_mutex);
+ cb_fn(cb_arg, -EEXIST);
+ return;
+ }
+ }
+ pthread_mutex_unlock(&g_lvol_stores_mutex);
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ snprintf(lvs->new_name, sizeof(lvs->new_name), "%s", new_name);
+ req->lvol_store = lvs;
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ spdk_bs_open_blob(lvs->blobstore, lvs->super_blob_id, lvs_rename_open_cb, req);
+}
+
+static void
+_lvs_unload_cb(void *cb_arg, int lvserrno)
+{
+ struct spdk_lvs_req *lvs_req = cb_arg;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store unloaded\n");
+ assert(lvs_req->cb_fn != NULL);
+ lvs_req->cb_fn(lvs_req->cb_arg, lvserrno);
+ free(lvs_req);
+}
+
+int
+spdk_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn,
+ void *cb_arg)
+{
+ struct spdk_lvs_req *lvs_req;
+ struct spdk_lvol *lvol, *tmp;
+
+ if (lvs == NULL) {
+ SPDK_ERRLOG("Lvol store is NULL\n");
+ return -ENODEV;
+ }
+
+ TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) {
+ if (lvol->action_in_progress == true) {
+ SPDK_ERRLOG("Cannot unload lvol store - operations on lvols pending\n");
+ cb_fn(cb_arg, -EBUSY);
+ return -EBUSY;
+ } else if (lvol->ref_count != 0) {
+ SPDK_ERRLOG("Lvols still open on lvol store\n");
+ cb_fn(cb_arg, -EBUSY);
+ return -EBUSY;
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) {
+ TAILQ_REMOVE(&lvs->lvols, lvol, link);
+ lvol_free(lvol);
+ }
+
+ lvs_req = calloc(1, sizeof(*lvs_req));
+ if (!lvs_req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n");
+ return -ENOMEM;
+ }
+
+ lvs_req->cb_fn = cb_fn;
+ lvs_req->cb_arg = cb_arg;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Unloading lvol store\n");
+ spdk_bs_unload(lvs->blobstore, _lvs_unload_cb, lvs_req);
+ lvs_free(lvs);
+
+ return 0;
+}
+
+static void
+_lvs_destroy_cb(void *cb_arg, int lvserrno)
+{
+ struct spdk_lvs_destroy_req *lvs_req = cb_arg;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store destroyed\n");
+ assert(lvs_req->cb_fn != NULL);
+ lvs_req->cb_fn(lvs_req->cb_arg, lvserrno);
+ free(lvs_req);
+}
+
+static void
+_lvs_destroy_super_cb(void *cb_arg, int bserrno)
+{
+ struct spdk_lvs_destroy_req *lvs_req = cb_arg;
+ struct spdk_lvol_store *lvs = lvs_req->lvs;
+
+ assert(lvs != NULL);
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Destroying lvol store\n");
+ spdk_bs_destroy(lvs->blobstore, _lvs_destroy_cb, lvs_req);
+ lvs_free(lvs);
+}
+
+int
+spdk_lvs_destroy(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn,
+ void *cb_arg)
+{
+ struct spdk_lvs_destroy_req *lvs_req;
+ struct spdk_lvol *iter_lvol, *tmp;
+
+ if (lvs == NULL) {
+ SPDK_ERRLOG("Lvol store is NULL\n");
+ return -ENODEV;
+ }
+
+ TAILQ_FOREACH_SAFE(iter_lvol, &lvs->lvols, link, tmp) {
+ if (iter_lvol->action_in_progress == true) {
+ SPDK_ERRLOG("Cannot destroy lvol store - operations on lvols pending\n");
+ cb_fn(cb_arg, -EBUSY);
+ return -EBUSY;
+ } else if (iter_lvol->ref_count != 0) {
+ SPDK_ERRLOG("Lvols still open on lvol store\n");
+ cb_fn(cb_arg, -EBUSY);
+ return -EBUSY;
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(iter_lvol, &lvs->lvols, link, tmp) {
+ free(iter_lvol);
+ }
+
+ lvs_req = calloc(1, sizeof(*lvs_req));
+ if (!lvs_req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n");
+ return -ENOMEM;
+ }
+
+ lvs_req->cb_fn = cb_fn;
+ lvs_req->cb_arg = cb_arg;
+ lvs_req->lvs = lvs;
+
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Deleting super blob\n");
+ spdk_bs_delete_blob(lvs->blobstore, lvs->super_blob_id, _lvs_destroy_super_cb, lvs_req);
+
+ return 0;
+}
+
+static void
+lvol_close_blob_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+ struct spdk_lvol *lvol = req->lvol;
+
+ if (lvolerrno < 0) {
+ SPDK_ERRLOG("Could not close blob on lvol\n");
+ lvol_free(lvol);
+ goto end;
+ }
+
+ lvol->ref_count--;
+ lvol->action_in_progress = false;
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol %s closed\n", lvol->unique_id);
+
+end:
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+bool
+spdk_lvol_deletable(struct spdk_lvol *lvol)
+{
+ size_t count = 0;
+
+ spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count);
+ return (count == 0);
+}
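+
+/*
+ * Deletability is decided purely by clone count: spdk_blob_get_clones() is
+ * called here only to fill `count`, so an lvol whose blob still has clones
+ * (for example a snapshot with dependent clones) reports false.
+ */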
+
+static void
+lvol_delete_blob_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+ struct spdk_lvol *lvol = req->lvol;
+
+ if (lvolerrno < 0) {
+ SPDK_ERRLOG("Could not remove blob on lvol gracefully - forced removal\n");
+ } else {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol %s deleted\n", lvol->unique_id);
+ }
+
+ TAILQ_REMOVE(&lvol->lvol_store->lvols, lvol, link);
+ lvol_free(lvol);
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+static void
+lvol_create_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno)
+{
+ struct spdk_lvol_with_handle_req *req = cb_arg;
+ struct spdk_lvol *lvol = req->lvol;
+
+ TAILQ_REMOVE(&req->lvol->lvol_store->pending_lvols, req->lvol, link);
+
+ if (lvolerrno < 0) {
+ free(lvol);
+ req->cb_fn(req->cb_arg, NULL, lvolerrno);
+ free(req);
+ return;
+ }
+
+ lvol->blob = blob;
+ lvol->blob_id = spdk_blob_get_id(blob);
+
+ TAILQ_INSERT_TAIL(&lvol->lvol_store->lvols, lvol, link);
+
+ snprintf(lvol->unique_id, sizeof(lvol->unique_id), "%s", lvol->uuid_str);
+ lvol->ref_count++;
+
+ assert(req->cb_fn != NULL);
+ req->cb_fn(req->cb_arg, req->lvol, lvolerrno);
+ free(req);
+}
+
+static void
+lvol_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno)
+{
+ struct spdk_lvol_with_handle_req *req = cb_arg;
+ struct spdk_blob_store *bs;
+ struct spdk_blob_open_opts opts;
+
+ if (lvolerrno < 0) {
+ TAILQ_REMOVE(&req->lvol->lvol_store->pending_lvols, req->lvol, link);
+ free(req->lvol);
+ assert(req->cb_fn != NULL);
+ req->cb_fn(req->cb_arg, NULL, lvolerrno);
+ free(req);
+ return;
+ }
+
+ spdk_blob_open_opts_init(&opts);
+ opts.clear_method = req->lvol->clear_method;
+ bs = req->lvol->lvol_store->blobstore;
+
+ spdk_bs_open_blob_ext(bs, blobid, &opts, lvol_create_open_cb, req);
+}
+
+static void
+lvol_get_xattr_value(void *xattr_ctx, const char *name,
+ const void **value, size_t *value_len)
+{
+ struct spdk_lvol *lvol = xattr_ctx;
+
+ if (!strcmp(LVOL_NAME, name)) {
+ *value = lvol->name;
+ *value_len = SPDK_LVOL_NAME_MAX;
+ } else if (!strcmp("uuid", name)) {
+ *value = lvol->uuid_str;
+ *value_len = sizeof(lvol->uuid_str);
+ }
+}
+
+static int
+lvs_verify_lvol_name(struct spdk_lvol_store *lvs, const char *name)
+{
+ struct spdk_lvol *tmp;
+
+ if (name == NULL || strnlen(name, SPDK_LVOL_NAME_MAX) == 0) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "lvol name not provided.\n");
+ return -EINVAL;
+ }
+
+ if (strnlen(name, SPDK_LVOL_NAME_MAX) == SPDK_LVOL_NAME_MAX) {
+ SPDK_ERRLOG("Name has no null terminator.\n");
+ return -EINVAL;
+ }
+
+ TAILQ_FOREACH(tmp, &lvs->lvols, link) {
+ if (!strncmp(name, tmp->name, SPDK_LVOL_NAME_MAX)) {
+ SPDK_ERRLOG("lvol with name %s already exists\n", name);
+ return -EEXIST;
+ }
+ }
+
+ TAILQ_FOREACH(tmp, &lvs->pending_lvols, link) {
+ if (!strncmp(name, tmp->name, SPDK_LVOL_NAME_MAX)) {
+ SPDK_ERRLOG("lvol with name %s is being already created\n", name);
+ return -EEXIST;
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz,
+ bool thin_provision, enum lvol_clear_method clear_method, spdk_lvol_op_with_handle_complete cb_fn,
+ void *cb_arg)
+{
+ struct spdk_lvol_with_handle_req *req;
+ struct spdk_blob_store *bs;
+ struct spdk_lvol *lvol;
+ struct spdk_blob_opts opts;
+ uint64_t num_clusters;
+ char *xattr_names[] = {LVOL_NAME, "uuid"};
+ int rc;
+
+ if (lvs == NULL) {
+ SPDK_ERRLOG("lvol store does not exist\n");
+ return -EINVAL;
+ }
+
+ rc = lvs_verify_lvol_name(lvs, name);
+ if (rc < 0) {
+ return rc;
+ }
+
+ bs = lvs->blobstore;
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ return -ENOMEM;
+ }
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ lvol = calloc(1, sizeof(*lvol));
+ if (!lvol) {
+ free(req);
+ SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n");
+ return -ENOMEM;
+ }
+ lvol->lvol_store = lvs;
+ num_clusters = spdk_divide_round_up(sz, spdk_bs_get_cluster_size(bs));
+ lvol->thin_provision = thin_provision;
+ lvol->clear_method = (enum blob_clear_method)clear_method;
+ snprintf(lvol->name, sizeof(lvol->name), "%s", name);
+ TAILQ_INSERT_TAIL(&lvol->lvol_store->pending_lvols, lvol, link);
+ spdk_uuid_generate(&lvol->uuid);
+ spdk_uuid_fmt_lower(lvol->uuid_str, sizeof(lvol->uuid_str), &lvol->uuid);
+ req->lvol = lvol;
+
+ spdk_blob_opts_init(&opts);
+ opts.thin_provision = thin_provision;
+ opts.num_clusters = num_clusters;
+ opts.clear_method = lvol->clear_method;
+ opts.xattrs.count = SPDK_COUNTOF(xattr_names);
+ opts.xattrs.names = xattr_names;
+ opts.xattrs.ctx = lvol;
+ opts.xattrs.get_value = lvol_get_xattr_value;
+
+ spdk_bs_create_blob_ext(lvs->blobstore, &opts, lvol_create_cb, req);
+
+ return 0;
+}
+
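+/*
+ * Minimal usage sketch (not part of the original source; the callback,
+ * context and lvol name are hypothetical). Creation is asynchronous and the
+ * new lvol handle is only valid once the completion callback runs:
+ *
+ *    static void
+ *    app_lvol_created_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno)
+ *    {
+ *        if (lvolerrno != 0) {
+ *            SPDK_ERRLOG("lvol creation failed: %d\n", lvolerrno);
+ *            return;
+ *        }
+ *        // lvol is created and open (ref_count == 1) at this point
+ *    }
+ *
+ *    rc = spdk_lvol_create(lvs, "lvol0", 10 * 1024 * 1024, true,
+ *                          LVOL_CLEAR_WITH_DEFAULT, app_lvol_created_cb, app_ctx);
+ */
+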
+void
+spdk_lvol_create_snapshot(struct spdk_lvol *origlvol, const char *snapshot_name,
+ spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_store *lvs;
+ struct spdk_lvol *newlvol;
+ struct spdk_blob *origblob;
+ struct spdk_lvol_with_handle_req *req;
+ struct spdk_blob_xattr_opts snapshot_xattrs;
+ char *xattr_names[] = {LVOL_NAME, "uuid"};
+ int rc;
+
+ if (origlvol == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol not provided.\n");
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ origblob = origlvol->blob;
+ lvs = origlvol->lvol_store;
+ if (lvs == NULL) {
+ SPDK_ERRLOG("lvol store does not exist\n");
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ rc = lvs_verify_lvol_name(lvs, snapshot_name);
+ if (rc < 0) {
+ cb_fn(cb_arg, NULL, rc);
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ newlvol = calloc(1, sizeof(*newlvol));
+ if (!newlvol) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n");
+ free(req);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ newlvol->lvol_store = origlvol->lvol_store;
+ snprintf(newlvol->name, sizeof(newlvol->name), "%s", snapshot_name);
+ TAILQ_INSERT_TAIL(&newlvol->lvol_store->pending_lvols, newlvol, link);
+ spdk_uuid_generate(&newlvol->uuid);
+ spdk_uuid_fmt_lower(newlvol->uuid_str, sizeof(newlvol->uuid_str), &newlvol->uuid);
+ snapshot_xattrs.count = SPDK_COUNTOF(xattr_names);
+ snapshot_xattrs.ctx = newlvol;
+ snapshot_xattrs.names = xattr_names;
+ snapshot_xattrs.get_value = lvol_get_xattr_value;
+ req->lvol = newlvol;
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ spdk_bs_create_snapshot(lvs->blobstore, spdk_blob_get_id(origblob), &snapshot_xattrs,
+ lvol_create_cb, req);
+}
+
+void
+spdk_lvol_create_clone(struct spdk_lvol *origlvol, const char *clone_name,
+ spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol *newlvol;
+ struct spdk_lvol_with_handle_req *req;
+ struct spdk_lvol_store *lvs;
+ struct spdk_blob *origblob;
+ struct spdk_blob_xattr_opts clone_xattrs;
+ char *xattr_names[] = {LVOL_NAME, "uuid"};
+ int rc;
+
+ if (origlvol == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol not provided.\n");
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ origblob = origlvol->blob;
+ lvs = origlvol->lvol_store;
+ if (lvs == NULL) {
+ SPDK_ERRLOG("lvol store does not exist\n");
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ rc = lvs_verify_lvol_name(lvs, clone_name);
+ if (rc < 0) {
+ cb_fn(cb_arg, NULL, rc);
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ newlvol = calloc(1, sizeof(*newlvol));
+ if (!newlvol) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n");
+ free(req);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ newlvol->lvol_store = lvs;
+ snprintf(newlvol->name, sizeof(newlvol->name), "%s", clone_name);
+ TAILQ_INSERT_TAIL(&newlvol->lvol_store->pending_lvols, newlvol, link);
+ spdk_uuid_generate(&newlvol->uuid);
+ spdk_uuid_fmt_lower(newlvol->uuid_str, sizeof(newlvol->uuid_str), &newlvol->uuid);
+ clone_xattrs.count = SPDK_COUNTOF(xattr_names);
+ clone_xattrs.ctx = newlvol;
+ clone_xattrs.names = xattr_names;
+ clone_xattrs.get_value = lvol_get_xattr_value;
+ req->lvol = newlvol;
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ spdk_bs_create_clone(lvs->blobstore, spdk_blob_get_id(origblob), &clone_xattrs,
+ lvol_create_cb,
+ req);
+}
+
+static void
+lvol_resize_done(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+static void
+lvol_blob_resize_cb(void *cb_arg, int bserrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+ struct spdk_lvol *lvol = req->lvol;
+
+ if (bserrno != 0) {
+ req->cb_fn(req->cb_arg, bserrno);
+ free(req);
+ return;
+ }
+
+ spdk_blob_sync_md(lvol->blob, lvol_resize_done, req);
+}
+
+void
+spdk_lvol_resize(struct spdk_lvol *lvol, uint64_t sz,
+ spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_blob *blob = lvol->blob;
+ struct spdk_lvol_store *lvs = lvol->lvol_store;
+ struct spdk_lvol_req *req;
+ uint64_t new_clusters = spdk_divide_round_up(sz, spdk_bs_get_cluster_size(lvs->blobstore));
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->lvol = lvol;
+
+ spdk_blob_resize(blob, new_clusters, lvol_blob_resize_cb, req);
+}
+
+static void
+lvol_set_read_only_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+void
+spdk_lvol_set_read_only(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_req *req;
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ spdk_blob_set_read_only(lvol->blob);
+ spdk_blob_sync_md(lvol->blob, lvol_set_read_only_cb, req);
+}
+
+static void
+lvol_rename_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+
+ if (lvolerrno != 0) {
+ SPDK_ERRLOG("Lvol rename operation failed\n");
+ } else {
+ snprintf(req->lvol->name, sizeof(req->lvol->name), "%s", req->name);
+ }
+
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+void
+spdk_lvol_rename(struct spdk_lvol *lvol, const char *new_name,
+ spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol *tmp;
+ struct spdk_blob *blob = lvol->blob;
+ struct spdk_lvol_req *req;
+ int rc;
+
+ /* Check if the new name matches the current lvol name.
+ * If so, return success immediately */
+ if (strncmp(lvol->name, new_name, SPDK_LVOL_NAME_MAX) == 0) {
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ /* Check if lvol with 'new_name' already exists in lvolstore */
+ TAILQ_FOREACH(tmp, &lvol->lvol_store->lvols, link) {
+ if (strncmp(tmp->name, new_name, SPDK_LVOL_NAME_MAX) == 0) {
+ SPDK_ERRLOG("Lvol %s already exists in lvol store %s\n", new_name, lvol->lvol_store->name);
+ cb_fn(cb_arg, -EEXIST);
+ return;
+ }
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->lvol = lvol;
+ snprintf(req->name, sizeof(req->name), "%s", new_name);
+
+ rc = spdk_blob_set_xattr(blob, "name", new_name, strlen(new_name) + 1);
+ if (rc < 0) {
+ free(req);
+ cb_fn(cb_arg, rc);
+ return;
+ }
+
+ spdk_blob_sync_md(blob, lvol_rename_cb, req);
+}
+
+void
+spdk_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_req *req;
+ struct spdk_blob_store *bs;
+
+ assert(cb_fn != NULL);
+
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ if (lvol->ref_count != 0) {
+ SPDK_ERRLOG("Cannot destroy lvol %s because it is still open\n", lvol->unique_id);
+ cb_fn(cb_arg, -EBUSY);
+ return;
+ }
+
+ lvol->action_in_progress = true;
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->lvol = lvol;
+ bs = lvol->lvol_store->blobstore;
+
+ spdk_bs_delete_blob(bs, lvol->blob_id, lvol_delete_blob_cb, req);
+}
+
+void
+spdk_lvol_close(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_req *req;
+
+ assert(cb_fn != NULL);
+
+ if (lvol == NULL) {
+ SPDK_ERRLOG("lvol does not exist\n");
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ if (lvol->ref_count > 1) {
+ lvol->ref_count--;
+ cb_fn(cb_arg, 0);
+ return;
+ } else if (lvol->ref_count == 0) {
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ lvol->action_in_progress = true;
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->lvol = lvol;
+
+ spdk_blob_close(lvol->blob, lvol_close_blob_cb, req);
+}
+
+struct spdk_io_channel *
+spdk_lvol_get_io_channel(struct spdk_lvol *lvol)
+{
+ return spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore);
+}
+
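+/*
+ * Sketch (not part of the original source): the channel returned here comes
+ * from the underlying blobstore, so it is released with
+ * spdk_bs_free_io_channel() once the caller is done issuing I/O:
+ *
+ *    struct spdk_io_channel *ch = spdk_lvol_get_io_channel(lvol);
+ *    // ... submit blob I/O on lvol->blob using ch ...
+ *    spdk_bs_free_io_channel(ch);
+ */
+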
+static void
+lvol_inflate_cb(void *cb_arg, int lvolerrno)
+{
+ struct spdk_lvol_req *req = cb_arg;
+
+ spdk_bs_free_io_channel(req->channel);
+
+ if (lvolerrno < 0) {
+ SPDK_ERRLOG("Could not inflate lvol\n");
+ }
+
+ req->cb_fn(req->cb_arg, lvolerrno);
+ free(req);
+}
+
+void
+spdk_lvol_inflate(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_req *req;
+ spdk_blob_id blob_id;
+
+ assert(cb_fn != NULL);
+
+ if (lvol == NULL) {
+ SPDK_ERRLOG("Lvol does not exist\n");
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->channel = spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore);
+ if (req->channel == NULL) {
+ SPDK_ERRLOG("Cannot alloc io channel for lvol inflate request\n");
+ free(req);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ blob_id = spdk_blob_get_id(lvol->blob);
+ spdk_bs_inflate_blob(lvol->lvol_store->blobstore, req->channel, blob_id, lvol_inflate_cb,
+ req);
+}
+
+void
+spdk_lvol_decouple_parent(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_lvol_req *req;
+ spdk_blob_id blob_id;
+
+ assert(cb_fn != NULL);
+
+ if (lvol == NULL) {
+ SPDK_ERRLOG("Lvol does not exist\n");
+ cb_fn(cb_arg, -ENODEV);
+ return;
+ }
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n");
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->channel = spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore);
+ if (req->channel == NULL) {
+ SPDK_ERRLOG("Cannot alloc io channel for lvol inflate request\n");
+ free(req);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ blob_id = spdk_blob_get_id(lvol->blob);
+ spdk_bs_blob_decouple_parent(lvol->lvol_store->blobstore, req->channel, blob_id,
+ lvol_inflate_cb, req);
+}
diff --git a/src/spdk/lib/lvol/spdk_lvol.map b/src/spdk/lib/lvol/spdk_lvol.map
new file mode 100644
index 000000000..6ddeb3be6
--- /dev/null
+++ b/src/spdk/lib/lvol/spdk_lvol.map
@@ -0,0 +1,28 @@
+{
+ global:
+
+ # public functions
+ spdk_lvs_opts_init;
+ spdk_lvs_init;
+ spdk_lvs_rename;
+ spdk_lvs_unload;
+ spdk_lvs_destroy;
+ spdk_lvol_create;
+ spdk_lvol_create_snapshot;
+ spdk_lvol_create_clone;
+ spdk_lvol_rename;
+ spdk_lvol_deletable;
+ spdk_lvol_destroy;
+ spdk_lvol_close;
+ spdk_lvol_get_io_channel;
+ spdk_lvs_load;
+ spdk_lvol_open;
+ spdk_lvol_inflate;
+ spdk_lvol_decouple_parent;
+
+ # internal functions
+ spdk_lvol_resize;
+ spdk_lvol_set_read_only;
+
+ local: *;
+};
diff --git a/src/spdk/lib/nbd/Makefile b/src/spdk/lib/nbd/Makefile
new file mode 100644
index 000000000..69b13d133
--- /dev/null
+++ b/src/spdk/lib/nbd/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+LIBNAME = nbd
+C_SRCS = nbd.c nbd_rpc.c
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nbd.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/nbd/nbd.c b/src/spdk/lib/nbd/nbd.c
new file mode 100644
index 000000000..7d96b9315
--- /dev/null
+++ b/src/spdk/lib/nbd/nbd.c
@@ -0,0 +1,1093 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+
+#include <linux/nbd.h>
+
+#include "spdk/nbd.h"
+#include "nbd_internal.h"
+#include "spdk/bdev.h"
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/log.h"
+#include "spdk/util.h"
+#include "spdk/thread.h"
+
+#include "spdk_internal/log.h"
+#include "spdk/queue.h"
+
+#define GET_IO_LOOP_COUNT 16
+#define NBD_BUSY_WAITING_MS 1000
+#define NBD_BUSY_POLLING_INTERVAL_US 20000
+
+enum nbd_io_state_t {
+ /* Receiving or ready to receive nbd request header */
+ NBD_IO_RECV_REQ = 0,
+ /* Receiving write payload */
+ NBD_IO_RECV_PAYLOAD,
+ /* Transmitting or ready to transmit nbd response header */
+ NBD_IO_XMIT_RESP,
+ /* Transmitting read payload */
+ NBD_IO_XMIT_PAYLOAD,
+};
+
+struct nbd_io {
+ struct spdk_nbd_disk *nbd;
+ enum nbd_io_state_t state;
+
+ void *payload;
+ uint32_t payload_size;
+
+ struct nbd_request req;
+ struct nbd_reply resp;
+
+ /*
+ * Tracks current progress on reading/writing a request,
+ * response, or payload from the nbd socket.
+ */
+ uint32_t offset;
+
+ /* for bdev io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+
+ TAILQ_ENTRY(nbd_io) tailq;
+};
+
+enum nbd_disk_state_t {
+ NBD_DISK_STATE_RUNNING = 0,
+ /* soft disconnection caused by receiving nbd_cmd_disc */
+ NBD_DISK_STATE_SOFTDISC,
+ /* hard disconnection caused by mandatory conditions */
+ NBD_DISK_STATE_HARDDISC,
+};
+
+struct spdk_nbd_disk {
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *bdev_desc;
+ struct spdk_io_channel *ch;
+ int dev_fd;
+ char *nbd_path;
+ int kernel_sp_fd;
+ int spdk_sp_fd;
+ struct spdk_poller *nbd_poller;
+ uint32_t buf_align;
+
+ struct nbd_io *io_in_recv;
+ TAILQ_HEAD(, nbd_io) received_io_list;
+ TAILQ_HEAD(, nbd_io) executed_io_list;
+
+ enum nbd_disk_state_t state;
+ /* count of nbd_io in spdk_nbd_disk */
+ int io_count;
+
+ TAILQ_ENTRY(spdk_nbd_disk) tailq;
+};
+
+struct spdk_nbd_disk_globals {
+ TAILQ_HEAD(, spdk_nbd_disk) disk_head;
+};
+
+static struct spdk_nbd_disk_globals g_spdk_nbd;
+
+static int
+nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io);
+
+int
+spdk_nbd_init(void)
+{
+ TAILQ_INIT(&g_spdk_nbd.disk_head);
+
+ return 0;
+}
+
+void
+spdk_nbd_fini(void)
+{
+ struct spdk_nbd_disk *nbd_idx, *nbd_tmp;
+
+ /*
+ * Stop all running spdk_nbd_disk instances.
+ * Removing entries here is unnecessary, but the _SAFE variant
+ * is needed because nbd_disk_unregister(), called internally,
+ * removes the nbd from the TAILQ.
+ */
+ TAILQ_FOREACH_SAFE(nbd_idx, &g_spdk_nbd.disk_head, tailq, nbd_tmp) {
+ spdk_nbd_stop(nbd_idx);
+ }
+}
+
+static int
+nbd_disk_register(struct spdk_nbd_disk *nbd)
+{
+ if (nbd_disk_find_by_nbd_path(nbd->nbd_path)) {
+ SPDK_NOTICELOG("%s is already exported\n", nbd->nbd_path);
+ return -EBUSY;
+ }
+
+ TAILQ_INSERT_TAIL(&g_spdk_nbd.disk_head, nbd, tailq);
+
+ return 0;
+}
+
+static void
+nbd_disk_unregister(struct spdk_nbd_disk *nbd)
+{
+ struct spdk_nbd_disk *nbd_idx, *nbd_tmp;
+
+ /*
+ * The nbd disk may be stopped before it was registered,
+ * so check whether it was actually registered.
+ */
+ TAILQ_FOREACH_SAFE(nbd_idx, &g_spdk_nbd.disk_head, tailq, nbd_tmp) {
+ if (nbd == nbd_idx) {
+ TAILQ_REMOVE(&g_spdk_nbd.disk_head, nbd_idx, tailq);
+ break;
+ }
+ }
+}
+
+struct spdk_nbd_disk *
+nbd_disk_find_by_nbd_path(const char *nbd_path)
+{
+ struct spdk_nbd_disk *nbd;
+
+ /*
+ * Check whether an nbd has already been registered with this nbd path.
+ */
+ TAILQ_FOREACH(nbd, &g_spdk_nbd.disk_head, tailq) {
+ if (!strcmp(nbd->nbd_path, nbd_path)) {
+ return nbd;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_nbd_disk *
+nbd_disk_first(void)
+{
+ return TAILQ_FIRST(&g_spdk_nbd.disk_head);
+}
+
+struct spdk_nbd_disk *
+nbd_disk_next(struct spdk_nbd_disk *prev)
+{
+ return TAILQ_NEXT(prev, tailq);
+}
+
+const char *
+nbd_disk_get_nbd_path(struct spdk_nbd_disk *nbd)
+{
+ return nbd->nbd_path;
+}
+
+const char *
+nbd_disk_get_bdev_name(struct spdk_nbd_disk *nbd)
+{
+ return spdk_bdev_get_name(nbd->bdev);
+}
+
+void
+spdk_nbd_write_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_nbd_disk *nbd;
+
+ spdk_json_write_array_begin(w);
+
+ TAILQ_FOREACH(nbd, &g_spdk_nbd.disk_head, tailq) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "method", "nbd_start_disk");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "nbd_device", nbd_disk_get_nbd_path(nbd));
+ spdk_json_write_named_string(w, "bdev_name", nbd_disk_get_bdev_name(nbd));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_array_end(w);
+}
+
+void
+nbd_disconnect(struct spdk_nbd_disk *nbd)
+{
+ /*
+ * Issue an nbd soft disconnect to terminate the transmission phase.
+ * After receiving this ioctl command, the nbd kernel module sends
+ * an NBD_CMD_DISC request to the nbd server to inform it of the disconnect.
+ */
+ ioctl(nbd->dev_fd, NBD_DISCONNECT);
+}
+
+static struct nbd_io *
+nbd_get_io(struct spdk_nbd_disk *nbd)
+{
+ struct nbd_io *io;
+
+ io = calloc(1, sizeof(*io));
+ if (!io) {
+ return NULL;
+ }
+
+ io->nbd = nbd;
+ to_be32(&io->resp.magic, NBD_REPLY_MAGIC);
+
+ nbd->io_count++;
+
+ return io;
+}
+
+static void
+nbd_put_io(struct spdk_nbd_disk *nbd, struct nbd_io *io)
+{
+ if (io->payload) {
+ spdk_free(io->payload);
+ }
+ free(io);
+
+ nbd->io_count--;
+}
+
+/*
+ * Check whether all received nbd_io have been transmitted.
+ *
+ * \return 1 if some received nbd_io have not yet been transmitted.
+ * 0 if all received nbd_io have been transmitted.
+ */
+static int
+nbd_io_xmit_check(struct spdk_nbd_disk *nbd)
+{
+ if (nbd->io_count == 0) {
+ return 0;
+ } else if (nbd->io_count == 1 && nbd->io_in_recv != NULL) {
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Free all queued nbd_io (received or executed) instead of transmitting
+ * them, and check whether any nbd_io are still executing in the bdev layer.
+ *
+ * \return 1 if some nbd_io are still being executed.
+ * 0 if all allocated nbd_io have been freed.
+ */
+static int
+nbd_cleanup_io(struct spdk_nbd_disk *nbd)
+{
+ struct nbd_io *io, *io_tmp;
+
+ /* free io_in_recv */
+ if (nbd->io_in_recv != NULL) {
+ nbd_put_io(nbd, nbd->io_in_recv);
+ nbd->io_in_recv = NULL;
+ }
+
+ /* free io in received_io_list */
+ if (!TAILQ_EMPTY(&nbd->received_io_list)) {
+ TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) {
+ TAILQ_REMOVE(&nbd->received_io_list, io, tailq);
+ nbd_put_io(nbd, io);
+ }
+ }
+
+ /* free io in executed_io_list */
+ if (!TAILQ_EMPTY(&nbd->executed_io_list)) {
+ TAILQ_FOREACH_SAFE(io, &nbd->executed_io_list, tailq, io_tmp) {
+ TAILQ_REMOVE(&nbd->executed_io_list, io, tailq);
+ nbd_put_io(nbd, io);
+ }
+ }
+
+ /*
+ * Some nbd_io may still be executing in the bdev layer.
+ * Wait for them to complete.
+ */
+ if (nbd->io_count != 0) {
+ return 1;
+ }
+
+ return 0;
+}
+
+static void
+_nbd_stop(struct spdk_nbd_disk *nbd)
+{
+ if (nbd->ch) {
+ spdk_put_io_channel(nbd->ch);
+ }
+
+ if (nbd->bdev_desc) {
+ spdk_bdev_close(nbd->bdev_desc);
+ }
+
+ if (nbd->spdk_sp_fd >= 0) {
+ close(nbd->spdk_sp_fd);
+ }
+
+ if (nbd->kernel_sp_fd >= 0) {
+ close(nbd->kernel_sp_fd);
+ }
+
+ if (nbd->dev_fd >= 0) {
+ /* Clear nbd device only if it is occupied by SPDK app */
+ if (nbd->nbd_path && nbd_disk_find_by_nbd_path(nbd->nbd_path)) {
+ ioctl(nbd->dev_fd, NBD_CLEAR_QUE);
+ ioctl(nbd->dev_fd, NBD_CLEAR_SOCK);
+ }
+ close(nbd->dev_fd);
+ }
+
+ if (nbd->nbd_path) {
+ free(nbd->nbd_path);
+ }
+
+ if (nbd->nbd_poller) {
+ spdk_poller_unregister(&nbd->nbd_poller);
+ }
+
+ nbd_disk_unregister(nbd);
+
+ free(nbd);
+}
+
+void
+spdk_nbd_stop(struct spdk_nbd_disk *nbd)
+{
+ if (nbd == NULL) {
+ return;
+ }
+
+ nbd->state = NBD_DISK_STATE_HARDDISC;
+
+ /*
+ * Stop action should be called only after all nbd_io are executed.
+ */
+ if (!nbd_cleanup_io(nbd)) {
+ _nbd_stop(nbd);
+ }
+}
+
+static int64_t
+read_from_socket(int fd, void *buf, size_t length)
+{
+ ssize_t bytes_read;
+
+ bytes_read = read(fd, buf, length);
+ if (bytes_read == 0) {
+ return -EIO;
+ } else if (bytes_read == -1) {
+ if (errno != EAGAIN) {
+ return -errno;
+ }
+ return 0;
+ } else {
+ return bytes_read;
+ }
+}
+
+static int64_t
+write_to_socket(int fd, void *buf, size_t length)
+{
+ ssize_t bytes_written;
+
+ bytes_written = write(fd, buf, length);
+ if (bytes_written == 0) {
+ return -EIO;
+ } else if (bytes_written == -1) {
+ if (errno != EAGAIN) {
+ return -errno;
+ }
+ return 0;
+ } else {
+ return bytes_written;
+ }
+}
+
+static void
+nbd_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct nbd_io *io = cb_arg;
+ struct spdk_nbd_disk *nbd = io->nbd;
+
+ if (success) {
+ io->resp.error = 0;
+ } else {
+ to_be32(&io->resp.error, EIO);
+ }
+
+ memcpy(&io->resp.handle, &io->req.handle, sizeof(io->resp.handle));
+ TAILQ_INSERT_TAIL(&nbd->executed_io_list, io, tailq);
+
+ if (bdev_io != NULL) {
+ spdk_bdev_free_io(bdev_io);
+ }
+
+ if (nbd->state == NBD_DISK_STATE_HARDDISC && !nbd_cleanup_io(nbd)) {
+ _nbd_stop(nbd);
+ }
+}
+
+static void
+nbd_resubmit_io(void *arg)
+{
+ struct nbd_io *io = (struct nbd_io *)arg;
+ struct spdk_nbd_disk *nbd = io->nbd;
+ int rc = 0;
+
+ rc = nbd_submit_bdev_io(nbd, io);
+ if (rc) {
+ SPDK_INFOLOG(SPDK_LOG_NBD, "nbd: io resubmit for dev %s , io_type %d, returned %d.\n",
+ nbd_disk_get_bdev_name(nbd), from_be32(&io->req.type), rc);
+ }
+}
+
+static void
+nbd_queue_io(struct nbd_io *io)
+{
+ int rc;
+ struct spdk_bdev *bdev = io->nbd->bdev;
+
+ io->bdev_io_wait.bdev = bdev;
+ io->bdev_io_wait.cb_fn = nbd_resubmit_io;
+ io->bdev_io_wait.cb_arg = io;
+
+ rc = spdk_bdev_queue_io_wait(bdev, io->nbd->ch, &io->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in nbd_queue_io, rc=%d.\n", rc);
+ nbd_io_done(NULL, false, io);
+ }
+}
+
+static int
+nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io)
+{
+ struct spdk_bdev_desc *desc = nbd->bdev_desc;
+ struct spdk_io_channel *ch = nbd->ch;
+ int rc = 0;
+
+ switch (from_be32(&io->req.type)) {
+ case NBD_CMD_READ:
+ rc = spdk_bdev_read(desc, ch, io->payload, from_be64(&io->req.from),
+ io->payload_size, nbd_io_done, io);
+ break;
+ case NBD_CMD_WRITE:
+ rc = spdk_bdev_write(desc, ch, io->payload, from_be64(&io->req.from),
+ io->payload_size, nbd_io_done, io);
+ break;
+#ifdef NBD_FLAG_SEND_FLUSH
+ case NBD_CMD_FLUSH:
+ rc = spdk_bdev_flush(desc, ch, 0,
+ spdk_bdev_get_num_blocks(nbd->bdev) * spdk_bdev_get_block_size(nbd->bdev),
+ nbd_io_done, io);
+ break;
+#endif
+#ifdef NBD_FLAG_SEND_TRIM
+ case NBD_CMD_TRIM:
+ rc = spdk_bdev_unmap(desc, ch, from_be64(&io->req.from),
+ from_be32(&io->req.len), nbd_io_done, io);
+ break;
+#endif
+ case NBD_CMD_DISC:
+ nbd_put_io(nbd, io);
+ nbd->state = NBD_DISK_STATE_SOFTDISC;
+ break;
+ default:
+ rc = -1;
+ }
+
+ if (rc < 0) {
+ if (rc == -ENOMEM) {
+ SPDK_INFOLOG(SPDK_LOG_NBD, "No memory, start to queue io.\n");
+ nbd_queue_io(io);
+ } else {
+ SPDK_ERRLOG("nbd io failed in nbd_queue_io, rc=%d.\n", rc);
+ nbd_io_done(NULL, false, io);
+ }
+ }
+
+ return 0;
+}
+
+static int
+nbd_io_exec(struct spdk_nbd_disk *nbd)
+{
+ struct nbd_io *io, *io_tmp;
+ int io_count = 0;
+ int ret = 0;
+
+ /*
+ * For a soft disconnect, the nbd server must still execute all outstanding
+ * requests before closing the connection; only a hard disconnect skips execution.
+ */
+ if (nbd->state == NBD_DISK_STATE_HARDDISC) {
+ return 0;
+ }
+
+ if (!TAILQ_EMPTY(&nbd->received_io_list)) {
+ TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) {
+ TAILQ_REMOVE(&nbd->received_io_list, io, tailq);
+ ret = nbd_submit_bdev_io(nbd, io);
+ if (ret < 0) {
+ return ret;
+ }
+
+ io_count++;
+ }
+ }
+
+ return io_count;
+}
+
+static int
+nbd_io_recv_internal(struct spdk_nbd_disk *nbd)
+{
+ struct nbd_io *io;
+ int ret = 0;
+ int received = 0;
+
+ if (nbd->io_in_recv == NULL) {
+ nbd->io_in_recv = nbd_get_io(nbd);
+ if (!nbd->io_in_recv) {
+ return -ENOMEM;
+ }
+ }
+
+ io = nbd->io_in_recv;
+
+ if (io->state == NBD_IO_RECV_REQ) {
+ ret = read_from_socket(nbd->spdk_sp_fd, (char *)&io->req + io->offset,
+ sizeof(io->req) - io->offset);
+ if (ret < 0) {
+ nbd_put_io(nbd, io);
+ nbd->io_in_recv = NULL;
+ return ret;
+ }
+
+ io->offset += ret;
+ received = ret;
+
+ /* request is fully received */
+ if (io->offset == sizeof(io->req)) {
+ io->offset = 0;
+
+ /* req magic check */
+ if (from_be32(&io->req.magic) != NBD_REQUEST_MAGIC) {
+ SPDK_ERRLOG("invalid request magic\n");
+ nbd_put_io(nbd, io);
+ nbd->io_in_recv = NULL;
+ return -EINVAL;
+ }
+
+ /* io types other than read/write carry no payload */
+ if (from_be32(&io->req.type) == NBD_CMD_WRITE ||
+ from_be32(&io->req.type) == NBD_CMD_READ) {
+ io->payload_size = from_be32(&io->req.len);
+ } else {
+ io->payload_size = 0;
+ }
+
+ /* io payload allocate */
+ if (io->payload_size) {
+ io->payload = spdk_malloc(io->payload_size, nbd->buf_align, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (io->payload == NULL) {
+ SPDK_ERRLOG("could not allocate io->payload of size %d\n", io->payload_size);
+ nbd_put_io(nbd, io);
+ nbd->io_in_recv = NULL;
+ return -ENOMEM;
+ }
+ } else {
+ io->payload = NULL;
+ }
+
+ /* next io step */
+ if (from_be32(&io->req.type) == NBD_CMD_WRITE) {
+ io->state = NBD_IO_RECV_PAYLOAD;
+ } else {
+ io->state = NBD_IO_XMIT_RESP;
+ nbd->io_in_recv = NULL;
+ TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq);
+ }
+ }
+ }
+
+ if (io->state == NBD_IO_RECV_PAYLOAD) {
+ ret = read_from_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset);
+ if (ret < 0) {
+ nbd_put_io(nbd, io);
+ nbd->io_in_recv = NULL;
+ return ret;
+ }
+
+ io->offset += ret;
+ received += ret;
+
+ /* request payload is fully received */
+ if (io->offset == io->payload_size) {
+ io->offset = 0;
+ io->state = NBD_IO_XMIT_RESP;
+ nbd->io_in_recv = NULL;
+ TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq);
+ }
+
+ }
+
+ return received;
+}
+
+static int
+nbd_io_recv(struct spdk_nbd_disk *nbd)
+{
+ int i, rc, ret = 0;
+
+ /*
+ * The nbd server should not accept requests in either the soft or hard
+ * disconnect state.
+ */
+ if (nbd->state != NBD_DISK_STATE_RUNNING) {
+ return 0;
+ }
+
+ for (i = 0; i < GET_IO_LOOP_COUNT; i++) {
+ rc = nbd_io_recv_internal(nbd);
+ if (rc < 0) {
+ return rc;
+ }
+ ret += rc;
+ }
+
+ return ret;
+}
+
+static int
+nbd_io_xmit_internal(struct spdk_nbd_disk *nbd)
+{
+ struct nbd_io *io;
+ int ret = 0;
+ int sent = 0;
+
+ io = TAILQ_FIRST(&nbd->executed_io_list);
+ if (io == NULL) {
+ return 0;
+ }
+
+ /* Remove IO from list now assuming it will be completed. It will be inserted
+ * back to the head if it cannot be completed. This approach is specifically
+ * taken to work around a scan-build use-after-free mischaracterization.
+ */
+ TAILQ_REMOVE(&nbd->executed_io_list, io, tailq);
+
+ /* resp error and handle are already set in nbd_io_done */
+
+ if (io->state == NBD_IO_XMIT_RESP) {
+ ret = write_to_socket(nbd->spdk_sp_fd, (char *)&io->resp + io->offset,
+ sizeof(io->resp) - io->offset);
+ if (ret <= 0) {
+ goto reinsert;
+ }
+
+ io->offset += ret;
+ sent = ret;
+
+ /* response is fully transmitted */
+ if (io->offset == sizeof(io->resp)) {
+ io->offset = 0;
+
+ /* transmit payload only when NBD_CMD_READ with no resp error */
+ if (from_be32(&io->req.type) != NBD_CMD_READ || io->resp.error != 0) {
+ nbd_put_io(nbd, io);
+ return 0;
+ } else {
+ io->state = NBD_IO_XMIT_PAYLOAD;
+ }
+ }
+ }
+
+ if (io->state == NBD_IO_XMIT_PAYLOAD) {
+ ret = write_to_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset);
+ if (ret <= 0) {
+ goto reinsert;
+ }
+
+ io->offset += ret;
+ sent += ret;
+
+ /* read payload is fully transmitted */
+ if (io->offset == io->payload_size) {
+ nbd_put_io(nbd, io);
+ return sent;
+ }
+ }
+
+reinsert:
+ TAILQ_INSERT_HEAD(&nbd->executed_io_list, io, tailq);
+ return ret < 0 ? ret : sent;
+}
+
+static int
+nbd_io_xmit(struct spdk_nbd_disk *nbd)
+{
+ int ret = 0;
+ int rc;
+
+ /*
+ * For a soft disconnect, the nbd server must still transmit all outstanding
+ * responses before closing the connection; only a hard disconnect skips transmission.
+ */
+ if (nbd->state == NBD_DISK_STATE_HARDDISC) {
+ return 0;
+ }
+
+ while (!TAILQ_EMPTY(&nbd->executed_io_list)) {
+ rc = nbd_io_xmit_internal(nbd);
+ if (rc < 0) {
+ return rc;
+ }
+
+ ret += rc;
+ }
+
+ /*
+ * For a soft disconnect, the nbd server can close the connection once all
+ * outstanding requests have been transmitted.
+ */
+ if (nbd->state == NBD_DISK_STATE_SOFTDISC && !nbd_io_xmit_check(nbd)) {
+ return -1;
+ }
+
+ return ret;
+}
+
+/**
+ * Poll an NBD instance.
+ *
+ * \return the number of events processed (zero or more) on success, or a
+ * negated errno value on error (e.g. connection closed).
+ */
+static int
+_nbd_poll(struct spdk_nbd_disk *nbd)
+{
+ int received, sent, executed;
+
+ /* transmit executed io first */
+ sent = nbd_io_xmit(nbd);
+ if (sent < 0) {
+ return sent;
+ }
+
+ received = nbd_io_recv(nbd);
+ if (received < 0) {
+ return received;
+ }
+
+ executed = nbd_io_exec(nbd);
+ if (executed < 0) {
+ return executed;
+ }
+
+ return sent + received + executed;
+}
+
+static int
+nbd_poll(void *arg)
+{
+ struct spdk_nbd_disk *nbd = arg;
+ int rc;
+
+ rc = _nbd_poll(nbd);
+ if (rc < 0) {
+ SPDK_INFOLOG(SPDK_LOG_NBD, "nbd_poll() returned %s (%d); closing connection\n",
+ spdk_strerror(-rc), rc);
+ spdk_nbd_stop(nbd);
+ }
+
+ return rc > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
+}
+
+static void *
+nbd_start_kernel(void *arg)
+{
+ int dev_fd = (int)(intptr_t)arg;
+
+ spdk_unaffinitize_thread();
+
+ /* This will block in the kernel until we close the spdk_sp_fd. */
+ ioctl(dev_fd, NBD_DO_IT);
+
+ pthread_exit(NULL);
+}
+
+static void
+nbd_bdev_hot_remove(void *remove_ctx)
+{
+ struct spdk_nbd_disk *nbd = remove_ctx;
+
+ spdk_nbd_stop(nbd);
+}
+
+struct spdk_nbd_start_ctx {
+ struct spdk_nbd_disk *nbd;
+ spdk_nbd_start_cb cb_fn;
+ void *cb_arg;
+ struct spdk_poller *poller;
+ int polling_count;
+};
+
+static void
+nbd_start_complete(struct spdk_nbd_start_ctx *ctx)
+{
+ int rc;
+ pthread_t tid;
+ int flag;
+
+ /* Add nbd_disk to the end of disk list */
+ rc = nbd_disk_register(ctx->nbd);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to register %s, it should not happen.\n", ctx->nbd->nbd_path);
+ assert(false);
+ goto err;
+ }
+
+ rc = ioctl(ctx->nbd->dev_fd, NBD_SET_BLKSIZE, spdk_bdev_get_block_size(ctx->nbd->bdev));
+ if (rc == -1) {
+ SPDK_ERRLOG("ioctl(NBD_SET_BLKSIZE) failed: %s\n", spdk_strerror(errno));
+ rc = -errno;
+ goto err;
+ }
+
+ rc = ioctl(ctx->nbd->dev_fd, NBD_SET_SIZE_BLOCKS, spdk_bdev_get_num_blocks(ctx->nbd->bdev));
+ if (rc == -1) {
+ SPDK_ERRLOG("ioctl(NBD_SET_SIZE_BLOCKS) failed: %s\n", spdk_strerror(errno));
+ rc = -errno;
+ goto err;
+ }
+
+#ifdef NBD_FLAG_SEND_TRIM
+ rc = ioctl(ctx->nbd->dev_fd, NBD_SET_FLAGS, NBD_FLAG_SEND_TRIM);
+ if (rc == -1) {
+ SPDK_ERRLOG("ioctl(NBD_SET_FLAGS) failed: %s\n", spdk_strerror(errno));
+ rc = -errno;
+ goto err;
+ }
+#endif
+
+ rc = pthread_create(&tid, NULL, nbd_start_kernel, (void *)(intptr_t)ctx->nbd->dev_fd);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not create thread: %s\n", spdk_strerror(rc));
+ rc = -rc;
+ goto err;
+ }
+
+ rc = pthread_detach(tid);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not detach thread for nbd kernel: %s\n", spdk_strerror(rc));
+ rc = -rc;
+ goto err;
+ }
+
+ flag = fcntl(ctx->nbd->spdk_sp_fd, F_GETFL);
+ if (fcntl(ctx->nbd->spdk_sp_fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
+ ctx->nbd->spdk_sp_fd, spdk_strerror(errno));
+ rc = -errno;
+ goto err;
+ }
+
+ ctx->nbd->nbd_poller = SPDK_POLLER_REGISTER(nbd_poll, ctx->nbd, 0);
+
+ if (ctx->cb_fn) {
+ ctx->cb_fn(ctx->cb_arg, ctx->nbd, 0);
+ }
+
+ free(ctx);
+ return;
+
+err:
+ spdk_nbd_stop(ctx->nbd);
+ if (ctx->cb_fn) {
+ ctx->cb_fn(ctx->cb_arg, NULL, rc);
+ }
+ free(ctx);
+}
+
+static int
+nbd_enable_kernel(void *arg)
+{
+ struct spdk_nbd_start_ctx *ctx = arg;
+ int rc;
+
+ /* Declare device setup by this process */
+ rc = ioctl(ctx->nbd->dev_fd, NBD_SET_SOCK, ctx->nbd->kernel_sp_fd);
+ if (rc == -1) {
+ if (errno == EBUSY && ctx->polling_count-- > 0) {
+ if (ctx->poller == NULL) {
+ ctx->poller = SPDK_POLLER_REGISTER(nbd_enable_kernel, ctx,
+ NBD_BUSY_POLLING_INTERVAL_US);
+ }
+ /* If the kernel is busy, check back later */
+ return SPDK_POLLER_BUSY;
+ }
+
+ SPDK_ERRLOG("ioctl(NBD_SET_SOCK) failed: %s\n", spdk_strerror(errno));
+ if (ctx->poller) {
+ spdk_poller_unregister(&ctx->poller);
+ }
+
+ spdk_nbd_stop(ctx->nbd);
+
+ if (ctx->cb_fn) {
+ ctx->cb_fn(ctx->cb_arg, NULL, -errno);
+ }
+
+ free(ctx);
+ return SPDK_POLLER_BUSY;
+ }
+
+ if (ctx->poller) {
+ spdk_poller_unregister(&ctx->poller);
+ }
+
+ nbd_start_complete(ctx);
+
+ return SPDK_POLLER_BUSY;
+}
+
+void
+spdk_nbd_start(const char *bdev_name, const char *nbd_path,
+ spdk_nbd_start_cb cb_fn, void *cb_arg)
+{
+ struct spdk_nbd_start_ctx *ctx = NULL;
+ struct spdk_nbd_disk *nbd = NULL;
+ struct spdk_bdev *bdev;
+ int rc;
+ int sp[2];
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("no bdev %s exists\n", bdev_name);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ nbd = calloc(1, sizeof(*nbd));
+ if (nbd == NULL) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ nbd->dev_fd = -1;
+ nbd->spdk_sp_fd = -1;
+ nbd->kernel_sp_fd = -1;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ ctx->nbd = nbd;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+ ctx->polling_count = NBD_BUSY_WAITING_MS * 1000ULL / NBD_BUSY_POLLING_INTERVAL_US;
+
+ rc = spdk_bdev_open(bdev, true, nbd_bdev_hot_remove, nbd, &nbd->bdev_desc);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not open bdev %s, error=%d\n", spdk_bdev_get_name(bdev), rc);
+ goto err;
+ }
+
+ nbd->bdev = bdev;
+
+ nbd->ch = spdk_bdev_get_io_channel(nbd->bdev_desc);
+ nbd->buf_align = spdk_max(spdk_bdev_get_buf_align(bdev), 64);
+
+ rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sp);
+ if (rc != 0) {
+ SPDK_ERRLOG("socketpair failed\n");
+ rc = -errno;
+ goto err;
+ }
+
+ nbd->spdk_sp_fd = sp[0];
+ nbd->kernel_sp_fd = sp[1];
+ nbd->nbd_path = strdup(nbd_path);
+ if (!nbd->nbd_path) {
+ SPDK_ERRLOG("strdup allocation failure\n");
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ TAILQ_INIT(&nbd->received_io_list);
+ TAILQ_INIT(&nbd->executed_io_list);
+
+ /* Make sure nbd_path is not used in this SPDK app */
+ if (nbd_disk_find_by_nbd_path(nbd->nbd_path)) {
+ SPDK_NOTICELOG("%s is already exported\n", nbd->nbd_path);
+ rc = -EBUSY;
+ goto err;
+ }
+
+ nbd->dev_fd = open(nbd_path, O_RDWR);
+ if (nbd->dev_fd == -1) {
+ SPDK_ERRLOG("open(\"%s\") failed: %s\n", nbd_path, spdk_strerror(errno));
+ rc = -errno;
+ goto err;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_NBD, "Enabling kernel access to bdev %s via %s\n",
+ spdk_bdev_get_name(bdev), nbd_path);
+
+ nbd_enable_kernel(ctx);
+ return;
+
+err:
+ free(ctx);
+ if (nbd) {
+ spdk_nbd_stop(nbd);
+ }
+
+ if (cb_fn) {
+ cb_fn(cb_arg, NULL, rc);
+ }
+}
+
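+/*
+ * Minimal usage sketch (not part of the original source; the bdev name,
+ * callback and device path are hypothetical). Start-up is asynchronous and
+ * the result is delivered through the spdk_nbd_start_cb callback:
+ *
+ *    static void
+ *    app_nbd_started_cb(void *cb_arg, struct spdk_nbd_disk *nbd, int rc)
+ *    {
+ *        if (rc != 0) {
+ *            SPDK_ERRLOG("failed to start nbd: %s\n", spdk_strerror(-rc));
+ *            return;
+ *        }
+ *        SPDK_NOTICELOG("bdev exported at %s\n", spdk_nbd_get_path(nbd));
+ *    }
+ *
+ *    spdk_nbd_start("Malloc0", "/dev/nbd0", app_nbd_started_cb, NULL);
+ */
+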
+const char *
+spdk_nbd_get_path(struct spdk_nbd_disk *nbd)
+{
+ return nbd->nbd_path;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("nbd", SPDK_LOG_NBD)
diff --git a/src/spdk/lib/nbd/nbd_internal.h b/src/spdk/lib/nbd/nbd_internal.h
new file mode 100644
index 000000000..c0d7ee220
--- /dev/null
+++ b/src/spdk/lib/nbd/nbd_internal.h
@@ -0,0 +1,52 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_NBD_INTERNAL_H
+#define SPDK_NBD_INTERNAL_H
+
+#include "spdk/stdinc.h"
+#include "spdk/nbd.h"
+
+struct spdk_nbd_disk *nbd_disk_find_by_nbd_path(const char *nbd_path);
+
+struct spdk_nbd_disk *nbd_disk_first(void);
+
+struct spdk_nbd_disk *nbd_disk_next(struct spdk_nbd_disk *prev);
+
+const char *nbd_disk_get_nbd_path(struct spdk_nbd_disk *nbd);
+
+const char *nbd_disk_get_bdev_name(struct spdk_nbd_disk *nbd);
+
+void nbd_disconnect(struct spdk_nbd_disk *nbd);
+
+#endif /* SPDK_NBD_INTERNAL_H */
diff --git a/src/spdk/lib/nbd/nbd_rpc.c b/src/spdk/lib/nbd/nbd_rpc.c
new file mode 100644
index 000000000..a00c0a7e6
--- /dev/null
+++ b/src/spdk/lib/nbd/nbd_rpc.c
@@ -0,0 +1,422 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/string.h"
+#include "spdk/env.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
+#include <linux/nbd.h>
+
+#include "nbd_internal.h"
+#include "spdk_internal/log.h"
+
+struct rpc_nbd_start_disk {
+ char *bdev_name;
+ char *nbd_device;
+ /* Used to search for an available nbd device */
+ int nbd_idx;
+ bool nbd_idx_specified;
+ struct spdk_jsonrpc_request *request;
+};
+
+static void
+free_rpc_nbd_start_disk(struct rpc_nbd_start_disk *req)
+{
+ free(req->bdev_name);
+ free(req->nbd_device);
+ free(req);
+}
+
+static const struct spdk_json_object_decoder rpc_nbd_start_disk_decoders[] = {
+ {"bdev_name", offsetof(struct rpc_nbd_start_disk, bdev_name), spdk_json_decode_string},
+ {"nbd_device", offsetof(struct rpc_nbd_start_disk, nbd_device), spdk_json_decode_string, true},
+};
+
+/* Return 0 to indicate the nbd_device might be available,
+ * or non-zero to indicate the nbd_device is invalid or in use.
+ */
+static int
+check_available_nbd_disk(char *nbd_device)
+{
+ char nbd_block_path[256];
+ char tail[2];
+ int rc;
+ unsigned int nbd_idx;
+ struct spdk_nbd_disk *nbd;
+
+ /* The nbd device path must be in the format /dev/nbd<num>, with no trailing characters. */
+ rc = sscanf(nbd_device, "/dev/nbd%u%1s", &nbd_idx, tail);
+ if (rc != 1) {
+ return -errno;
+ }
+
+ /* make sure nbd_device is not registered inside SPDK */
+ nbd = nbd_disk_find_by_nbd_path(nbd_device);
+ if (nbd) {
+ /* nbd_device is in use */
+ return -EBUSY;
+ }
+
+ /* A valid pid file in /sys/block indicates the device is in use */
+ snprintf(nbd_block_path, 256, "/sys/block/nbd%u/pid", nbd_idx);
+
+ rc = open(nbd_block_path, O_RDONLY);
+ if (rc < 0) {
+ if (errno == ENOENT) {
+ /* nbd_device might be available */
+ return 0;
+ } else {
+ SPDK_ERRLOG("Failed to check PID file %s: %s\n", nbd_block_path, spdk_strerror(errno));
+ return -errno;
+ }
+ }
+
+ close(rc);
+
+ /* nbd_device is in use */
+ return -EBUSY;
+}
+
+static char *
+find_available_nbd_disk(int nbd_idx, int *next_nbd_idx)
+{
+ int i, rc;
+ char nbd_device[20];
+
+ for (i = nbd_idx; ; i++) {
+ snprintf(nbd_device, 20, "/dev/nbd%d", i);
+ /* Stop once the nbd device node no longer exists, i.e. we have passed the last nbd device */
+ rc = access(nbd_device, F_OK);
+ if (rc != 0) {
+ break;
+ }
+
+ rc = check_available_nbd_disk(nbd_device);
+ if (rc == 0) {
+ if (next_nbd_idx != NULL) {
+ *next_nbd_idx = i + 1;
+ }
+
+ return strdup(nbd_device);
+ }
+ }
+
+ return NULL;
+}
+
+static void
+rpc_start_nbd_done(void *cb_arg, struct spdk_nbd_disk *nbd, int rc)
+{
+ struct rpc_nbd_start_disk *req = cb_arg;
+ struct spdk_jsonrpc_request *request = req->request;
+ struct spdk_json_write_ctx *w;
+
+ /* Check whether it's automatic nbd-device assignment */
+ if (rc == -EBUSY && req->nbd_idx_specified == false) {
+ free(req->nbd_device);
+
+ req->nbd_device = find_available_nbd_disk(req->nbd_idx, &req->nbd_idx);
+ if (req->nbd_device != NULL) {
+ spdk_nbd_start(req->bdev_name, req->nbd_device,
+ rpc_start_nbd_done, req);
+ return;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_NBD, "There is no available nbd device.\n");
+ }
+
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc));
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, spdk_nbd_get_path(nbd));
+ spdk_jsonrpc_end_result(request, w);
+
+ free_rpc_nbd_start_disk(req);
+}
+
+static void
+rpc_nbd_start_disk(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_nbd_start_disk *req;
+ int rc;
+
+ req = calloc(1, sizeof(*req));
+ if (req == NULL) {
+ SPDK_ERRLOG("could not allocate nbd_start_disk request.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_nbd_start_disk_decoders,
+ SPDK_COUNTOF(rpc_nbd_start_disk_decoders),
+ req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto invalid;
+ }
+
+ if (req->bdev_name == NULL) {
+ goto invalid;
+ }
+
+ if (req->nbd_device != NULL) {
+ req->nbd_idx_specified = true;
+ rc = check_available_nbd_disk(req->nbd_device);
+ if (rc == -EBUSY) {
+ SPDK_DEBUGLOG(SPDK_LOG_NBD, "NBD device %s is in using.\n", req->nbd_device);
+ spdk_jsonrpc_send_error_response(request, -EBUSY, spdk_strerror(-rc));
+ goto invalid;
+ }
+
+ if (rc != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NBD, "Illegal nbd_device %s.\n", req->nbd_device);
+ spdk_jsonrpc_send_error_response_fmt(request, -ENODEV,
+ "illegal nbd device %s", req->nbd_device);
+ goto invalid;
+ }
+ } else {
+ req->nbd_idx = 0;
+ req->nbd_device = find_available_nbd_disk(req->nbd_idx, &req->nbd_idx);
+ if (req->nbd_device == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_NBD, "There is no available nbd device.\n");
+ spdk_jsonrpc_send_error_response(request, -ENODEV,
+ "nbd device not found");
+ goto invalid;
+ }
+ }
+
+ req->request = request;
+ spdk_nbd_start(req->bdev_name, req->nbd_device,
+ rpc_start_nbd_done, req);
+
+ return;
+
+invalid:
+ free_rpc_nbd_start_disk(req);
+}
+
+SPDK_RPC_REGISTER("nbd_start_disk", rpc_nbd_start_disk, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_start_disk, start_nbd_disk)
+
+struct rpc_nbd_stop_disk {
+ char *nbd_device;
+};
+
+static void
+free_rpc_nbd_stop_disk(struct rpc_nbd_stop_disk *req)
+{
+ free(req->nbd_device);
+}
+
+static const struct spdk_json_object_decoder rpc_nbd_stop_disk_decoders[] = {
+ {"nbd_device", offsetof(struct rpc_nbd_stop_disk, nbd_device), spdk_json_decode_string},
+};
+
+struct nbd_disconnect_arg {
+ struct spdk_jsonrpc_request *request;
+ struct spdk_nbd_disk *nbd;
+};
+
+static void *
+nbd_disconnect_thread(void *arg)
+{
+ struct nbd_disconnect_arg *thd_arg = arg;
+ struct spdk_json_write_ctx *w;
+
+ spdk_unaffinitize_thread();
+
+ nbd_disconnect(thd_arg->nbd);
+
+ w = spdk_jsonrpc_begin_result(thd_arg->request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(thd_arg->request, w);
+
+ free(thd_arg);
+ pthread_exit(NULL);
+}
+
+static void
+rpc_nbd_stop_disk(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_nbd_stop_disk req = {};
+ struct spdk_nbd_disk *nbd;
+ pthread_t tid;
+ struct nbd_disconnect_arg *thd_arg = NULL;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_nbd_stop_disk_decoders,
+ SPDK_COUNTOF(rpc_nbd_stop_disk_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto out;
+ }
+
+ if (req.nbd_device == NULL) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, "invalid nbd device");
+ goto out;
+ }
+
+ /* make sure nbd_device is registered */
+ nbd = nbd_disk_find_by_nbd_path(req.nbd_device);
+ if (!nbd) {
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto out;
+ }
+
+ /*
+ * thd_arg is freed by the created thread
+ * if the thread is created successfully.
+ */
+ thd_arg = malloc(sizeof(*thd_arg));
+ if (!thd_arg) {
+ SPDK_ERRLOG("could not allocate nbd disconnect thread arg\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ goto out;
+ }
+
+ thd_arg->request = request;
+ thd_arg->nbd = nbd;
+
+ /*
+ * The NBD disconnect ioctl blocks until data are flushed.
+ * Execute it in a separate thread.
+ */
+ rc = pthread_create(&tid, NULL, nbd_disconnect_thread, (void *)thd_arg);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not create nbd disconnect thread: %s\n", spdk_strerror(rc));
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(rc));
+ free(thd_arg);
+ goto out;
+ }
+
+ rc = pthread_detach(tid);
+ if (rc != 0) {
+ SPDK_ERRLOG("could not detach nbd disconnect thread: %s\n", spdk_strerror(rc));
+ goto out;
+ }
+
+out:
+ free_rpc_nbd_stop_disk(&req);
+}
+
+SPDK_RPC_REGISTER("nbd_stop_disk", rpc_nbd_stop_disk, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_stop_disk, stop_nbd_disk)
+
+static void
+rpc_dump_nbd_info(struct spdk_json_write_ctx *w,
+ struct spdk_nbd_disk *nbd)
+{
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "nbd_device", nbd_disk_get_nbd_path(nbd));
+
+ spdk_json_write_named_string(w, "bdev_name", nbd_disk_get_bdev_name(nbd));
+
+ spdk_json_write_object_end(w);
+}
+
+struct rpc_nbd_get_disks {
+ char *nbd_device;
+};
+
+static void
+free_rpc_nbd_get_disks(struct rpc_nbd_get_disks *r)
+{
+ free(r->nbd_device);
+}
+
+static const struct spdk_json_object_decoder rpc_nbd_get_disks_decoders[] = {
+ {"nbd_device", offsetof(struct rpc_nbd_get_disks, nbd_device), spdk_json_decode_string, true},
+};
+
+static void
+rpc_nbd_get_disks(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_nbd_get_disks req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_nbd_disk *nbd = NULL;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_nbd_get_disks_decoders,
+ SPDK_COUNTOF(rpc_nbd_get_disks_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto invalid;
+ }
+
+ if (req.nbd_device) {
+ nbd = nbd_disk_find_by_nbd_path(req.nbd_device);
+ if (nbd == NULL) {
+ SPDK_ERRLOG("nbd device '%s' does not exist\n", req.nbd_device);
+ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV));
+ goto invalid;
+ }
+
+ free_rpc_nbd_get_disks(&req);
+ }
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ if (nbd != NULL) {
+ rpc_dump_nbd_info(w, nbd);
+ } else {
+ for (nbd = nbd_disk_first(); nbd != NULL; nbd = nbd_disk_next(nbd)) {
+ rpc_dump_nbd_info(w, nbd);
+ }
+ }
+
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+
+ return;
+
+invalid:
+ free_rpc_nbd_get_disks(&req);
+}
+SPDK_RPC_REGISTER("nbd_get_disks", rpc_nbd_get_disks, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nbd_get_disks, get_nbd_disks)
diff --git a/src/spdk/lib/nbd/spdk_nbd.map b/src/spdk/lib/nbd/spdk_nbd.map
new file mode 100644
index 000000000..0b7d8de81
--- /dev/null
+++ b/src/spdk/lib/nbd/spdk_nbd.map
@@ -0,0 +1,13 @@
+{
+ global:
+
+ # public functions
+ spdk_nbd_init;
+ spdk_nbd_fini;
+ spdk_nbd_start;
+ spdk_nbd_stop;
+ spdk_nbd_get_path;
+ spdk_nbd_write_config_json;
+
+ local: *;
+};
diff --git a/src/spdk/lib/net/Makefile b/src/spdk/lib/net/Makefile
new file mode 100644
index 000000000..918df6cfb
--- /dev/null
+++ b/src/spdk/lib/net/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = interface.c net_rpc.c
+
+LIBNAME = net
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_net.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/net/interface.c b/src/spdk/lib/net/interface.c
new file mode 100644
index 000000000..358cbc308
--- /dev/null
+++ b/src/spdk/lib/net/interface.c
@@ -0,0 +1,551 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "net_internal.h"
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+
+#include "spdk/log.h"
+#include "spdk/net.h"
+
+#ifdef __linux__ /* Interface management is Linux-specific */
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+static TAILQ_HEAD(, spdk_interface) g_interface_head;
+
+static pthread_mutex_t interface_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static int get_ifc_ipv4(void)
+{
+ int ret;
+ int rtattrlen;
+ int netlink_fd;
+ uint32_t ipv4_addr;
+
+ struct {
+ struct nlmsghdr n;
+ struct ifaddrmsg r;
+ struct rtattr rta;
+ } req;
+ char buf[16384];
+ struct nlmsghdr *nlmp;
+ struct ifaddrmsg *rtmp;
+ struct rtattr *rtatp;
+ struct spdk_interface *ifc;
+
+ netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
+ if (netlink_fd < 0) {
+ SPDK_ERRLOG("socket failed!\n");
+ return 1;
+ }
+
+ /*
+ * Prepare a message structure
+ */
+ memset(&req, 0, sizeof(req));
+ req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
+ req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT;
+ req.n.nlmsg_type = RTM_GETADDR;
+
+ /* IPv4 only */
+ req.r.ifa_family = AF_INET;
+
+ /*
+ * Fill up all the attributes for the rtnetlink header.
+ */
+ assert(&req.rta == (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.n.nlmsg_len)));
+ req.rta.rta_len = RTA_LENGTH(16);
+
+ /* Send and recv the message from kernel */
+ ret = send(netlink_fd, &req, req.n.nlmsg_len, 0);
+ if (ret < 0) {
+ SPDK_ERRLOG("netlink send failed: %s\n", spdk_strerror(errno));
+ ret = 1;
+ goto exit;
+ }
+
+ ret = recv(netlink_fd, buf, sizeof(buf), 0);
+ if (ret <= 0) {
+ SPDK_ERRLOG("netlink recv failed: %s\n", spdk_strerror(errno));
+ ret = 1;
+ goto exit;
+ }
+
+ for (nlmp = (struct nlmsghdr *)buf; ret > (int)sizeof(*nlmp);) {
+ int len = nlmp->nlmsg_len;
+ int req_len = len - sizeof(*nlmp);
+
+ if (req_len < 0 || len > ret) {
+ SPDK_ERRLOG("error\n");
+ ret = 1;
+ goto exit;
+ }
+
+ if (!NLMSG_OK(nlmp, (uint32_t)ret)) {
+ SPDK_ERRLOG("NLMSG not OK\n");
+ ret = 1;
+ goto exit;
+ }
+
+ rtmp = (struct ifaddrmsg *)NLMSG_DATA(nlmp);
+ rtatp = (struct rtattr *)IFA_RTA(rtmp);
+
+ rtattrlen = IFA_PAYLOAD(nlmp);
+
+ for (; RTA_OK(rtatp, rtattrlen); rtatp = RTA_NEXT(rtatp, rtattrlen)) {
+ if (rtatp->rta_type == IFA_LOCAL) {
+ memcpy(&ipv4_addr, (struct in_addr *)RTA_DATA(rtatp),
+ sizeof(struct in_addr));
+ TAILQ_FOREACH(ifc, &g_interface_head, tailq) {
+ if (ifc->index == rtmp->ifa_index) {
+ /* add a new IP address to interface */
+ if (ifc->num_ip_addresses >= SPDK_MAX_IP_PER_IFC) {
+ SPDK_ERRLOG("SPDK: number of IP addresses supported for %s excceded. limit=%d\n",
+ ifc->name,
+ SPDK_MAX_IP_PER_IFC);
+ break;
+ }
+ ifc->ip_address[ifc->num_ip_addresses] = ipv4_addr;
+ ifc->num_ip_addresses++;
+ break;
+ }
+ }
+ }
+ }
+ ret -= NLMSG_ALIGN(len);
+ nlmp = (struct nlmsghdr *)((char *)nlmp + NLMSG_ALIGN(len));
+ }
+ ret = 0;
+
+exit:
+ close(netlink_fd);
+ return ret;
+}
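+
+/*
+ * Note (editor's sketch, not part of the original change): the RTM_GETADDR
+ * dump requested above returns conceptually the same information as the
+ * iproute2 command
+ *
+ *   ip -4 addr show
+ *
+ * Each IFA_LOCAL attribute in the reply is recorded against the matching
+ * interface in g_interface_head.
+ */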
+
+
+static int process_new_interface_msg(struct nlmsghdr *h)
+{
+ int len;
+ struct spdk_interface *ifc;
+ struct ifinfomsg *iface;
+ struct rtattr *attribute;
+
+ iface = (struct ifinfomsg *)NLMSG_DATA(h);
+
+ ifc = (struct spdk_interface *) malloc(sizeof(*ifc));
+ if (ifc == NULL) {
+ SPDK_ERRLOG("Malloc failed\n");
+ return 1;
+ }
+
+ memset(ifc, 0, sizeof(*ifc));
+
+ /* Set interface index */
+ ifc->index = iface->ifi_index;
+
+ len = h->nlmsg_len - NLMSG_LENGTH(sizeof(*iface));
+
+ /* Loop over all attributes for the NEWLINK message */
+ for (attribute = IFLA_RTA(iface); RTA_OK(attribute, len); attribute = RTA_NEXT(attribute, len)) {
+ switch (attribute->rta_type) {
+ case IFLA_IFNAME:
+ if (if_indextoname(iface->ifi_index, ifc->name) == NULL) {
+ SPDK_ERRLOG("Indextoname failed!\n");
+ free(ifc);
+ return 2;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ TAILQ_INSERT_TAIL(&g_interface_head, ifc, tailq);
+ return 0;
+}
+
+static int prepare_ifc_list(void)
+{
+ int ret = 0;
+ struct nl_req_s {
+ struct nlmsghdr hdr;
+ struct rtgenmsg gen;
+ struct ifinfomsg ifi;
+ };
+ int netlink_fd;
+ struct sockaddr_nl local; /* Our local (user space) side of the communication */
+ struct sockaddr_nl kernel; /* The remote (kernel space) side of the communication */
+
+ struct msghdr rtnl_msg; /* Generic msghdr struct for use with sendmsg */
+ struct iovec io; /* IO vector for sendmsg */
+
+ struct nl_req_s req; /* Structure that describes the rtnetlink packet itself */
+ char reply[16384]; /* a large buffer to receive lots of link information */
+
+ pid_t pid = getpid(); /* Our process ID to build the correct netlink address */
+ int end = 0; /* flag used to terminate the reply-parsing loop */
+
+ /*
+ * Prepare netlink socket for kernel/user space communication
+ */
+ netlink_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (netlink_fd < 0) {
+ SPDK_ERRLOG("socket failed!\n");
+ return 1;
+ }
+
+ memset(&local, 0, sizeof(local)); /* Fill-in local address information */
+ local.nl_family = AF_NETLINK;
+ local.nl_pid = pid;
+ local.nl_groups = 0;
+
+ /* RTNL socket is ready to use, prepare and send L2 request. */
+ memset(&rtnl_msg, 0, sizeof(rtnl_msg));
+ memset(&kernel, 0, sizeof(kernel));
+ memset(&req, 0, sizeof(req));
+
+ kernel.nl_family = AF_NETLINK; /* Fill-in kernel address (destination of our message) */
+
+ req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg));
+ req.hdr.nlmsg_type = RTM_GETLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = 1;
+ req.hdr.nlmsg_pid = pid;
+
+ req.ifi.ifi_family = AF_UNSPEC;
+ req.ifi.ifi_type = 1;
+
+ io.iov_base = &req;
+ io.iov_len = req.hdr.nlmsg_len;
+ rtnl_msg.msg_iov = &io;
+ rtnl_msg.msg_iovlen = 1;
+ rtnl_msg.msg_name = &kernel;
+ rtnl_msg.msg_namelen = sizeof(kernel);
+
+ if (sendmsg(netlink_fd, &rtnl_msg, 0) == -1) {
+ SPDK_ERRLOG("Sendmsg failed!\n");
+ ret = 1;
+ goto exit;
+ }
+
+ /* Parse reply */
+ while (!end) {
+ int len;
+ struct nlmsghdr *msg_ptr; /* Pointer to current message part */
+
+ struct msghdr rtnl_reply; /* Generic msghdr structure for use with recvmsg */
+ struct iovec io_reply;
+
+ memset(&io_reply, 0, sizeof(io_reply));
+ memset(&rtnl_reply, 0, sizeof(rtnl_reply));
+
+ io.iov_base = reply;
+ io.iov_len = 8192;
+ rtnl_reply.msg_iov = &io;
+ rtnl_reply.msg_iovlen = 1;
+ rtnl_reply.msg_name = &kernel;
+ rtnl_reply.msg_namelen = sizeof(kernel);
+
+ /* Read as much data as fits in the receive buffer */
+ len = recvmsg(netlink_fd, &rtnl_reply, 0);
+ if (len) {
+ for (msg_ptr = (struct nlmsghdr *) reply; NLMSG_OK(msg_ptr, (uint32_t)len);
+ msg_ptr = NLMSG_NEXT(msg_ptr, len)) {
+ switch (msg_ptr->nlmsg_type) {
+ case NLMSG_DONE: /* The NLMSG_DONE message that terminates the dump we requested with NLM_F_DUMP */
+ end++;
+ break;
+ case RTM_NEWLINK: /* An RTM_NEWLINK message, which carries the attributes of one link */
+ ret = process_new_interface_msg(msg_ptr);
+ if (ret != 0) {
+ goto exit;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+exit:
+ close(netlink_fd);
+ return ret;
+}
+
+static struct spdk_interface *
+interface_find_by_index(uint32_t ifc_index)
+{
+ struct spdk_interface *ifc_entry;
+
+ /* The mutex must be held by the caller */
+ TAILQ_FOREACH(ifc_entry, &g_interface_head, tailq) {
+ if (ifc_entry->index == ifc_index) {
+ return ifc_entry;
+ }
+ }
+
+ return NULL;
+}
+
+static int netlink_addr_msg(uint32_t ifc_idx, uint32_t ip_address, uint32_t create)
+{
+ int fd;
+ struct sockaddr_nl la;
+ struct sockaddr_nl pa;
+ struct msghdr msg;
+ struct iovec iov;
+ int ifal;
+ struct {
+ struct nlmsghdr n;
+ struct ifaddrmsg r;
+ char buf[16384];
+ } req;
+ struct rtattr *rta;
+
+ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (fd < 0) {
+ SPDK_ERRLOG("socket failed!\n");
+ return errno;
+ }
+
+ /* setup local address & bind using this address. */
+ bzero(&la, sizeof(la));
+ la.nl_family = AF_NETLINK;
+ la.nl_pid = getpid();
+ bind(fd, (struct sockaddr *) &la, sizeof(la));
+
+ /* initialize RTNETLINK request buffer. */
+ bzero(&req, sizeof(req));
+
+ /* compute the initial length of the service request. */
+ ifal = sizeof(struct ifaddrmsg);
+
+ /* add the first attribute: the IP address; extend the request length accordingly. */
+ rta = (struct rtattr *) req.buf;
+ rta->rta_type = IFA_ADDRESS;
+ rta->rta_len = sizeof(struct rtattr) + 4;
+ memcpy(((char *)rta) + sizeof(struct rtattr), &ip_address, sizeof(ip_address));
+ ifal += rta->rta_len;
+
+ /* add second attrib. */
+ rta = (struct rtattr *)(((char *)rta) + rta->rta_len);
+ rta->rta_type = IFA_LOCAL;
+ rta->rta_len = sizeof(struct rtattr) + 4;
+ memcpy(((char *)rta) + sizeof(struct rtattr), &ip_address, sizeof(ip_address));
+ ifal += rta->rta_len;
+
+ /* setup the NETLINK header. */
+ req.n.nlmsg_len = NLMSG_LENGTH(ifal);
+ if (create) {
+ req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_APPEND;
+ req.n.nlmsg_type = RTM_NEWADDR;
+ } else {
+ req.n.nlmsg_flags = NLM_F_REQUEST;
+ req.n.nlmsg_type = RTM_DELADDR;
+ }
+
+ /* setup the service header (struct ifaddrmsg). */
+ req.r.ifa_family = AF_INET;
+ req.r.ifa_prefixlen = 32; /* hardcoded */
+ req.r.ifa_flags = IFA_F_PERMANENT | IFA_F_SECONDARY;
+ req.r.ifa_index = ifc_idx;
+ req.r.ifa_scope = 0;
+
+ /* create the remote address to communicate. */
+ bzero(&pa, sizeof(pa));
+ pa.nl_family = AF_NETLINK;
+
+ /* initialize & create the struct msghdr supplied to the sendmsg() function. */
+ bzero(&msg, sizeof(msg));
+ msg.msg_name = (void *) &pa;
+ msg.msg_namelen = sizeof(pa);
+
+ /* place the pointer & size of the RTNETLINK message in the struct msghdr. */
+ iov.iov_base = (void *) &req.n;
+ iov.iov_len = req.n.nlmsg_len;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ /* send the RTNETLINK message to kernel. */
+ sendmsg(fd, &msg, 0);
+ close(fd);
+ return 0;
+}
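+
+/*
+ * Note (editor's sketch, not part of the original change): the
+ * RTM_NEWADDR/RTM_DELADDR requests built above are roughly equivalent to the
+ * iproute2 commands
+ *
+ *   ip addr add <addr>/32 dev <ifc>      (create != 0)
+ *   ip addr del <addr>/32 dev <ifc>      (create == 0)
+ *
+ * with the address flagged as permanent and secondary.
+ */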
+
+static void interface_ip_update(void)
+{
+ struct spdk_interface *ifc_entry;
+
+ pthread_mutex_lock(&interface_lock);
+ TAILQ_FOREACH(ifc_entry, &g_interface_head, tailq) {
+ ifc_entry->num_ip_addresses = 0;
+ memset(ifc_entry->ip_address, 0, sizeof(ifc_entry->ip_address));
+ }
+ get_ifc_ipv4();
+ pthread_mutex_unlock(&interface_lock);
+}
+
+static int
+interface_is_ip_address_in_use(int ifc_index, uint32_t addr, bool add)
+{
+ struct spdk_interface *ifc_entry;
+ bool in_use = false;
+ uint32_t idx = 0;
+
+ interface_ip_update();
+
+ pthread_mutex_lock(&interface_lock);
+ ifc_entry = interface_find_by_index(ifc_index);
+ if (ifc_entry == NULL) {
+ pthread_mutex_unlock(&interface_lock);
+ return -ENODEV;
+ }
+
+ for (idx = 0; idx < ifc_entry->num_ip_addresses; idx++) {
+ if (ifc_entry->ip_address[idx] == addr) {
+ in_use = true;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&interface_lock);
+
+ /* The IP address to add is already in use */
+ if (add == true && in_use == true) {
+ return -EADDRINUSE;
+ }
+
+ /* The IP address to delete is not in use */
+ if (add == false && in_use == false) {
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+int
+spdk_interface_init(void)
+{
+ int rc = 0;
+
+ TAILQ_INIT(&g_interface_head);
+ rc = prepare_ifc_list();
+ if (!rc) {
+ rc = get_ifc_ipv4();
+ }
+
+ return rc;
+}
+
+void
+spdk_interface_destroy(void)
+{
+ struct spdk_interface *ifc_entry;
+
+ while (!TAILQ_EMPTY(&g_interface_head)) {
+ ifc_entry = TAILQ_FIRST(&g_interface_head);
+ TAILQ_REMOVE(&g_interface_head, ifc_entry, tailq);
+ free(ifc_entry);
+ }
+}
+
+int
+interface_net_interface_add_ip_address(int ifc_index, char *ip_addr)
+{
+ uint32_t addr;
+ int ret;
+
+ addr = inet_addr(ip_addr);
+
+ ret = interface_is_ip_address_in_use(ifc_index, addr, true);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return netlink_addr_msg(ifc_index, addr, 1);
+}
+
+int
+interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr)
+{
+ uint32_t addr;
+ int ret;
+
+ addr = inet_addr(ip_addr);
+
+ ret = interface_is_ip_address_in_use(ifc_index, addr, false);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return netlink_addr_msg(ifc_index, addr, 0);
+}
+
+void *interface_get_list(void)
+{
+ interface_ip_update();
+ return &g_interface_head;
+}
+
+#else /* Not Linux */
+
+int
+spdk_interface_init(void)
+{
+ return 0;
+}
+
+void
+spdk_interface_destroy(void)
+{
+}
+
+int
+interface_net_interface_add_ip_address(int ifc_index, char *ip_addr)
+{
+ return -1;
+}
+
+int
+interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr)
+{
+ return -1;
+}
+
+void *
+interface_get_list(void)
+{
+ return NULL;
+}
+
+#endif
diff --git a/src/spdk/lib/net/net_internal.h b/src/spdk/lib/net/net_internal.h
new file mode 100644
index 000000000..4a1422939
--- /dev/null
+++ b/src/spdk/lib/net/net_internal.h
@@ -0,0 +1,79 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_NET_INTERNAL_H
+#define SPDK_NET_INTERNAL_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/queue.h"
+
+#define SPDK_IFNAMSIZE 32
+#define SPDK_MAX_IP_PER_IFC 32
+
+struct spdk_interface {
+ char name[SPDK_IFNAMSIZE];
+ uint32_t index;
+ uint32_t num_ip_addresses; /* number of IP addresses defined */
+ uint32_t ip_address[SPDK_MAX_IP_PER_IFC];
+ TAILQ_ENTRY(spdk_interface) tailq;
+};
+
+/**
+ * Add an IP address to the network interface.
+ *
+ * \param ifc_index Index of the network interface.
+ * \param ip_addr IP address to add.
+ *
+ * \return 0 on success, non-zero on failure.
+ */
+int interface_net_interface_add_ip_address(int ifc_index, char *ip_addr);
+
+/**
+ * Delete an IP address from the network interface.
+ *
+ * \param ifc_index Index of the network interface.
+ * \param ip_addr IP address to delete.
+ *
+ * \return 0 on success, non-zero on failure.
+ */
+int interface_net_interface_delete_ip_address(int ifc_index, char *ip_addr);
+
+/**
+ * Get the list of all the network interfaces.
+ *
+ * \return a pointer to the head of the linked list of all the network interfaces.
+ */
+void *interface_get_list(void);
+
+#endif /* SPDK_NET_INTERNAL_H */
diff --git a/src/spdk/lib/net/net_rpc.c b/src/spdk/lib/net/net_rpc.c
new file mode 100644
index 000000000..47a302a6b
--- /dev/null
+++ b/src/spdk/lib/net/net_rpc.c
@@ -0,0 +1,198 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "net_internal.h"
+
+#include "spdk/stdinc.h"
+
+#include "spdk/rpc.h"
+#include "spdk/net.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+struct rpc_ip_address {
+ int32_t ifc_index;
+ char *ip_address;
+};
+
+static void
+free_rpc_ip_address(struct rpc_ip_address *req)
+{
+ free(req->ip_address);
+}
+
+static const struct spdk_json_object_decoder rpc_ip_address_decoders[] = {
+ {"ifc_index", offsetof(struct rpc_ip_address, ifc_index), spdk_json_decode_int32},
+ {"ip_address", offsetof(struct rpc_ip_address, ip_address), spdk_json_decode_string},
+};
+
+static void
+rpc_net_interface_add_ip_address(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_ip_address req = {};
+ struct spdk_json_write_ctx *w;
+ int ret_val = 0;
+
+ if (spdk_json_decode_object(params, rpc_ip_address_decoders,
+ SPDK_COUNTOF(rpc_ip_address_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NET, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto invalid;
+ }
+
+ ret_val = interface_net_interface_add_ip_address(req.ifc_index, req.ip_address);
+ if (ret_val) {
+ if (ret_val == -ENODEV) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE,
+ "Interface %d not available", req.ifc_index);
+ } else if (ret_val == -EADDRINUSE) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "IP address %s is already added to interface %d",
+ req.ip_address, req.ifc_index);
+ } else {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ strerror(ret_val));
+ }
+ goto invalid;
+ }
+
+ free_rpc_ip_address(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_ip_address(&req);
+}
+SPDK_RPC_REGISTER("net_interface_add_ip_address", rpc_net_interface_add_ip_address,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_interface_add_ip_address, add_ip_address)
+
+static void
+rpc_net_interface_delete_ip_address(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_ip_address req = {};
+ struct spdk_json_write_ctx *w;
+ int ret_val = 0;
+
+ if (spdk_json_decode_object(params, rpc_ip_address_decoders,
+ SPDK_COUNTOF(rpc_ip_address_decoders),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NET, "spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "spdk_json_decode_object failed");
+ goto invalid;
+ }
+
+ ret_val = interface_net_interface_delete_ip_address(req.ifc_index, req.ip_address);
+ if (ret_val) {
+ if (ret_val == -ENODEV) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE,
+ "Interface %d not available", req.ifc_index);
+ } else if (ret_val == -ENXIO) {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "IP address %s is not found in interface %d",
+ req.ip_address, req.ifc_index);
+ } else {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ strerror(ret_val));
+ }
+ goto invalid;
+ }
+
+ free_rpc_ip_address(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_ip_address(&req);
+}
+SPDK_RPC_REGISTER("net_interface_delete_ip_address", rpc_net_interface_delete_ip_address,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_interface_delete_ip_address, delete_ip_address)
+
+static void
+rpc_net_get_interfaces(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ TAILQ_HEAD(, spdk_interface) *interface_head = interface_get_list();
+ struct spdk_interface *ifc;
+ char *ip_address;
+ struct in_addr inaddr;
+ uint32_t i;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "net_get_interfaces requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ TAILQ_FOREACH(ifc, interface_head, tailq) {
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "name", ifc->name);
+
+ spdk_json_write_named_int32(w, "ifc_index", ifc->index);
+
+ spdk_json_write_named_array_begin(w, "ip_addr");
+ for (i = 0; i < ifc->num_ip_addresses; i++) {
+ memcpy(&inaddr, &ifc->ip_address[i], sizeof(uint32_t));
+ ip_address = inet_ntoa(inaddr);
+ spdk_json_write_string(w, ip_address);
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("net_get_interfaces", rpc_net_get_interfaces, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(net_get_interfaces, get_interfaces)
+
+SPDK_LOG_REGISTER_COMPONENT("net", SPDK_LOG_NET)
diff --git a/src/spdk/lib/net/spdk_net.map b/src/spdk/lib/net/spdk_net.map
new file mode 100644
index 000000000..944bc4c6e
--- /dev/null
+++ b/src/spdk/lib/net/spdk_net.map
@@ -0,0 +1,9 @@
+{
+ global:
+
+ # public functions
+ spdk_interface_init;
+ spdk_interface_destroy;
+
+ local: *;
+};
diff --git a/src/spdk/lib/notify/Makefile b/src/spdk/lib/notify/Makefile
new file mode 100644
index 000000000..82249a5b2
--- /dev/null
+++ b/src/spdk/lib/notify/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = notify.c notify_rpc.c
+LIBNAME = notify
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_notify.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/notify/notify.c b/src/spdk/lib/notify/notify.c
new file mode 100644
index 000000000..88c5d633b
--- /dev/null
+++ b/src/spdk/lib/notify/notify.c
@@ -0,0 +1,150 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/queue.h>
+
+#include "spdk/stdinc.h"
+#include "spdk/util.h"
+#include "spdk/queue.h"
+#include "spdk/string.h"
+#include "spdk/log.h"
+
+#include "spdk/notify.h"
+
+#define SPDK_NOTIFY_MAX_EVENTS 1024
+
+struct spdk_notify_type {
+ char name[SPDK_NOTIFY_MAX_NAME_SIZE];
+ TAILQ_ENTRY(spdk_notify_type) tailq;
+};
+
+pthread_mutex_t g_events_lock = PTHREAD_MUTEX_INITIALIZER;
+static struct spdk_notify_event g_events[SPDK_NOTIFY_MAX_EVENTS];
+static uint64_t g_events_head;
+
+static TAILQ_HEAD(, spdk_notify_type) g_notify_types = TAILQ_HEAD_INITIALIZER(g_notify_types);
+
+struct spdk_notify_type *
+spdk_notify_type_register(const char *type)
+{
+ struct spdk_notify_type *it = NULL;
+
+ if (!type) {
+ SPDK_ERRLOG("Invalid notification type %p\n", type);
+ return NULL;
+ } else if (!type[0] || strlen(type) >= SPDK_NOTIFY_MAX_NAME_SIZE) {
+ SPDK_ERRLOG("Notification type '%s' too short or too long\n", type);
+ return NULL;
+ }
+
+ pthread_mutex_lock(&g_events_lock);
+ TAILQ_FOREACH(it, &g_notify_types, tailq) {
+ if (strcmp(type, it->name) == 0) {
+ SPDK_NOTICELOG("Notification type '%s' already registered.\n", type);
+ goto out;
+ }
+ }
+
+ it = calloc(1, sizeof(*it));
+ if (it == NULL) {
+ goto out;
+ }
+
+ snprintf(it->name, sizeof(it->name), "%s", type);
+ TAILQ_INSERT_TAIL(&g_notify_types, it, tailq);
+
+out:
+ pthread_mutex_unlock(&g_events_lock);
+ return it;
+}
+
+const char *
+spdk_notify_type_get_name(const struct spdk_notify_type *type)
+{
+ return type->name;
+}
+
+
+void
+spdk_notify_foreach_type(spdk_notify_foreach_type_cb cb, void *ctx)
+{
+ struct spdk_notify_type *it;
+
+ pthread_mutex_lock(&g_events_lock);
+ TAILQ_FOREACH(it, &g_notify_types, tailq) {
+ if (cb(it, ctx)) {
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_events_lock);
+}
+
+uint64_t
+spdk_notify_send(const char *type, const char *ctx)
+{
+ uint64_t head;
+ struct spdk_notify_event *ev;
+
+ pthread_mutex_lock(&g_events_lock);
+ head = g_events_head;
+ g_events_head++;
+
+ ev = &g_events[head % SPDK_NOTIFY_MAX_EVENTS];
+ spdk_strcpy_pad(ev->type, type, sizeof(ev->type), '\0');
+ spdk_strcpy_pad(ev->ctx, ctx, sizeof(ev->ctx), '\0');
+ pthread_mutex_unlock(&g_events_lock);
+
+ return head;
+}
+
+uint64_t
+spdk_notify_foreach_event(uint64_t start_idx, uint64_t max,
+ spdk_notify_foreach_event_cb cb_fn, void *ctx)
+{
+ uint64_t i;
+
+ pthread_mutex_lock(&g_events_lock);
+
+ if (g_events_head > SPDK_NOTIFY_MAX_EVENTS && start_idx < g_events_head - SPDK_NOTIFY_MAX_EVENTS) {
+ start_idx = g_events_head - SPDK_NOTIFY_MAX_EVENTS;
+ }
+
+ for (i = 0; start_idx < g_events_head && i < max; start_idx++, i++) {
+ if (cb_fn(start_idx, &g_events[start_idx % SPDK_NOTIFY_MAX_EVENTS], ctx)) {
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_events_lock);
+
+ return i;
+}
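+
+/*
+ * Illustrative usage (editor's sketch, not part of the original change): a
+ * subsystem registers its event type once and then publishes events by name,
+ * e.g.
+ *
+ *   spdk_notify_type_register("my_event");
+ *   spdk_notify_send("my_event", "some-object-name");
+ *
+ * ("my_event" is just a placeholder type name.) Consumers replay the ring with
+ * spdk_notify_foreach_event(); only the last SPDK_NOTIFY_MAX_EVENTS entries
+ * are retained.
+ */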
diff --git a/src/spdk/lib/notify/notify_rpc.c b/src/spdk/lib/notify/notify_rpc.c
new file mode 100644
index 000000000..fc40502c2
--- /dev/null
+++ b/src/spdk/lib/notify/notify_rpc.c
@@ -0,0 +1,126 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "spdk/rpc.h"
+#include "spdk/string.h"
+#include "spdk/notify.h"
+#include "spdk/env.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+static int
+notify_get_types_cb(const struct spdk_notify_type *type, void *ctx)
+{
+ spdk_json_write_string((struct spdk_json_write_ctx *)ctx, spdk_notify_type_get_name(type));
+ return 0;
+}
+
+static void
+rpc_notify_get_types(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "No parameters required");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ spdk_notify_foreach_type(notify_get_types_cb, w);
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("notify_get_types", rpc_notify_get_types, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(notify_get_types, get_notification_types)
+
+struct rpc_notify_get_notifications {
+ uint64_t id;
+ uint64_t max;
+
+ struct spdk_json_write_ctx *w;
+};
+
+static const struct spdk_json_object_decoder rpc_notify_get_notifications_decoders[] = {
+ {"id", offsetof(struct rpc_notify_get_notifications, id), spdk_json_decode_uint64, true},
+ {"max", offsetof(struct rpc_notify_get_notifications, max), spdk_json_decode_uint64, true},
+};
+
+
+static int
+notify_get_notifications_cb(uint64_t id, const struct spdk_notify_event *ev, void *ctx)
+{
+ struct rpc_notify_get_notifications *req = ctx;
+
+ spdk_json_write_object_begin(req->w);
+ spdk_json_write_named_string(req->w, "type", ev->type);
+ spdk_json_write_named_string(req->w, "ctx", ev->ctx);
+ spdk_json_write_named_uint64(req->w, "id", id);
+ spdk_json_write_object_end(req->w);
+ return 0;
+}
+
+static void
+rpc_notify_get_notifications(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_notify_get_notifications req = {0, UINT64_MAX};
+
+ if (params &&
+ spdk_json_decode_object(params, rpc_notify_get_notifications_decoders,
+ SPDK_COUNTOF(rpc_notify_get_notifications_decoders), &req)) {
+ SPDK_DEBUGLOG(SPDK_NOTIFY_RPC, "spdk_json_decode_object failed\n");
+
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(EINVAL));
+ return;
+ }
+
+
+ req.w = spdk_jsonrpc_begin_result(request);
+
+ spdk_json_write_array_begin(req.w);
+ spdk_notify_foreach_event(req.id, req.max, notify_get_notifications_cb, &req);
+ spdk_json_write_array_end(req.w);
+
+ spdk_jsonrpc_end_result(request, req.w);
+}
+SPDK_RPC_REGISTER("notify_get_notifications", rpc_notify_get_notifications, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(notify_get_notifications, get_notifications)
+
+SPDK_LOG_REGISTER_COMPONENT("notify_rpc", SPDK_NOTIFY_RPC)
diff --git a/src/spdk/lib/notify/spdk_notify.map b/src/spdk/lib/notify/spdk_notify.map
new file mode 100644
index 000000000..4023a8e66
--- /dev/null
+++ b/src/spdk/lib/notify/spdk_notify.map
@@ -0,0 +1,10 @@
+{
+ global:
+ spdk_notify_type_register;
+ spdk_notify_type_get_name;
+ spdk_notify_foreach_type;
+ spdk_notify_send;
+ spdk_notify_foreach_event;
+
+ local: *;
+};
diff --git a/src/spdk/lib/nvme/Makefile b/src/spdk/lib/nvme/Makefile
new file mode 100644
index 000000000..1c02965f5
--- /dev/null
+++ b/src/spdk/lib/nvme/Makefile
@@ -0,0 +1,73 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 4
+SO_MINOR := 0
+
+C_SRCS = nvme_ctrlr_cmd.c nvme_ctrlr.c nvme_fabric.c nvme_ns_cmd.c nvme_ns.c nvme_pcie.c nvme_qpair.c nvme.c nvme_quirks.c nvme_transport.c nvme_uevent.c nvme_ctrlr_ocssd_cmd.c \
+ nvme_ns_ocssd_cmd.c nvme_tcp.c nvme_opal.c nvme_io_msg.c nvme_poll_group.c
+C_SRCS-$(CONFIG_RDMA) += nvme_rdma.c
+C_SRCS-$(CONFIG_NVME_CUSE) += nvme_cuse.c
+
+LIBNAME = nvme
+LOCAL_SYS_LIBS = -luuid
+ifeq ($(CONFIG_RDMA),y)
+LOCAL_SYS_LIBS += -libverbs -lrdmacm
+# Attach these libraries only on FreeBSD when RDMA is enabled at configure time
+ifeq ($(OS),FreeBSD)
+# Mellanox - MLX4 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx4.*)","")
+LOCAL_SYS_LIBS += -lmlx4
+endif
+# Mellanox - MLX5 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx5.*)","")
+LOCAL_SYS_LIBS += -lmlx5
+endif
+# Chelsio HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libcxgb4.*)","")
+LOCAL_SYS_LIBS += -lcxgb4
+endif
+endif
+endif
+
+ifeq ($(CONFIG_NVME_CUSE),y)
+# FUSE requires _FILE_OFFSET_BITS to be set to 64, even on 64-bit machines
+CFLAGS += -D_FILE_OFFSET_BITS=64
+endif
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nvme.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/nvme/nvme.c b/src/spdk/lib/nvme/nvme.c
new file mode 100644
index 000000000..9393810a6
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme.c
@@ -0,0 +1,1423 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/nvmf_spec.h"
+#include "spdk/string.h"
+#include "nvme_internal.h"
+#include "nvme_io_msg.h"
+#include "nvme_uevent.h"
+
+#define SPDK_NVME_DRIVER_NAME "spdk_nvme_driver"
+
+struct nvme_driver *g_spdk_nvme_driver;
+pid_t g_spdk_nvme_pid;
+
+/* gross timeout of 180 seconds in milliseconds */
+static int g_nvme_driver_timeout_ms = 3 * 60 * 1000;
+
+/* Per-process attached controller list */
+static TAILQ_HEAD(, spdk_nvme_ctrlr) g_nvme_attached_ctrlrs =
+ TAILQ_HEAD_INITIALIZER(g_nvme_attached_ctrlrs);
+
+/* Returns true if ctrlr should be stored on the multi-process shared_attached_ctrlrs list */
+static bool
+nvme_ctrlr_shared(const struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE;
+}
+
+void
+nvme_ctrlr_connected(struct spdk_nvme_probe_ctx *probe_ctx,
+ struct spdk_nvme_ctrlr *ctrlr)
+{
+ TAILQ_INSERT_TAIL(&probe_ctx->init_ctrlrs, ctrlr, tailq);
+}
+
+int
+spdk_nvme_detach(struct spdk_nvme_ctrlr *ctrlr)
+{
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+
+ nvme_ctrlr_proc_put_ref(ctrlr);
+
+ if (nvme_ctrlr_get_ref_count(ctrlr) == 0) {
+ nvme_io_msg_ctrlr_detach(ctrlr);
+ if (nvme_ctrlr_shared(ctrlr)) {
+ TAILQ_REMOVE(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq);
+ } else {
+ TAILQ_REMOVE(&g_nvme_attached_ctrlrs, ctrlr, tailq);
+ }
+ nvme_ctrlr_destruct(ctrlr);
+ }
+
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ return 0;
+}
+
+void
+nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_completion_poll_status *status = arg;
+
+ if (status->timed_out) {
+ /* There is no routine waiting for the completion of this request, free allocated memory */
+ free(status);
+ return;
+ }
+
+ /*
+ * Copy status into the argument passed by the caller, so that
+ * the caller can check the status to determine whether
+ * the request passed or failed.
+ */
+ memcpy(&status->cpl, cpl, sizeof(*cpl));
+ status->done = true;
+}
+
+/**
+ * Poll qpair for completions until a command completes.
+ *
+ * \param qpair queue to poll
+ * \param status completion status. The user must fill this structure with zeroes before calling
+ * this function
+ * \param robust_mutex optional robust mutex to lock while polling qpair
+ *
+ * \return 0 if command completed without error,
+ * -EIO if command completed with error,
+ * -ECANCELED if command is not completed due to transport/device error
+ *
+ * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback
+ * and status as the callback argument.
+ */
+int
+nvme_wait_for_completion_robust_lock(
+ struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status,
+ pthread_mutex_t *robust_mutex)
+{
+ int rc;
+
+ while (status->done == false) {
+ if (robust_mutex) {
+ nvme_robust_mutex_lock(robust_mutex);
+ }
+
+ rc = spdk_nvme_qpair_process_completions(qpair, 0);
+
+ if (robust_mutex) {
+ nvme_robust_mutex_unlock(robust_mutex);
+ }
+
+ if (rc < 0) {
+ status->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ status->cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
+ if (status->done == false) {
+ status->timed_out = true;
+ }
+ return -ECANCELED;
+ }
+ }
+
+ return spdk_nvme_cpl_is_error(&status->cpl) ? -EIO : 0;
+}
+
+int
+nvme_wait_for_completion(struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status)
+{
+ return nvme_wait_for_completion_robust_lock(qpair, status, NULL);
+}
+
+/**
+ * Poll qpair for completions until a command completes.
+ *
+ * \param qpair queue to poll
+ * \param status completion status. The user must fill this structure with zeroes before calling
+ * this function
+ * \param timeout_in_secs optional timeout
+ *
+ * \return 0 if command completed without error,
+ * -EIO if command completed with error,
+ * -ECANCELED if command is not completed due to transport/device error or time expired
+ *
+ * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback
+ * and status as the callback argument.
+ */
+int
+nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status,
+ uint64_t timeout_in_secs)
+{
+ uint64_t timeout_tsc = 0;
+ int rc = 0;
+
+ if (timeout_in_secs) {
+ timeout_tsc = spdk_get_ticks() + timeout_in_secs * spdk_get_ticks_hz();
+ }
+
+ while (status->done == false) {
+ rc = spdk_nvme_qpair_process_completions(qpair, 0);
+
+ if (rc < 0) {
+ status->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ status->cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
+ break;
+ }
+ if (timeout_tsc && spdk_get_ticks() > timeout_tsc) {
+ break;
+ }
+ }
+
+ if (status->done == false || rc < 0) {
+ if (status->done == false) {
+ status->timed_out = true;
+ }
+ return -ECANCELED;
+ }
+
+ return spdk_nvme_cpl_is_error(&status->cpl) ? -EIO : 0;
+}
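+
+/*
+ * Illustrative usage (editor's sketch, not part of the original change):
+ * callers allocate a zeroed status, submit a command with
+ * nvme_completion_poll_cb as its completion callback and the status as the
+ * callback argument, then poll, e.g.
+ *
+ *   struct nvme_completion_poll_status *status;
+ *
+ *   status = calloc(1, sizeof(*status));
+ *   ...submit command with (nvme_completion_poll_cb, status)...
+ *   rc = nvme_wait_for_completion(qpair, status);
+ *   if (!status->timed_out) {
+ *           free(status);
+ *   }
+ *
+ * If the wait times out, the callback frees the status when it eventually
+ * fires (see nvme_completion_poll_cb above).
+ */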
+
+static void
+nvme_user_copy_cmd_complete(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_request *req = arg;
+ enum spdk_nvme_data_transfer xfer;
+
+ if (req->user_buffer && req->payload_size) {
+ /* Copy back to the user buffer and free the contig buffer */
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
+ xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc);
+ if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST ||
+ xfer == SPDK_NVME_DATA_BIDIRECTIONAL) {
+ assert(req->pid == getpid());
+ memcpy(req->user_buffer, req->payload.contig_or_cb_arg, req->payload_size);
+ }
+
+ spdk_free(req->payload.contig_or_cb_arg);
+ }
+
+ /* Call the user's original callback now that the buffer has been copied */
+ req->user_cb_fn(req->user_cb_arg, cpl);
+}
+
+/**
+ * Allocate a request as well as a DMA-capable buffer to copy to/from the user's buffer.
+ *
+ * This is intended for use in non-fast-path functions (admin commands, reservations, etc.)
+ * where the overhead of a copy is not a problem.
+ */
+struct nvme_request *
+nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair,
+ void *buffer, uint32_t payload_size, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg, bool host_to_controller)
+{
+ struct nvme_request *req;
+ void *dma_buffer = NULL;
+
+ if (buffer && payload_size) {
+ dma_buffer = spdk_zmalloc(payload_size, 4096, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!dma_buffer) {
+ return NULL;
+ }
+
+ if (host_to_controller) {
+ memcpy(dma_buffer, buffer, payload_size);
+ }
+ }
+
+ req = nvme_allocate_request_contig(qpair, dma_buffer, payload_size, nvme_user_copy_cmd_complete,
+ NULL);
+ if (!req) {
+ spdk_free(dma_buffer);
+ return NULL;
+ }
+
+ req->user_cb_fn = cb_fn;
+ req->user_cb_arg = cb_arg;
+ req->user_buffer = buffer;
+ req->cb_arg = req;
+
+ return req;
+}
+
+/**
+ * Check if a request has exceeded the controller timeout.
+ *
+ * \param req request to check for timeout.
+ * \param cid command ID for command submitted by req (will be passed to timeout_cb_fn)
+ * \param active_proc per-process data for the controller associated with req
+ * \param now_tick current time from spdk_get_ticks()
+ * \return 0 if requests submitted more recently than req should still be checked for timeouts, or
+ * 1 if requests newer than req need not be checked.
+ *
+ * The request's timeout callback will be called if needed; the caller is only responsible for
+ * calling this function on each outstanding request.
+ */
+int
+nvme_request_check_timeout(struct nvme_request *req, uint16_t cid,
+ struct spdk_nvme_ctrlr_process *active_proc,
+ uint64_t now_tick)
+{
+ struct spdk_nvme_qpair *qpair = req->qpair;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+
+ assert(active_proc->timeout_cb_fn != NULL);
+
+ if (req->timed_out || req->submit_tick == 0) {
+ return 0;
+ }
+
+ if (req->pid != g_spdk_nvme_pid) {
+ return 0;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair) &&
+ req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
+ return 0;
+ }
+
+ if (req->submit_tick + active_proc->timeout_ticks > now_tick) {
+ return 1;
+ }
+
+ req->timed_out = true;
+
+ /*
+ * We don't want to expose the admin queue to the user,
+ * so when we're timing out admin commands set the
+ * qpair to NULL.
+ */
+ active_proc->timeout_cb_fn(active_proc->timeout_cb_arg, ctrlr,
+ nvme_qpair_is_admin_queue(qpair) ? NULL : qpair,
+ cid);
+ return 0;
+}
+
+int
+nvme_robust_mutex_init_shared(pthread_mutex_t *mtx)
+{
+ int rc = 0;
+
+#ifdef __FreeBSD__
+ pthread_mutex_init(mtx, NULL);
+#else
+ pthread_mutexattr_t attr;
+
+ if (pthread_mutexattr_init(&attr)) {
+ return -1;
+ }
+ if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) ||
+ pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) ||
+ pthread_mutex_init(mtx, &attr)) {
+ rc = -1;
+ }
+ pthread_mutexattr_destroy(&attr);
+#endif
+
+ return rc;
+}
+
+int
+nvme_driver_init(void)
+{
+ static pthread_mutex_t g_init_mutex = PTHREAD_MUTEX_INITIALIZER;
+ int ret = 0;
+ /* Any socket ID */
+ int socket_id = -1;
+
+ /* Use a special process-private mutex to ensure the global
+ * nvme driver object (g_spdk_nvme_driver) gets initialized by
+ * only one thread. Once that object is established and its
+ * mutex is initialized, we can unlock this mutex and use that
+ * one instead.
+ */
+ pthread_mutex_lock(&g_init_mutex);
+
+ /* Each process needs its own pid. */
+ g_spdk_nvme_pid = getpid();
+
+ /*
+ * Only one thread from one process will do this driver init work.
+ * The primary process will reserve the shared memory and do the
+ * initialization.
+ * The secondary process will lookup the existing reserved memory.
+ */
+ if (spdk_process_is_primary()) {
+ /* The uniquely named memzone has already been reserved. */
+ if (g_spdk_nvme_driver != NULL) {
+ pthread_mutex_unlock(&g_init_mutex);
+ return 0;
+ } else {
+ g_spdk_nvme_driver = spdk_memzone_reserve(SPDK_NVME_DRIVER_NAME,
+ sizeof(struct nvme_driver), socket_id,
+ SPDK_MEMZONE_NO_IOVA_CONTIG);
+ }
+
+ if (g_spdk_nvme_driver == NULL) {
+ SPDK_ERRLOG("primary process failed to reserve memory\n");
+ pthread_mutex_unlock(&g_init_mutex);
+ return -1;
+ }
+ } else {
+ g_spdk_nvme_driver = spdk_memzone_lookup(SPDK_NVME_DRIVER_NAME);
+
+ /* The uniquely named memzone was already reserved by the primary process. */
+ if (g_spdk_nvme_driver != NULL) {
+ int ms_waited = 0;
+
+ /* Wait for the nvme driver to be initialized. */
+ while ((g_spdk_nvme_driver->initialized == false) &&
+ (ms_waited < g_nvme_driver_timeout_ms)) {
+ ms_waited++;
+ nvme_delay(1000); /* delay 1ms */
+ }
+ if (g_spdk_nvme_driver->initialized == false) {
+ SPDK_ERRLOG("timeout waiting for primary process to init\n");
+ pthread_mutex_unlock(&g_init_mutex);
+ return -1;
+ }
+ } else {
+ SPDK_ERRLOG("primary process is not started yet\n");
+ pthread_mutex_unlock(&g_init_mutex);
+ return -1;
+ }
+
+ pthread_mutex_unlock(&g_init_mutex);
+ return 0;
+ }
+
+ /*
+ * At this moment, only one thread from the primary process will do
+ * the g_spdk_nvme_driver initialization
+ */
+ assert(spdk_process_is_primary());
+
+ ret = nvme_robust_mutex_init_shared(&g_spdk_nvme_driver->lock);
+ if (ret != 0) {
+ SPDK_ERRLOG("failed to initialize mutex\n");
+ spdk_memzone_free(SPDK_NVME_DRIVER_NAME);
+ pthread_mutex_unlock(&g_init_mutex);
+ return ret;
+ }
+
+ /* The lock in the shared g_spdk_nvme_driver object is now ready to
+ * be used - so we can unlock the g_init_mutex here.
+ */
+ pthread_mutex_unlock(&g_init_mutex);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+
+ g_spdk_nvme_driver->initialized = false;
+ g_spdk_nvme_driver->hotplug_fd = nvme_uevent_connect();
+ if (g_spdk_nvme_driver->hotplug_fd < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n");
+ }
+
+ TAILQ_INIT(&g_spdk_nvme_driver->shared_attached_ctrlrs);
+
+ spdk_uuid_generate(&g_spdk_nvme_driver->default_extended_host_id);
+
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+
+ return ret;
+}
+
+/* This function must only be called while holding g_spdk_nvme_driver->lock */
+int
+nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_probe_ctx *probe_ctx, void *devhandle)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct spdk_nvme_ctrlr_opts opts;
+
+ assert(trid != NULL);
+
+ spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
+
+ if (!probe_ctx->probe_cb || probe_ctx->probe_cb(probe_ctx->cb_ctx, trid, &opts)) {
+ ctrlr = nvme_get_ctrlr_by_trid_unsafe(trid);
+ if (ctrlr) {
+ /* This ctrlr already exists.
+ * Increase the ref count before calling attach_cb() as the user may
+ * call nvme_detach() immediately. */
+ nvme_ctrlr_proc_get_ref(ctrlr);
+
+ if (probe_ctx->attach_cb) {
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ }
+ return 0;
+ }
+
+ ctrlr = nvme_transport_ctrlr_construct(trid, &opts, devhandle);
+ if (ctrlr == NULL) {
+ SPDK_ERRLOG("Failed to construct NVMe controller for SSD: %s\n", trid->traddr);
+ return -1;
+ }
+ ctrlr->remove_cb = probe_ctx->remove_cb;
+ ctrlr->cb_ctx = probe_ctx->cb_ctx;
+
+ if (ctrlr->quirks & NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE &&
+ ctrlr->opts.io_queue_size == DEFAULT_IO_QUEUE_SIZE) {
+ /* If the user specifically set an IO queue size different than the
+ * default, use that value. Otherwise overwrite with the quirked value.
+ * This allows this quirk to be overridden when necessary.
+ * However, cap.mqes still needs to be respected.
+ */
+ ctrlr->opts.io_queue_size = spdk_min(DEFAULT_IO_QUEUE_SIZE_FOR_QUIRK, ctrlr->cap.bits.mqes + 1u);
+ }
+
+ nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_ENABLED);
+ TAILQ_INSERT_TAIL(&probe_ctx->init_ctrlrs, ctrlr, tailq);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int
+nvme_ctrlr_poll_internal(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_probe_ctx *probe_ctx)
+{
+ int rc = 0;
+
+ rc = nvme_ctrlr_process_init(ctrlr);
+
+ if (rc) {
+ /* Controller failed to initialize. */
+ TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq);
+ SPDK_ERRLOG("Failed to initialize SSD: %s\n", ctrlr->trid.traddr);
+ nvme_ctrlr_fail(ctrlr, false);
+ nvme_ctrlr_destruct(ctrlr);
+ return rc;
+ }
+
+ if (ctrlr->state != NVME_CTRLR_STATE_READY) {
+ return 0;
+ }
+
+ STAILQ_INIT(&ctrlr->io_producers);
+
+ /*
+ * Controller has been initialized.
+ * Move it to the attached_ctrlrs list.
+ */
+ TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq);
+
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ if (nvme_ctrlr_shared(ctrlr)) {
+ TAILQ_INSERT_TAIL(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq);
+ } else {
+ TAILQ_INSERT_TAIL(&g_nvme_attached_ctrlrs, ctrlr, tailq);
+ }
+
+ /*
+ * Increase the ref count before calling attach_cb() as the user may
+ * call nvme_detach() immediately.
+ */
+ nvme_ctrlr_proc_get_ref(ctrlr);
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+
+ if (probe_ctx->attach_cb) {
+ probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts);
+ return 0;
+ }
+
+ return 0;
+}
+
+static int
+nvme_init_controllers(struct spdk_nvme_probe_ctx *probe_ctx)
+{
+ int rc = 0;
+
+ while (true) {
+ rc = spdk_nvme_probe_poll_async(probe_ctx);
+ if (rc != -EAGAIN) {
+ return rc;
+ }
+ }
+
+ return rc;
+}
+
+/* This function must not be called while holding g_spdk_nvme_driver->lock */
+static struct spdk_nvme_ctrlr *
+nvme_get_ctrlr_by_trid(const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ ctrlr = nvme_get_ctrlr_by_trid_unsafe(trid);
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+
+ return ctrlr;
+}
+
+/* This function must be called while holding g_spdk_nvme_driver->lock */
+struct spdk_nvme_ctrlr *
+nvme_get_ctrlr_by_trid_unsafe(const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ /* Search per-process list */
+ TAILQ_FOREACH(ctrlr, &g_nvme_attached_ctrlrs, tailq) {
+ if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) {
+ return ctrlr;
+ }
+ }
+
+ /* Search multi-process shared list */
+ TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) {
+ if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) {
+ return ctrlr;
+ }
+ }
+
+ return NULL;
+}
+
+/* This function must only be called while holding g_spdk_nvme_driver->lock */
+static int
+nvme_probe_internal(struct spdk_nvme_probe_ctx *probe_ctx,
+ bool direct_connect)
+{
+ int rc;
+ struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp;
+
+ spdk_nvme_trid_populate_transport(&probe_ctx->trid, probe_ctx->trid.trtype);
+ if (!spdk_nvme_transport_available_by_name(probe_ctx->trid.trstring)) {
+ SPDK_ERRLOG("NVMe trtype %u not available\n", probe_ctx->trid.trtype);
+ return -1;
+ }
+
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+
+ rc = nvme_transport_ctrlr_scan(probe_ctx, direct_connect);
+ if (rc != 0) {
+ SPDK_ERRLOG("NVMe ctrlr scan failed\n");
+ TAILQ_FOREACH_SAFE(ctrlr, &probe_ctx->init_ctrlrs, tailq, ctrlr_tmp) {
+ TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq);
+ nvme_transport_ctrlr_destruct(ctrlr);
+ }
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ return -1;
+ }
+
+ /*
+ * Probe controllers on the shared_attached_ctrlrs list
+ */
+ if (!spdk_process_is_primary() && (probe_ctx->trid.trtype == SPDK_NVME_TRANSPORT_PCIE)) {
+ TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) {
+ /* Do not attach other ctrlrs if the user specified a valid trid */
+ if ((strlen(probe_ctx->trid.traddr) != 0) &&
+ (spdk_nvme_transport_id_compare(&probe_ctx->trid, &ctrlr->trid))) {
+ continue;
+ }
+
+ /* Do not attach if we failed to initialize it in this process */
+ if (nvme_ctrlr_get_current_process(ctrlr) == NULL) {
+ continue;
+ }
+
+ nvme_ctrlr_proc_get_ref(ctrlr);
+
+ /*
+ * Unlock while calling attach_cb() so the user can call other functions
+ * that may take the driver lock, like nvme_detach().
+ */
+ if (probe_ctx->attach_cb) {
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ }
+ }
+ }
+
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+
+ return 0;
+}
+
+static void
+nvme_probe_ctx_init(struct spdk_nvme_probe_ctx *probe_ctx,
+ const struct spdk_nvme_transport_id *trid,
+ void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb,
+ spdk_nvme_attach_cb attach_cb,
+ spdk_nvme_remove_cb remove_cb)
+{
+ probe_ctx->trid = *trid;
+ probe_ctx->cb_ctx = cb_ctx;
+ probe_ctx->probe_cb = probe_cb;
+ probe_ctx->attach_cb = attach_cb;
+ probe_ctx->remove_cb = remove_cb;
+ TAILQ_INIT(&probe_ctx->init_ctrlrs);
+}
+
+int
+spdk_nvme_probe(const struct spdk_nvme_transport_id *trid, void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb,
+ spdk_nvme_remove_cb remove_cb)
+{
+ struct spdk_nvme_transport_id trid_pcie;
+ struct spdk_nvme_probe_ctx *probe_ctx;
+
+ if (trid == NULL) {
+ memset(&trid_pcie, 0, sizeof(trid_pcie));
+ spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
+ trid = &trid_pcie;
+ }
+
+ probe_ctx = spdk_nvme_probe_async(trid, cb_ctx, probe_cb,
+ attach_cb, remove_cb);
+ if (!probe_ctx) {
+ SPDK_ERRLOG("Failed to create probe context\n");
+ return -1;
+ }
+
+ /*
+ * Keep going even if one or more nvme_attach() calls failed;
+ * any error is reported through the return value of nvme_init_controllers().
+ */
+ return nvme_init_controllers(probe_ctx);
+}
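+
+/*
+ * Illustrative sketch (not part of the original code): minimal synchronous
+ * enumeration with spdk_nvme_probe().  example_probe_cb and example_attach_cb
+ * are hypothetical application callbacks, not SPDK symbols.
+ *
+ *	static bool
+ *	example_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ *			 struct spdk_nvme_ctrlr_opts *opts)
+ *	{
+ *		return true;	// attach to every controller that is found
+ *	}
+ *
+ *	static void
+ *	example_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ *			  struct spdk_nvme_ctrlr *ctrlr,
+ *			  const struct spdk_nvme_ctrlr_opts *opts)
+ *	{
+ *		// ctrlr is attached; namespaces and qpairs can be set up here
+ *	}
+ *
+ *	// A NULL trid defaults to scanning the PCIe transport (see above).
+ *	if (spdk_nvme_probe(NULL, NULL, example_probe_cb, example_attach_cb, NULL) != 0) {
+ *		// probe failed
+ *	}
+ */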
+
+static bool
+nvme_connect_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr_opts *opts)
+{
+ struct spdk_nvme_ctrlr_opts *requested_opts = cb_ctx;
+
+ assert(requested_opts);
+ memcpy(opts, requested_opts, sizeof(*opts));
+
+ return true;
+}
+
+static void
+nvme_ctrlr_opts_init(struct spdk_nvme_ctrlr_opts *opts,
+ const struct spdk_nvme_ctrlr_opts *opts_user,
+ size_t opts_size_user)
+{
+ assert(opts);
+ assert(opts_user);
+
+ spdk_nvme_ctrlr_get_default_ctrlr_opts(opts, opts_size_user);
+
+#define FIELD_OK(field) \
+ offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= (opts->opts_size)
+
+ if (FIELD_OK(num_io_queues)) {
+ opts->num_io_queues = opts_user->num_io_queues;
+ }
+
+ if (FIELD_OK(use_cmb_sqs)) {
+ opts->use_cmb_sqs = opts_user->use_cmb_sqs;
+ }
+
+ if (FIELD_OK(no_shn_notification)) {
+ opts->no_shn_notification = opts_user->no_shn_notification;
+ }
+
+ if (FIELD_OK(arb_mechanism)) {
+ opts->arb_mechanism = opts_user->arb_mechanism;
+ }
+
+ if (FIELD_OK(arbitration_burst)) {
+ opts->arbitration_burst = opts_user->arbitration_burst;
+ }
+
+ if (FIELD_OK(low_priority_weight)) {
+ opts->low_priority_weight = opts_user->low_priority_weight;
+ }
+
+ if (FIELD_OK(medium_priority_weight)) {
+ opts->medium_priority_weight = opts_user->medium_priority_weight;
+ }
+
+ if (FIELD_OK(high_priority_weight)) {
+ opts->high_priority_weight = opts_user->high_priority_weight;
+ }
+
+ if (FIELD_OK(keep_alive_timeout_ms)) {
+ opts->keep_alive_timeout_ms = opts_user->keep_alive_timeout_ms;
+ }
+
+ if (FIELD_OK(transport_retry_count)) {
+ opts->transport_retry_count = opts_user->transport_retry_count;
+ }
+
+ if (FIELD_OK(io_queue_size)) {
+ opts->io_queue_size = opts_user->io_queue_size;
+ }
+
+ if (FIELD_OK(hostnqn)) {
+ memcpy(opts->hostnqn, opts_user->hostnqn, sizeof(opts_user->hostnqn));
+ }
+
+ if (FIELD_OK(io_queue_requests)) {
+ opts->io_queue_requests = opts_user->io_queue_requests;
+ }
+
+ if (FIELD_OK(src_addr)) {
+ memcpy(opts->src_addr, opts_user->src_addr, sizeof(opts_user->src_addr));
+ }
+
+ if (FIELD_OK(src_svcid)) {
+ memcpy(opts->src_svcid, opts_user->src_svcid, sizeof(opts_user->src_svcid));
+ }
+
+ if (FIELD_OK(host_id)) {
+ memcpy(opts->host_id, opts_user->host_id, sizeof(opts_user->host_id));
+ }
+ if (FIELD_OK(extended_host_id)) {
+ memcpy(opts->extended_host_id, opts_user->extended_host_id,
+ sizeof(opts_user->extended_host_id));
+ }
+
+ if (FIELD_OK(command_set)) {
+ opts->command_set = opts_user->command_set;
+ }
+
+ if (FIELD_OK(admin_timeout_ms)) {
+ opts->admin_timeout_ms = opts_user->admin_timeout_ms;
+ }
+
+ if (FIELD_OK(header_digest)) {
+ opts->header_digest = opts_user->header_digest;
+ }
+
+ if (FIELD_OK(data_digest)) {
+ opts->data_digest = opts_user->data_digest;
+ }
+
+ if (FIELD_OK(disable_error_logging)) {
+ opts->disable_error_logging = opts_user->disable_error_logging;
+ }
+
+ if (FIELD_OK(transport_ack_timeout)) {
+ opts->transport_ack_timeout = opts_user->transport_ack_timeout;
+ }
+
+ if (FIELD_OK(admin_queue_size)) {
+ opts->admin_queue_size = opts_user->admin_queue_size;
+ }
+#undef FIELD_OK
+}
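+
+/*
+ * Note (not part of the original code): the FIELD_OK() checks above are what
+ * keep this copy ABI-compatible -- a field is copied from the user's structure
+ * only if it lies entirely within the opts_size the caller reported.  For
+ * example, FIELD_OK(admin_queue_size) expands to
+ *
+ *	offsetof(struct spdk_nvme_ctrlr_opts, admin_queue_size) +
+ *	    sizeof(opts->admin_queue_size) <= opts->opts_size
+ *
+ * so a caller built against an older, smaller spdk_nvme_ctrlr_opts simply
+ * keeps the defaults for fields it does not know about.
+ */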
+
+struct spdk_nvme_ctrlr *
+spdk_nvme_connect(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts, size_t opts_size)
+{
+ int rc;
+ struct spdk_nvme_ctrlr *ctrlr = NULL;
+ struct spdk_nvme_probe_ctx *probe_ctx;
+ struct spdk_nvme_ctrlr_opts *opts_local_p = NULL;
+ struct spdk_nvme_ctrlr_opts opts_local;
+
+ if (trid == NULL) {
+ SPDK_ERRLOG("No transport ID specified\n");
+ return NULL;
+ }
+
+ if (opts) {
+ opts_local_p = &opts_local;
+ nvme_ctrlr_opts_init(opts_local_p, opts, opts_size);
+ }
+
+ probe_ctx = spdk_nvme_connect_async(trid, opts_local_p, NULL);
+ if (!probe_ctx) {
+ SPDK_ERRLOG("Failed to create probe context\n");
+ return NULL;
+ }
+
+ rc = nvme_init_controllers(probe_ctx);
+ if (rc != 0) {
+ return NULL;
+ }
+
+ ctrlr = nvme_get_ctrlr_by_trid(trid);
+
+ return ctrlr;
+}
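+
+/*
+ * Illustrative sketch (not part of the original code): attaching a single
+ * controller directly.  The transport ID string and queue size below are
+ * made-up example values.
+ *
+ *	struct spdk_nvme_transport_id trid = {};
+ *	struct spdk_nvme_ctrlr_opts opts;
+ *	struct spdk_nvme_ctrlr *ctrlr;
+ *
+ *	spdk_nvme_transport_id_parse(&trid,
+ *		"trtype:TCP adrfam:IPv4 traddr:192.0.2.10 trsvcid:4420 subnqn:nqn.2016-06.io.spdk:cnode1");
+ *	spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
+ *	opts.io_queue_size = 256;
+ *	ctrlr = spdk_nvme_connect(&trid, &opts, sizeof(opts));
+ *	if (ctrlr == NULL) {
+ *		// scan, connect, or controller initialization failed
+ *	}
+ */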
+
+void
+spdk_nvme_trid_populate_transport(struct spdk_nvme_transport_id *trid,
+ enum spdk_nvme_transport_type trtype)
+{
+ const char *trstring = "";
+
+ trid->trtype = trtype;
+ switch (trtype) {
+ case SPDK_NVME_TRANSPORT_FC:
+ trstring = SPDK_NVME_TRANSPORT_NAME_FC;
+ break;
+ case SPDK_NVME_TRANSPORT_PCIE:
+ trstring = SPDK_NVME_TRANSPORT_NAME_PCIE;
+ break;
+ case SPDK_NVME_TRANSPORT_RDMA:
+ trstring = SPDK_NVME_TRANSPORT_NAME_RDMA;
+ break;
+ case SPDK_NVME_TRANSPORT_TCP:
+ trstring = SPDK_NVME_TRANSPORT_NAME_TCP;
+ break;
+ case SPDK_NVME_TRANSPORT_CUSTOM:
+ default:
+ SPDK_ERRLOG("Custom transports must not use spdk_nvme_trid_populate_transport()\n");
+ assert(0);
+ return;
+ }
+ snprintf(trid->trstring, SPDK_NVMF_TRSTRING_MAX_LEN, "%s", trstring);
+}
+
+int
+spdk_nvme_transport_id_populate_trstring(struct spdk_nvme_transport_id *trid, const char *trstring)
+{
+ int len, i, rc;
+
+ if (trstring == NULL) {
+ return -EINVAL;
+ }
+
+ len = strnlen(trstring, SPDK_NVMF_TRSTRING_MAX_LEN);
+ if (len == SPDK_NVMF_TRSTRING_MAX_LEN) {
+ return -EINVAL;
+ }
+
+ rc = snprintf(trid->trstring, SPDK_NVMF_TRSTRING_MAX_LEN, "%s", trstring);
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* Canonicalize the trstring to uppercase. */
+ for (i = 0; i < len; i++) {
+ trid->trstring[i] = toupper(trid->trstring[i]);
+ }
+ return 0;
+}
+
+int
+spdk_nvme_transport_id_parse_trtype(enum spdk_nvme_transport_type *trtype, const char *str)
+{
+ if (trtype == NULL || str == NULL) {
+ return -EINVAL;
+ }
+
+ if (strcasecmp(str, "PCIe") == 0) {
+ *trtype = SPDK_NVME_TRANSPORT_PCIE;
+ } else if (strcasecmp(str, "RDMA") == 0) {
+ *trtype = SPDK_NVME_TRANSPORT_RDMA;
+ } else if (strcasecmp(str, "FC") == 0) {
+ *trtype = SPDK_NVME_TRANSPORT_FC;
+ } else if (strcasecmp(str, "TCP") == 0) {
+ *trtype = SPDK_NVME_TRANSPORT_TCP;
+ } else {
+ *trtype = SPDK_NVME_TRANSPORT_CUSTOM;
+ }
+ return 0;
+}
+
+const char *
+spdk_nvme_transport_id_trtype_str(enum spdk_nvme_transport_type trtype)
+{
+ switch (trtype) {
+ case SPDK_NVME_TRANSPORT_PCIE:
+ return "PCIe";
+ case SPDK_NVME_TRANSPORT_RDMA:
+ return "RDMA";
+ case SPDK_NVME_TRANSPORT_FC:
+ return "FC";
+ case SPDK_NVME_TRANSPORT_TCP:
+ return "TCP";
+ case SPDK_NVME_TRANSPORT_CUSTOM:
+ return "CUSTOM";
+ default:
+ return NULL;
+ }
+}
+
+int
+spdk_nvme_transport_id_parse_adrfam(enum spdk_nvmf_adrfam *adrfam, const char *str)
+{
+ if (adrfam == NULL || str == NULL) {
+ return -EINVAL;
+ }
+
+ if (strcasecmp(str, "IPv4") == 0) {
+ *adrfam = SPDK_NVMF_ADRFAM_IPV4;
+ } else if (strcasecmp(str, "IPv6") == 0) {
+ *adrfam = SPDK_NVMF_ADRFAM_IPV6;
+ } else if (strcasecmp(str, "IB") == 0) {
+ *adrfam = SPDK_NVMF_ADRFAM_IB;
+ } else if (strcasecmp(str, "FC") == 0) {
+ *adrfam = SPDK_NVMF_ADRFAM_FC;
+ } else {
+ return -ENOENT;
+ }
+ return 0;
+}
+
+const char *
+spdk_nvme_transport_id_adrfam_str(enum spdk_nvmf_adrfam adrfam)
+{
+ switch (adrfam) {
+ case SPDK_NVMF_ADRFAM_IPV4:
+ return "IPv4";
+ case SPDK_NVMF_ADRFAM_IPV6:
+ return "IPv6";
+ case SPDK_NVMF_ADRFAM_IB:
+ return "IB";
+ case SPDK_NVMF_ADRFAM_FC:
+ return "FC";
+ default:
+ return NULL;
+ }
+}
+
+static size_t
+parse_next_key(const char **str, char *key, char *val, size_t key_buf_size, size_t val_buf_size)
+{
+
+ const char *sep, *sep1;
+ const char *whitespace = " \t\n";
+ size_t key_len, val_len;
+
+ *str += strspn(*str, whitespace);
+
+ sep = strchr(*str, ':');
+ if (!sep) {
+ sep = strchr(*str, '=');
+ if (!sep) {
+ SPDK_ERRLOG("Key without ':' or '=' separator\n");
+ return 0;
+ }
+ } else {
+ sep1 = strchr(*str, '=');
+ if ((sep1 != NULL) && (sep1 < sep)) {
+ sep = sep1;
+ }
+ }
+
+ key_len = sep - *str;
+ if (key_len >= key_buf_size) {
+ SPDK_ERRLOG("Key length %zu greater than maximum allowed %zu\n",
+ key_len, key_buf_size - 1);
+ return 0;
+ }
+
+ memcpy(key, *str, key_len);
+ key[key_len] = '\0';
+
+ *str += key_len + 1; /* Skip key: */
+ val_len = strcspn(*str, whitespace);
+ if (val_len == 0) {
+ SPDK_ERRLOG("Key without value\n");
+ return 0;
+ }
+
+ if (val_len >= val_buf_size) {
+ SPDK_ERRLOG("Value length %zu greater than maximum allowed %zu\n",
+ val_len, val_buf_size - 1);
+ return 0;
+ }
+
+ memcpy(val, *str, val_len);
+ val[val_len] = '\0';
+
+ *str += val_len;
+
+ return val_len;
+}
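+
+/*
+ * Worked example (not part of the original code): given the input
+ * "trtype:PCIe traddr=0000:04:00.0", successive parse_next_key() calls return
+ * key="trtype" val="PCIe", then key="traddr" val="0000:04:00.0".  Both ':'
+ * and '=' are accepted as separators, and whichever of the two appears first
+ * in the remaining string is used, so PCI addresses containing ':' parse
+ * correctly.
+ */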
+
+int
+spdk_nvme_transport_id_parse(struct spdk_nvme_transport_id *trid, const char *str)
+{
+ size_t val_len;
+ char key[32];
+ char val[1024];
+
+ if (trid == NULL || str == NULL) {
+ return -EINVAL;
+ }
+
+ while (*str != '\0') {
+
+ val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val));
+
+ if (val_len == 0) {
+ SPDK_ERRLOG("Failed to parse transport ID\n");
+ return -EINVAL;
+ }
+
+ if (strcasecmp(key, "trtype") == 0) {
+ if (spdk_nvme_transport_id_populate_trstring(trid, val) != 0) {
+ SPDK_ERRLOG("invalid transport '%s'\n", val);
+ return -EINVAL;
+ }
+ if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, val) != 0) {
+ SPDK_ERRLOG("Unknown trtype '%s'\n", val);
+ return -EINVAL;
+ }
+ } else if (strcasecmp(key, "adrfam") == 0) {
+ if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, val) != 0) {
+ SPDK_ERRLOG("Unknown adrfam '%s'\n", val);
+ return -EINVAL;
+ }
+ } else if (strcasecmp(key, "traddr") == 0) {
+ if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) {
+ SPDK_ERRLOG("traddr length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_TRADDR_MAX_LEN);
+ return -EINVAL;
+ }
+ memcpy(trid->traddr, val, val_len + 1);
+ } else if (strcasecmp(key, "trsvcid") == 0) {
+ if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) {
+ SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_TRSVCID_MAX_LEN);
+ return -EINVAL;
+ }
+ memcpy(trid->trsvcid, val, val_len + 1);
+ } else if (strcasecmp(key, "priority") == 0) {
+ if (val_len > SPDK_NVMF_PRIORITY_MAX_LEN) {
+ SPDK_ERRLOG("priority length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_PRIORITY_MAX_LEN);
+ return -EINVAL;
+ }
+ trid->priority = spdk_strtol(val, 10);
+ } else if (strcasecmp(key, "subnqn") == 0) {
+ if (val_len > SPDK_NVMF_NQN_MAX_LEN) {
+ SPDK_ERRLOG("subnqn length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_NQN_MAX_LEN);
+ return -EINVAL;
+ }
+ memcpy(trid->subnqn, val, val_len + 1);
+ } else if (strcasecmp(key, "hostaddr") == 0) {
+ continue;
+ } else if (strcasecmp(key, "hostsvcid") == 0) {
+ continue;
+ } else if (strcasecmp(key, "ns") == 0) {
+ /*
+ * Special case. The namespace id parameter may
+ * optionally be passed in the transport id string
+ * for an SPDK application (e.g. nvme/perf)
+ * and parsed there to target a specific
+ * namespace. In that case, silently ignore
+ * this key here rather than logging it as
+ * an invalid key.
+ */
+ continue;
+ } else if (strcasecmp(key, "alt_traddr") == 0) {
+ /*
+ * Used by applications for enabling transport ID failover.
+ * Please see the case above for more information on custom parameters.
+ */
+ continue;
+ } else {
+ SPDK_ERRLOG("Unknown transport ID key '%s'\n", key);
+ }
+ }
+
+ return 0;
+}
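+
+/*
+ * Illustrative sketch (not part of the original code): parsing a PCIe
+ * transport ID.  After the call below, trid.trtype == SPDK_NVME_TRANSPORT_PCIE,
+ * trid.trstring == "PCIE" and trid.traddr == "0000:04:00.0"; unknown keys are
+ * logged, while "ns", "hostaddr", "hostsvcid" and "alt_traddr" are ignored.
+ *
+ *	struct spdk_nvme_transport_id trid = {};
+ *
+ *	spdk_nvme_transport_id_parse(&trid, "trtype:PCIe traddr:0000:04:00.0");
+ */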
+
+int
+spdk_nvme_host_id_parse(struct spdk_nvme_host_id *hostid, const char *str)
+{
+
+ size_t key_size = 32;
+ size_t val_size = 1024;
+ size_t val_len;
+ char key[key_size];
+ char val[val_size];
+
+ if (hostid == NULL || str == NULL) {
+ return -EINVAL;
+ }
+
+ while (*str != '\0') {
+
+ val_len = parse_next_key(&str, key, val, key_size, val_size);
+
+ if (val_len == 0) {
+ SPDK_ERRLOG("Failed to parse host ID\n");
+ return -EINVAL;
+ }
+
+ /* Ignore the rest of the options from the transport ID. */
+ if (strcasecmp(key, "trtype") == 0) {
+ continue;
+ } else if (strcasecmp(key, "adrfam") == 0) {
+ continue;
+ } else if (strcasecmp(key, "traddr") == 0) {
+ continue;
+ } else if (strcasecmp(key, "trsvcid") == 0) {
+ continue;
+ } else if (strcasecmp(key, "subnqn") == 0) {
+ continue;
+ } else if (strcasecmp(key, "priority") == 0) {
+ continue;
+ } else if (strcasecmp(key, "ns") == 0) {
+ continue;
+ } else if (strcasecmp(key, "hostaddr") == 0) {
+ if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) {
+ SPDK_ERRLOG("hostaddr length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_TRADDR_MAX_LEN);
+ return -EINVAL;
+ }
+ memcpy(hostid->hostaddr, val, val_len + 1);
+
+ } else if (strcasecmp(key, "hostsvcid") == 0) {
+ if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) {
+ SPDK_ERRLOG("hostsvcid length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_TRSVCID_MAX_LEN);
+ return -EINVAL;
+ }
+ memcpy(hostid->hostsvcid, val, val_len + 1);
+ } else {
+ SPDK_ERRLOG("Unknown transport ID key '%s'\n", key);
+ }
+ }
+
+ return 0;
+}
+
+static int
+cmp_int(int a, int b)
+{
+ return a - b;
+}
+
+int
+spdk_nvme_transport_id_compare(const struct spdk_nvme_transport_id *trid1,
+ const struct spdk_nvme_transport_id *trid2)
+{
+ int cmp;
+
+ if (trid1->trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
+ cmp = strcasecmp(trid1->trstring, trid2->trstring);
+ } else {
+ cmp = cmp_int(trid1->trtype, trid2->trtype);
+ }
+
+ if (cmp) {
+ return cmp;
+ }
+
+ if (trid1->trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ struct spdk_pci_addr pci_addr1 = {};
+ struct spdk_pci_addr pci_addr2 = {};
+
+ /* Normalize PCI addresses before comparing */
+ if (spdk_pci_addr_parse(&pci_addr1, trid1->traddr) < 0 ||
+ spdk_pci_addr_parse(&pci_addr2, trid2->traddr) < 0) {
+ return -1;
+ }
+
+ /* PCIe transport ID only uses trtype and traddr */
+ return spdk_pci_addr_compare(&pci_addr1, &pci_addr2);
+ }
+
+ cmp = strcasecmp(trid1->traddr, trid2->traddr);
+ if (cmp) {
+ return cmp;
+ }
+
+ cmp = cmp_int(trid1->adrfam, trid2->adrfam);
+ if (cmp) {
+ return cmp;
+ }
+
+ cmp = strcasecmp(trid1->trsvcid, trid2->trsvcid);
+ if (cmp) {
+ return cmp;
+ }
+
+ cmp = strcmp(trid1->subnqn, trid2->subnqn);
+ if (cmp) {
+ return cmp;
+ }
+
+ return 0;
+}
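+
+/*
+ * Note (not part of the original code): because PCIe addresses are normalized
+ * through spdk_pci_addr_parse() before comparison, two PCIe transport IDs whose
+ * traddr strings differ only in formatting (e.g. "0000:04:00.0" vs "04:00.0",
+ * assuming spdk_pci_addr_parse() accepts the domain-less form) still compare
+ * equal.  For other transports, traddr, adrfam, trsvcid and subnqn are
+ * compared textually.
+ */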
+
+int
+spdk_nvme_prchk_flags_parse(uint32_t *prchk_flags, const char *str)
+{
+ size_t val_len;
+ char key[32];
+ char val[1024];
+
+ if (prchk_flags == NULL || str == NULL) {
+ return -EINVAL;
+ }
+
+ while (*str != '\0') {
+ val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val));
+
+ if (val_len == 0) {
+ SPDK_ERRLOG("Failed to parse prchk\n");
+ return -EINVAL;
+ }
+
+ if (strcasecmp(key, "prchk") == 0) {
+ if (strcasestr(val, "reftag") != NULL) {
+ *prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG;
+ }
+ if (strcasestr(val, "guard") != NULL) {
+ *prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD;
+ }
+ } else {
+ SPDK_ERRLOG("Unknown key '%s'\n", key);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+const char *
+spdk_nvme_prchk_flags_str(uint32_t prchk_flags)
+{
+ if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) {
+ if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) {
+ return "prchk:reftag|guard";
+ } else {
+ return "prchk:reftag";
+ }
+ } else {
+ if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) {
+ return "prchk:guard";
+ } else {
+ return NULL;
+ }
+ }
+}
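+
+/*
+ * Illustrative sketch (not part of the original code): round-tripping the
+ * protection-check flags through the two helpers above.
+ *
+ *	uint32_t flags = 0;
+ *
+ *	spdk_nvme_prchk_flags_parse(&flags, "prchk:reftag|guard");
+ *	// flags == (SPDK_NVME_IO_FLAGS_PRCHK_REFTAG | SPDK_NVME_IO_FLAGS_PRCHK_GUARD)
+ *	// spdk_nvme_prchk_flags_str(flags) returns "prchk:reftag|guard"
+ */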
+
+struct spdk_nvme_probe_ctx *
+spdk_nvme_probe_async(const struct spdk_nvme_transport_id *trid,
+ void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb,
+ spdk_nvme_attach_cb attach_cb,
+ spdk_nvme_remove_cb remove_cb)
+{
+ int rc;
+ struct spdk_nvme_probe_ctx *probe_ctx;
+
+ rc = nvme_driver_init();
+ if (rc != 0) {
+ return NULL;
+ }
+
+ probe_ctx = calloc(1, sizeof(*probe_ctx));
+ if (!probe_ctx) {
+ return NULL;
+ }
+
+ nvme_probe_ctx_init(probe_ctx, trid, cb_ctx, probe_cb, attach_cb, remove_cb);
+ rc = nvme_probe_internal(probe_ctx, false);
+ if (rc != 0) {
+ free(probe_ctx);
+ return NULL;
+ }
+
+ return probe_ctx;
+}
+
+int
+spdk_nvme_probe_poll_async(struct spdk_nvme_probe_ctx *probe_ctx)
+{
+ int rc = 0;
+ struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp;
+
+ if (!spdk_process_is_primary() && probe_ctx->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ free(probe_ctx);
+ return 0;
+ }
+
+ TAILQ_FOREACH_SAFE(ctrlr, &probe_ctx->init_ctrlrs, tailq, ctrlr_tmp) {
+ rc = nvme_ctrlr_poll_internal(ctrlr, probe_ctx);
+ if (rc != 0) {
+ rc = -EIO;
+ break;
+ }
+ }
+
+ if (rc != 0 || TAILQ_EMPTY(&probe_ctx->init_ctrlrs)) {
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ g_spdk_nvme_driver->initialized = true;
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ free(probe_ctx);
+ return rc;
+ }
+
+ return -EAGAIN;
+}
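+
+/*
+ * Illustrative sketch (not part of the original code): the asynchronous probe
+ * flow.  example_probe_cb and example_attach_cb are the hypothetical callbacks
+ * sketched near spdk_nvme_probe() above; -EAGAIN means "not finished yet".
+ *
+ *	struct spdk_nvme_transport_id trid = {};
+ *	struct spdk_nvme_probe_ctx *ctx;
+ *	int rc;
+ *
+ *	spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
+ *	ctx = spdk_nvme_probe_async(&trid, NULL, example_probe_cb, example_attach_cb, NULL);
+ *	if (ctx == NULL) {
+ *		return -1;
+ *	}
+ *	do {
+ *		rc = spdk_nvme_probe_poll_async(ctx);
+ *	} while (rc == -EAGAIN);
+ *	// ctx was freed by the final spdk_nvme_probe_poll_async() call
+ */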
+
+struct spdk_nvme_probe_ctx *
+spdk_nvme_connect_async(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ spdk_nvme_attach_cb attach_cb)
+{
+ int rc;
+ spdk_nvme_probe_cb probe_cb = NULL;
+ struct spdk_nvme_probe_ctx *probe_ctx;
+
+ rc = nvme_driver_init();
+ if (rc != 0) {
+ return NULL;
+ }
+
+ probe_ctx = calloc(1, sizeof(*probe_ctx));
+ if (!probe_ctx) {
+ return NULL;
+ }
+
+ if (opts) {
+ probe_cb = nvme_connect_probe_cb;
+ }
+
+ nvme_probe_ctx_init(probe_ctx, trid, (void *)opts, probe_cb, attach_cb, NULL);
+ rc = nvme_probe_internal(probe_ctx, true);
+ if (rc != 0) {
+ free(probe_ctx);
+ return NULL;
+ }
+
+ return probe_ctx;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("nvme", SPDK_LOG_NVME)
diff --git a/src/spdk/lib/nvme/nvme_ctrlr.c b/src/spdk/lib/nvme/nvme_ctrlr.c
new file mode 100644
index 000000000..ced02e9bb
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ctrlr.c
@@ -0,0 +1,3639 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "nvme_internal.h"
+#include "nvme_io_msg.h"
+
+#include "spdk/env.h"
+#include "spdk/string.h"
+
+struct nvme_active_ns_ctx;
+
+static void nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr);
+static int nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_async_event_request *aer);
+static void nvme_ctrlr_identify_active_ns_async(struct nvme_active_ns_ctx *ctx);
+static int nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns);
+static int nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns);
+
+static int
+nvme_ctrlr_get_cc(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cc_register *cc)
+{
+ return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw),
+ &cc->raw);
+}
+
+static int
+nvme_ctrlr_get_csts(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_csts_register *csts)
+{
+ return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, csts.raw),
+ &csts->raw);
+}
+
+int
+nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap)
+{
+ return nvme_transport_ctrlr_get_reg_8(ctrlr, offsetof(struct spdk_nvme_registers, cap.raw),
+ &cap->raw);
+}
+
+int
+nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs)
+{
+ return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, vs.raw),
+ &vs->raw);
+}
+
+static int
+nvme_ctrlr_set_cc(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cc_register *cc)
+{
+ return nvme_transport_ctrlr_set_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw),
+ cc->raw);
+}
+
+int
+nvme_ctrlr_get_cmbsz(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cmbsz_register *cmbsz)
+{
+ return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
+ &cmbsz->raw);
+}
+
+/* When a field in spdk_nvme_ctrlr_opts is changed and this function is updated, please
+ * also update the nvme_ctrlr_opts_init function in nvme.c
+ */
+void
+spdk_nvme_ctrlr_get_default_ctrlr_opts(struct spdk_nvme_ctrlr_opts *opts, size_t opts_size)
+{
+ char host_id_str[SPDK_UUID_STRING_LEN];
+
+ assert(opts);
+
+ opts->opts_size = opts_size;
+
+#define FIELD_OK(field) \
+ offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= opts_size
+
+ if (FIELD_OK(num_io_queues)) {
+ opts->num_io_queues = DEFAULT_MAX_IO_QUEUES;
+ }
+
+ if (FIELD_OK(use_cmb_sqs)) {
+ opts->use_cmb_sqs = true;
+ }
+
+ if (FIELD_OK(no_shn_notification)) {
+ opts->no_shn_notification = false;
+ }
+
+ if (FIELD_OK(arb_mechanism)) {
+ opts->arb_mechanism = SPDK_NVME_CC_AMS_RR;
+ }
+
+ if (FIELD_OK(arbitration_burst)) {
+ opts->arbitration_burst = 0;
+ }
+
+ if (FIELD_OK(low_priority_weight)) {
+ opts->low_priority_weight = 0;
+ }
+
+ if (FIELD_OK(medium_priority_weight)) {
+ opts->medium_priority_weight = 0;
+ }
+
+ if (FIELD_OK(high_priority_weight)) {
+ opts->high_priority_weight = 0;
+ }
+
+ if (FIELD_OK(keep_alive_timeout_ms)) {
+ opts->keep_alive_timeout_ms = MIN_KEEP_ALIVE_TIMEOUT_IN_MS;
+ }
+
+ if (FIELD_OK(transport_retry_count)) {
+ opts->transport_retry_count = SPDK_NVME_DEFAULT_RETRY_COUNT;
+ }
+
+ if (FIELD_OK(io_queue_size)) {
+ opts->io_queue_size = DEFAULT_IO_QUEUE_SIZE;
+ }
+
+ if (nvme_driver_init() == 0) {
+ if (FIELD_OK(hostnqn)) {
+ spdk_uuid_fmt_lower(host_id_str, sizeof(host_id_str),
+ &g_spdk_nvme_driver->default_extended_host_id);
+ snprintf(opts->hostnqn, sizeof(opts->hostnqn), "2014-08.org.nvmexpress:uuid:%s", host_id_str);
+ }
+
+ if (FIELD_OK(extended_host_id)) {
+ memcpy(opts->extended_host_id, &g_spdk_nvme_driver->default_extended_host_id,
+ sizeof(opts->extended_host_id));
+ }
+
+ }
+
+ if (FIELD_OK(io_queue_requests)) {
+ opts->io_queue_requests = DEFAULT_IO_QUEUE_REQUESTS;
+ }
+
+ if (FIELD_OK(src_addr)) {
+ memset(opts->src_addr, 0, sizeof(opts->src_addr));
+ }
+
+ if (FIELD_OK(src_svcid)) {
+ memset(opts->src_svcid, 0, sizeof(opts->src_svcid));
+ }
+
+ if (FIELD_OK(host_id)) {
+ memset(opts->host_id, 0, sizeof(opts->host_id));
+ }
+
+ if (FIELD_OK(command_set)) {
+ opts->command_set = SPDK_NVME_CC_CSS_NVM;
+ }
+
+ if (FIELD_OK(admin_timeout_ms)) {
+ opts->admin_timeout_ms = NVME_MAX_ADMIN_TIMEOUT_IN_SECS * 1000;
+ }
+
+ if (FIELD_OK(header_digest)) {
+ opts->header_digest = false;
+ }
+
+ if (FIELD_OK(data_digest)) {
+ opts->data_digest = false;
+ }
+
+ if (FIELD_OK(disable_error_logging)) {
+ opts->disable_error_logging = false;
+ }
+
+ if (FIELD_OK(transport_ack_timeout)) {
+ opts->transport_ack_timeout = SPDK_NVME_DEFAULT_TRANSPORT_ACK_TIMEOUT;
+ }
+
+ if (FIELD_OK(admin_queue_size)) {
+ opts->admin_queue_size = DEFAULT_ADMIN_QUEUE_SIZE;
+ }
+#undef FIELD_OK
+}
+
+/**
+ * This function will be called when the process allocates the IO qpair.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_ctrlr_proc_add_io_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ TAILQ_INSERT_TAIL(&active_proc->allocated_io_qpairs, qpair, per_process_tailq);
+ qpair->active_proc = active_proc;
+ }
+}
+
+/**
+ * This function will be called when the process frees the IO qpair.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_ctrlr_proc_remove_io_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvme_qpair *active_qpair, *tmp_qpair;
+
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (!active_proc) {
+ return;
+ }
+
+ TAILQ_FOREACH_SAFE(active_qpair, &active_proc->allocated_io_qpairs,
+ per_process_tailq, tmp_qpair) {
+ if (active_qpair == qpair) {
+ TAILQ_REMOVE(&active_proc->allocated_io_qpairs,
+ active_qpair, per_process_tailq);
+
+ break;
+ }
+ }
+}
+
+void
+spdk_nvme_ctrlr_get_default_io_qpair_opts(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_io_qpair_opts *opts,
+ size_t opts_size)
+{
+ assert(ctrlr);
+
+ assert(opts);
+
+ memset(opts, 0, opts_size);
+
+#define FIELD_OK(field) \
+ offsetof(struct spdk_nvme_io_qpair_opts, field) + sizeof(opts->field) <= opts_size
+
+ if (FIELD_OK(qprio)) {
+ opts->qprio = SPDK_NVME_QPRIO_URGENT;
+ }
+
+ if (FIELD_OK(io_queue_size)) {
+ opts->io_queue_size = ctrlr->opts.io_queue_size;
+ }
+
+ if (FIELD_OK(io_queue_requests)) {
+ opts->io_queue_requests = ctrlr->opts.io_queue_requests;
+ }
+
+ if (FIELD_OK(delay_cmd_submit)) {
+ opts->delay_cmd_submit = false;
+ }
+
+ if (FIELD_OK(sq.vaddr)) {
+ opts->sq.vaddr = NULL;
+ }
+
+ if (FIELD_OK(sq.paddr)) {
+ opts->sq.paddr = 0;
+ }
+
+ if (FIELD_OK(sq.buffer_size)) {
+ opts->sq.buffer_size = 0;
+ }
+
+ if (FIELD_OK(cq.vaddr)) {
+ opts->cq.vaddr = NULL;
+ }
+
+ if (FIELD_OK(cq.paddr)) {
+ opts->cq.paddr = 0;
+ }
+
+ if (FIELD_OK(cq.buffer_size)) {
+ opts->cq.buffer_size = 0;
+ }
+
+ if (FIELD_OK(create_only)) {
+ opts->create_only = false;
+ }
+
+#undef FIELD_OK
+}
+
+static struct spdk_nvme_qpair *
+nvme_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ uint32_t qid;
+ struct spdk_nvme_qpair *qpair;
+ union spdk_nvme_cc_register cc;
+
+ if (!ctrlr) {
+ return NULL;
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ if (nvme_ctrlr_get_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("get_cc failed\n");
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return NULL;
+ }
+
+ if (opts->qprio & ~SPDK_NVME_CREATE_IO_SQ_QPRIO_MASK) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return NULL;
+ }
+
+ /*
+ * Only the value SPDK_NVME_QPRIO_URGENT (0) is valid with the
+ * default round-robin arbitration method.
+ */
+ if ((cc.bits.ams == SPDK_NVME_CC_AMS_RR) && (opts->qprio != SPDK_NVME_QPRIO_URGENT)) {
+ SPDK_ERRLOG("invalid queue priority for default round robin arbitration method\n");
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return NULL;
+ }
+
+ /*
+ * Get the first available I/O queue ID.
+ */
+ qid = spdk_bit_array_find_first_set(ctrlr->free_io_qids, 1);
+ if (qid > ctrlr->opts.num_io_queues) {
+ SPDK_ERRLOG("No free I/O queue IDs\n");
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return NULL;
+ }
+
+ qpair = nvme_transport_ctrlr_create_io_qpair(ctrlr, qid, opts);
+ if (qpair == NULL) {
+ SPDK_ERRLOG("nvme_transport_ctrlr_create_io_qpair() failed\n");
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return NULL;
+ }
+
+ spdk_bit_array_clear(ctrlr->free_io_qids, qid);
+ TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq);
+
+ nvme_ctrlr_proc_add_io_qpair(qpair);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return qpair;
+}
+
+int
+spdk_nvme_ctrlr_connect_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ int rc;
+
+ if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) {
+ return -EISCONN;
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ if (ctrlr->quirks & NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC) {
+ spdk_delay_us(100);
+ }
+
+ return rc;
+}
+
+void
+spdk_nvme_ctrlr_disconnect_io_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+struct spdk_nvme_qpair *
+spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ const struct spdk_nvme_io_qpair_opts *user_opts,
+ size_t opts_size)
+{
+
+ struct spdk_nvme_qpair *qpair;
+ struct spdk_nvme_io_qpair_opts opts;
+ int rc;
+
+ /*
+ * Get the default options, then overwrite them with the user-provided options
+ * up to opts_size.
+ *
+ * This allows for extensions of the opts structure without breaking
+ * ABI compatibility.
+ */
+ spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
+ if (user_opts) {
+ memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size));
+
+ /* If user passes buffers, make sure they're big enough for the requested queue size */
+ if (opts.sq.vaddr) {
+ if (opts.sq.buffer_size < (opts.io_queue_size * sizeof(struct spdk_nvme_cmd))) {
+ SPDK_ERRLOG("sq buffer size %lx is too small for sq size %lx\n",
+ opts.sq.buffer_size, (opts.io_queue_size * sizeof(struct spdk_nvme_cmd)));
+ return NULL;
+ }
+ }
+ if (opts.cq.vaddr) {
+ if (opts.cq.buffer_size < (opts.io_queue_size * sizeof(struct spdk_nvme_cpl))) {
+ SPDK_ERRLOG("cq buffer size %lx is too small for cq size %lx\n",
+ opts.cq.buffer_size, (opts.io_queue_size * sizeof(struct spdk_nvme_cpl)));
+ return NULL;
+ }
+ }
+ }
+
+ qpair = nvme_ctrlr_create_io_qpair(ctrlr, &opts);
+
+ if (qpair == NULL || opts.create_only == true) {
+ return qpair;
+ }
+
+ rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
+ if (rc != 0) {
+ SPDK_ERRLOG("nvme_transport_ctrlr_connect_io_qpair() failed\n");
+ nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair);
+ return NULL;
+ }
+
+ return qpair;
+}
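+
+/*
+ * Illustrative sketch (not part of the original code): allocating an I/O qpair
+ * with a non-default queue depth.  With create_only left false, the qpair is
+ * also connected before it is returned.
+ *
+ *	struct spdk_nvme_io_qpair_opts qp_opts;
+ *	struct spdk_nvme_qpair *qpair;
+ *
+ *	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &qp_opts, sizeof(qp_opts));
+ *	qp_opts.io_queue_size = 256;
+ *	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &qp_opts, sizeof(qp_opts));
+ *	if (qpair == NULL) {
+ *		// no free queue ID, invalid priority, or connect failure
+ *	}
+ */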
+
+int
+spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+ enum nvme_qpair_state qpair_state;
+ int rc;
+
+ assert(qpair != NULL);
+ assert(nvme_qpair_is_admin_queue(qpair) == false);
+ assert(qpair->ctrlr != NULL);
+
+ ctrlr = qpair->ctrlr;
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ qpair_state = nvme_qpair_get_state(qpair);
+
+ if (ctrlr->is_removed) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ if (ctrlr->is_resetting || qpair_state == NVME_QPAIR_DISCONNECTING) {
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ if (ctrlr->is_failed || qpair_state == NVME_QPAIR_DESTROYING) {
+ rc = -ENXIO;
+ goto out;
+ }
+
+ if (qpair_state != NVME_QPAIR_DISCONNECTED) {
+ rc = 0;
+ goto out;
+ }
+
+ rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair);
+ if (rc) {
+ rc = -EAGAIN;
+ goto out;
+ }
+
+out:
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+spdk_nvme_qp_failure_reason
+spdk_nvme_ctrlr_get_admin_qp_failure_reason(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->adminq->transport_failure_reason;
+}
+
+/*
+ * This internal function will attempt to take the controller
+ * lock before calling disconnect on a controller qpair.
+ * Functions already holding the controller lock should
+ * call nvme_transport_ctrlr_disconnect_qpair directly.
+ */
+void
+nvme_ctrlr_disconnect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+
+ assert(ctrlr != NULL);
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+int
+spdk_nvme_ctrlr_free_io_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ if (qpair == NULL) {
+ return 0;
+ }
+
+ ctrlr = qpair->ctrlr;
+
+ if (qpair->in_completion_context) {
+ /*
+ * There are many cases where it is convenient to delete an io qpair in the context
+ * of that qpair's completion routine. To handle this properly, set a flag here
+ * so that the completion routine will perform an actual delete after the context
+ * unwinds.
+ */
+ qpair->delete_after_completion_context = 1;
+ return 0;
+ }
+
+ if (qpair->poll_group && qpair->poll_group->in_completion_context) {
+ /* Same as above, but in a poll group. */
+ qpair->poll_group->num_qpairs_to_delete++;
+ qpair->delete_after_completion_context = 1;
+ return 0;
+ }
+
+ if (qpair->poll_group) {
+ spdk_nvme_poll_group_remove(qpair->poll_group->group, qpair);
+ }
+
+ /* Do not retry. */
+ nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING);
+
+ /* In the multi-process case, a process may call this function on a foreign
+ * I/O qpair (i.e. one that this process did not create) when that qpair's process
+ * exits unexpectedly. In that case, we must not try to abort any reqs associated
+ * with that qpair, since the callbacks will also be foreign to this process.
+ */
+ if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) {
+ nvme_qpair_abort_reqs(qpair, 1);
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ nvme_ctrlr_proc_remove_io_qpair(qpair);
+
+ TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq);
+ spdk_bit_array_set(ctrlr->free_io_qids, qpair->id);
+
+ if (nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair)) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -1;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return 0;
+}
+
+static void
+nvme_ctrlr_construct_intel_support_log_page_list(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_intel_log_page_directory *log_page_directory)
+{
+ if (log_page_directory == NULL) {
+ return;
+ }
+
+ if (ctrlr->cdata.vid != SPDK_PCI_VID_INTEL) {
+ return;
+ }
+
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY] = true;
+
+ if (log_page_directory->read_latency_log_len ||
+ (ctrlr->quirks & NVME_INTEL_QUIRK_READ_LATENCY)) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY] = true;
+ }
+ if (log_page_directory->write_latency_log_len ||
+ (ctrlr->quirks & NVME_INTEL_QUIRK_WRITE_LATENCY)) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY] = true;
+ }
+ if (log_page_directory->temperature_statistics_log_len) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_TEMPERATURE] = true;
+ }
+ if (log_page_directory->smart_log_len) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_SMART] = true;
+ }
+ if (log_page_directory->marketing_description_log_len) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_MARKETING_DESCRIPTION] = true;
+ }
+}
+
+static int nvme_ctrlr_set_intel_support_log_pages(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+ struct nvme_completion_poll_status *status;
+ struct spdk_nvme_intel_log_page_directory *log_page_directory;
+
+ log_page_directory = spdk_zmalloc(sizeof(struct spdk_nvme_intel_log_page_directory),
+ 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (log_page_directory == NULL) {
+ SPDK_ERRLOG("could not allocate log_page_directory\n");
+ return -ENXIO;
+ }
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ spdk_free(log_page_directory);
+ return -ENOMEM;
+ }
+
+ rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY,
+ SPDK_NVME_GLOBAL_NS_TAG, log_page_directory,
+ sizeof(struct spdk_nvme_intel_log_page_directory),
+ 0, nvme_completion_poll_cb, status);
+ if (rc != 0) {
+ spdk_free(log_page_directory);
+ free(status);
+ return rc;
+ }
+
+ if (nvme_wait_for_completion_timeout(ctrlr->adminq, status,
+ ctrlr->opts.admin_timeout_ms / 1000)) {
+ spdk_free(log_page_directory);
+ SPDK_WARNLOG("Intel log pages not supported on Intel drive!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return 0;
+ }
+
+ nvme_ctrlr_construct_intel_support_log_page_list(ctrlr, log_page_directory);
+ spdk_free(log_page_directory);
+ free(status);
+ return 0;
+}
+
+static int
+nvme_ctrlr_set_supported_log_pages(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+
+ memset(ctrlr->log_page_supported, 0, sizeof(ctrlr->log_page_supported));
+ /* Mandatory pages */
+ ctrlr->log_page_supported[SPDK_NVME_LOG_ERROR] = true;
+ ctrlr->log_page_supported[SPDK_NVME_LOG_HEALTH_INFORMATION] = true;
+ ctrlr->log_page_supported[SPDK_NVME_LOG_FIRMWARE_SLOT] = true;
+ if (ctrlr->cdata.lpa.celp) {
+ ctrlr->log_page_supported[SPDK_NVME_LOG_COMMAND_EFFECTS_LOG] = true;
+ }
+ if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL && !(ctrlr->quirks & NVME_INTEL_QUIRK_NO_LOG_PAGES)) {
+ rc = nvme_ctrlr_set_intel_support_log_pages(ctrlr);
+ }
+
+ return rc;
+}
+
+static void
+nvme_ctrlr_set_intel_supported_features(struct spdk_nvme_ctrlr *ctrlr)
+{
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_MAX_LBA] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_NATIVE_MAX_LBA] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_POWER_GOVERNOR_SETTING] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_SMBUS_ADDRESS] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LED_PATTERN] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_RESET_TIMED_WORKLOAD_COUNTERS] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING] = true;
+}
+
+static void
+nvme_ctrlr_set_arbitration_feature(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint32_t cdw11;
+ struct nvme_completion_poll_status *status;
+
+ if (ctrlr->opts.arbitration_burst == 0) {
+ return;
+ }
+
+ if (ctrlr->opts.arbitration_burst > 7) {
+ SPDK_WARNLOG("Valid arbitration burst values are from 0 to 7\n");
+ return;
+ }
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return;
+ }
+
+ cdw11 = ctrlr->opts.arbitration_burst;
+
+ if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_WRR_SUPPORTED) {
+ cdw11 |= (uint32_t)ctrlr->opts.low_priority_weight << 8;
+ cdw11 |= (uint32_t)ctrlr->opts.medium_priority_weight << 16;
+ cdw11 |= (uint32_t)ctrlr->opts.high_priority_weight << 24;
+ }
+
+ if (spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ARBITRATION,
+ cdw11, 0, NULL, 0,
+ nvme_completion_poll_cb, status) < 0) {
+ SPDK_ERRLOG("Set arbitration feature failed\n");
+ free(status);
+ return;
+ }
+
+ if (nvme_wait_for_completion_timeout(ctrlr->adminq, status,
+ ctrlr->opts.admin_timeout_ms / 1000)) {
+ SPDK_ERRLOG("Timed out setting the arbitration feature\n");
+ }
+
+ if (!status->timed_out) {
+ free(status);
+ }
+}
+
+static void
+nvme_ctrlr_set_supported_features(struct spdk_nvme_ctrlr *ctrlr)
+{
+ memset(ctrlr->feature_supported, 0, sizeof(ctrlr->feature_supported));
+ /* Mandatory features */
+ ctrlr->feature_supported[SPDK_NVME_FEAT_ARBITRATION] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_POWER_MANAGEMENT] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_ERROR_RECOVERY] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_NUMBER_OF_QUEUES] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_COALESCING] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_WRITE_ATOMICITY] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION] = true;
+ /* Optional features */
+ if (ctrlr->cdata.vwc.present) {
+ ctrlr->feature_supported[SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE] = true;
+ }
+ if (ctrlr->cdata.apsta.supported) {
+ ctrlr->feature_supported[SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION] = true;
+ }
+ if (ctrlr->cdata.hmpre) {
+ ctrlr->feature_supported[SPDK_NVME_FEAT_HOST_MEM_BUFFER] = true;
+ }
+ if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL) {
+ nvme_ctrlr_set_intel_supported_features(ctrlr);
+ }
+
+ nvme_ctrlr_set_arbitration_feature(ctrlr);
+}
+
+bool
+spdk_nvme_ctrlr_is_failed(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->is_failed;
+}
+
+void
+nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove)
+{
+ /*
+ * Set the flag here and leave the actual failing of the qpairs to
+ * spdk_nvme_qpair_process_completions().
+ */
+ if (hot_remove) {
+ ctrlr->is_removed = true;
+ }
+ ctrlr->is_failed = true;
+ nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq);
+ SPDK_ERRLOG("ctrlr %s in failed state.\n", ctrlr->trid.traddr);
+}
+
+/**
+ * This public API function will try to take the controller lock.
+ * Any private functions being called from a thread already holding
+ * the ctrlr lock should call nvme_ctrlr_fail directly.
+ */
+void
+spdk_nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr)
+{
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ nvme_ctrlr_fail(ctrlr, false);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+static void
+nvme_ctrlr_shutdown(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_cc_register cc;
+ union spdk_nvme_csts_register csts;
+ uint32_t ms_waited = 0;
+ uint32_t shutdown_timeout_ms;
+
+ if (ctrlr->is_removed) {
+ return;
+ }
+
+ if (nvme_ctrlr_get_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("ctrlr %s get_cc() failed\n", ctrlr->trid.traddr);
+ return;
+ }
+
+ cc.bits.shn = SPDK_NVME_SHN_NORMAL;
+
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("ctrlr %s set_cc() failed\n", ctrlr->trid.traddr);
+ return;
+ }
+
+ /*
+ * The NVMe specification defines RTD3E to be the time between
+ * writing CC.SHN = 01b and the controller setting CSTS.SHST = 10b.
+ * If the device doesn't report RTD3 entry latency, or if it
+ * reports RTD3 entry latency less than 10 seconds, pick
+ * 10 seconds as a reasonable amount of time to
+ * wait before proceeding.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "RTD3E = %" PRIu32 " us\n", ctrlr->cdata.rtd3e);
+ shutdown_timeout_ms = (ctrlr->cdata.rtd3e + 999) / 1000;
+ shutdown_timeout_ms = spdk_max(shutdown_timeout_ms, 10000);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "shutdown timeout = %" PRIu32 " ms\n", shutdown_timeout_ms);
+
+ do {
+ if (nvme_ctrlr_get_csts(ctrlr, &csts)) {
+ SPDK_ERRLOG("ctrlr %s get_csts() failed\n", ctrlr->trid.traddr);
+ return;
+ }
+
+ if (csts.bits.shst == SPDK_NVME_SHST_COMPLETE) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "ctrlr %s shutdown complete in %u milliseconds\n",
+ ctrlr->trid.traddr, ms_waited);
+ return;
+ }
+
+ nvme_delay(1000);
+ ms_waited++;
+ } while (ms_waited < shutdown_timeout_ms);
+
+ SPDK_ERRLOG("ctrlr %s did not shutdown within %u milliseconds\n",
+ ctrlr->trid.traddr, shutdown_timeout_ms);
+ if (ctrlr->quirks & NVME_QUIRK_SHST_COMPLETE) {
+ SPDK_ERRLOG("likely due to the shutdown handling in the VMware emulated NVMe SSD\n");
+ }
+}
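+
+/*
+ * Worked example (not part of the original code) for the timeout math above:
+ * a controller reporting RTD3E = 8,000,000 us yields
+ * (8,000,000 + 999) / 1000 = 8000 ms, which spdk_max() raises to the
+ * 10,000 ms floor, while a controller reporting RTD3E = 25,000,000 us is
+ * given up to 25,000 ms for CSTS.SHST to reach "shutdown complete".
+ */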
+
+static int
+nvme_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_cc_register cc;
+ int rc;
+
+ rc = nvme_transport_ctrlr_enable(ctrlr);
+ if (rc != 0) {
+ SPDK_ERRLOG("transport ctrlr_enable failed\n");
+ return rc;
+ }
+
+ if (nvme_ctrlr_get_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("get_cc() failed\n");
+ return -EIO;
+ }
+
+ if (cc.bits.en != 0) {
+ SPDK_ERRLOG("called with CC.EN = 1\n");
+ return -EINVAL;
+ }
+
+ cc.bits.en = 1;
+ cc.bits.css = 0;
+ cc.bits.shn = 0;
+ cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
+ cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */
+
+ /* Page size is 2 ^ (12 + mps). */
+ cc.bits.mps = spdk_u32log2(ctrlr->page_size) - 12;
+
+ if (ctrlr->cap.bits.css == 0) {
+ SPDK_INFOLOG(SPDK_LOG_NVME,
+ "Drive reports no command sets supported. Assuming NVM is supported.\n");
+ ctrlr->cap.bits.css = SPDK_NVME_CAP_CSS_NVM;
+ }
+
+ if (!(ctrlr->cap.bits.css & (1u << ctrlr->opts.command_set))) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested I/O command set %u but supported mask is 0x%x\n",
+ ctrlr->opts.command_set, ctrlr->cap.bits.css);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Falling back to NVM. Assuming NVM is supported.\n");
+ ctrlr->opts.command_set = SPDK_NVME_CC_CSS_NVM;
+ }
+
+ cc.bits.css = ctrlr->opts.command_set;
+
+ switch (ctrlr->opts.arb_mechanism) {
+ case SPDK_NVME_CC_AMS_RR:
+ break;
+ case SPDK_NVME_CC_AMS_WRR:
+ if (SPDK_NVME_CAP_AMS_WRR & ctrlr->cap.bits.ams) {
+ break;
+ }
+ return -EINVAL;
+ case SPDK_NVME_CC_AMS_VS:
+ if (SPDK_NVME_CAP_AMS_VS & ctrlr->cap.bits.ams) {
+ break;
+ }
+ return -EINVAL;
+ default:
+ return -EINVAL;
+ }
+
+ cc.bits.ams = ctrlr->opts.arb_mechanism;
+
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("set_cc() failed\n");
+ return -EIO;
+ }
+
+ return 0;
+}
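+
+/*
+ * Worked example (not part of the original code) for the CC programming above:
+ * with a 4096-byte ctrlr->page_size, spdk_u32log2(4096) - 12 = 0, so CC.MPS is
+ * written as 0 (page size = 2^(12 + 0) = 4096).  CC.IOSQES = 6 and
+ * CC.IOCQES = 4 select the standard 64-byte submission and 16-byte completion
+ * queue entry sizes.
+ */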
+
+static int
+nvme_ctrlr_disable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_cc_register cc;
+
+ if (nvme_ctrlr_get_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("get_cc() failed\n");
+ return -EIO;
+ }
+
+ if (cc.bits.en == 0) {
+ return 0;
+ }
+
+ cc.bits.en = 0;
+
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("set_cc() failed\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+#ifdef DEBUG
+static const char *
+nvme_ctrlr_state_string(enum nvme_ctrlr_state state)
+{
+ switch (state) {
+ case NVME_CTRLR_STATE_INIT_DELAY:
+ return "delay init";
+ case NVME_CTRLR_STATE_INIT:
+ return "init";
+ case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1:
+ return "disable and wait for CSTS.RDY = 1";
+ case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0:
+ return "disable and wait for CSTS.RDY = 0";
+ case NVME_CTRLR_STATE_ENABLE:
+ return "enable controller by writing CC.EN = 1";
+ case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1:
+ return "wait for CSTS.RDY = 1";
+ case NVME_CTRLR_STATE_RESET_ADMIN_QUEUE:
+ return "reset admin queue";
+ case NVME_CTRLR_STATE_IDENTIFY:
+ return "identify controller";
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY:
+ return "wait for identify controller";
+ case NVME_CTRLR_STATE_SET_NUM_QUEUES:
+ return "set number of queues";
+ case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES:
+ return "wait for set number of queues";
+ case NVME_CTRLR_STATE_CONSTRUCT_NS:
+ return "construct namespaces";
+ case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS:
+ return "identify active ns";
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS:
+ return "wait for identify active ns";
+ case NVME_CTRLR_STATE_IDENTIFY_NS:
+ return "identify ns";
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS:
+ return "wait for identify ns";
+ case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS:
+ return "identify namespace id descriptors";
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS:
+ return "wait for identify namespace id descriptors";
+ case NVME_CTRLR_STATE_CONFIGURE_AER:
+ return "configure AER";
+ case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER:
+ return "wait for configure AER";
+ case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES:
+ return "set supported log pages";
+ case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES:
+ return "set supported features";
+ case NVME_CTRLR_STATE_SET_DB_BUF_CFG:
+ return "set doorbell buffer config";
+ case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG:
+ return "wait for doorbell buffer config";
+ case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT:
+ return "set keep alive timeout";
+ case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT:
+ return "wait for set keep alive timeout";
+ case NVME_CTRLR_STATE_SET_HOST_ID:
+ return "set host ID";
+ case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID:
+ return "wait for set host ID";
+ case NVME_CTRLR_STATE_READY:
+ return "ready";
+ case NVME_CTRLR_STATE_ERROR:
+ return "error";
+ }
+ return "unknown";
+}
+#endif /* DEBUG */
+
+static void
+nvme_ctrlr_set_state(struct spdk_nvme_ctrlr *ctrlr, enum nvme_ctrlr_state state,
+ uint64_t timeout_in_ms)
+{
+ uint64_t ticks_per_ms, timeout_in_ticks, now_ticks;
+
+ ctrlr->state = state;
+ if (timeout_in_ms == NVME_TIMEOUT_INFINITE) {
+ goto inf;
+ }
+
+ ticks_per_ms = spdk_get_ticks_hz() / 1000;
+ if (timeout_in_ms > UINT64_MAX / ticks_per_ms) {
+ SPDK_ERRLOG("Specified timeout would cause integer overflow. Defaulting to no timeout.\n");
+ goto inf;
+ }
+
+ now_ticks = spdk_get_ticks();
+ timeout_in_ticks = timeout_in_ms * ticks_per_ms;
+ if (timeout_in_ticks > UINT64_MAX - now_ticks) {
+ SPDK_ERRLOG("Specified timeout would cause integer overflow. Defaulting to no timeout.\n");
+ goto inf;
+ }
+
+ ctrlr->state_timeout_tsc = timeout_in_ticks + now_ticks;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (timeout %" PRIu64 " ms)\n",
+ nvme_ctrlr_state_string(ctrlr->state), timeout_in_ms);
+ return;
+inf:
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (no timeout)\n",
+ nvme_ctrlr_state_string(ctrlr->state));
+ ctrlr->state_timeout_tsc = NVME_TIMEOUT_INFINITE;
+}
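+
+/*
+ * Worked example (not part of the original code) for the conversion above:
+ * with spdk_get_ticks_hz() = 2,000,000,000 (a 2 GHz tick source),
+ * ticks_per_ms is 2,000,000, so a hypothetical 30,000 ms timeout becomes a
+ * deadline of spdk_get_ticks() + 60,000,000,000 ticks.  NVME_TIMEOUT_INFINITE
+ * skips the conversion entirely.
+ */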
+
+static void
+nvme_ctrlr_free_doorbell_buffer(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr->shadow_doorbell) {
+ spdk_free(ctrlr->shadow_doorbell);
+ ctrlr->shadow_doorbell = NULL;
+ }
+
+ if (ctrlr->eventidx) {
+ spdk_free(ctrlr->eventidx);
+ ctrlr->eventidx = NULL;
+ }
+}
+
+static void
+nvme_ctrlr_set_doorbell_buffer_config_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_WARNLOG("Doorbell buffer config failed\n");
+ } else {
+ SPDK_INFOLOG(SPDK_LOG_NVME, "NVMe controller: %s doorbell buffer config enabled\n",
+ ctrlr->trid.traddr);
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT,
+ ctrlr->opts.admin_timeout_ms);
+}
+
+static int
+nvme_ctrlr_set_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+ uint64_t prp1, prp2, len;
+
+ if (!ctrlr->cdata.oacs.doorbell_buffer_config) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT,
+ ctrlr->opts.admin_timeout_ms);
+ return 0;
+ }
+
+ if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT,
+ ctrlr->opts.admin_timeout_ms);
+ return 0;
+ }
+
+ /* Each doorbell buffer (shadow doorbells and EventIdx) is a single page. */
+ ctrlr->shadow_doorbell = spdk_zmalloc(ctrlr->page_size, ctrlr->page_size,
+ NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE);
+ if (ctrlr->shadow_doorbell == NULL) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ len = ctrlr->page_size;
+ prp1 = spdk_vtophys(ctrlr->shadow_doorbell, &len);
+ if (prp1 == SPDK_VTOPHYS_ERROR || len != ctrlr->page_size) {
+ rc = -EFAULT;
+ goto error;
+ }
+
+ ctrlr->eventidx = spdk_zmalloc(ctrlr->page_size, ctrlr->page_size,
+ NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE);
+ if (ctrlr->eventidx == NULL) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ len = ctrlr->page_size;
+ prp2 = spdk_vtophys(ctrlr->eventidx, &len);
+ if (prp2 == SPDK_VTOPHYS_ERROR || len != ctrlr->page_size) {
+ rc = -EFAULT;
+ goto error;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG,
+ ctrlr->opts.admin_timeout_ms);
+
+ rc = nvme_ctrlr_cmd_doorbell_buffer_config(ctrlr, prp1, prp2,
+ nvme_ctrlr_set_doorbell_buffer_config_done, ctrlr);
+ if (rc != 0) {
+ goto error;
+ }
+
+ return 0;
+
+error:
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ nvme_ctrlr_free_doorbell_buffer(ctrlr);
+ return rc;
+}
+
+static void
+nvme_ctrlr_abort_queued_aborts(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_request *req, *tmp;
+ struct spdk_nvme_cpl cpl = {};
+
+ cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
+ cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+
+ STAILQ_FOREACH_SAFE(req, &ctrlr->queued_aborts, stailq, tmp) {
+ STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq);
+
+ nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, &cpl);
+ nvme_free_request(req);
+ }
+}
+
+int
+spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+ struct spdk_nvme_qpair *qpair;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ if (ctrlr->is_resetting || ctrlr->is_removed) {
+ /*
+ * Controller is already resetting or has been removed. Return
+ * immediately since there is no need to kick off another
+ * reset in these cases.
+ */
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return ctrlr->is_resetting ? 0 : -ENXIO;
+ }
+
+ ctrlr->is_resetting = true;
+ ctrlr->is_failed = false;
+
+ SPDK_NOTICELOG("resetting controller\n");
+
+ /* Abort all of the queued abort requests */
+ nvme_ctrlr_abort_queued_aborts(ctrlr);
+
+ nvme_transport_admin_qpair_abort_aers(ctrlr->adminq);
+
+ /* Disable all queues before disabling the controller hardware. */
+ TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
+ qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
+ }
+
+ ctrlr->adminq->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
+ nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq);
+ if (nvme_transport_ctrlr_connect_qpair(ctrlr, ctrlr->adminq) != 0) {
+ SPDK_ERRLOG("Controller reinitialization failed.\n");
+ rc = -1;
+ goto out;
+ }
+
+ /* Doorbell buffer config is invalid during reset */
+ nvme_ctrlr_free_doorbell_buffer(ctrlr);
+
+ /* Set the state back to INIT to cause a full hardware reset. */
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE);
+
+ nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_ENABLED);
+ while (ctrlr->state != NVME_CTRLR_STATE_READY) {
+ if (nvme_ctrlr_process_init(ctrlr) != 0) {
+ SPDK_ERRLOG("controller reinitialization failed\n");
+ rc = -1;
+ break;
+ }
+ }
+
+ /*
+ * For PCIe controllers, the memory locations of the transport qpairs
+ * don't change when the controller is reset. They simply need to be
+ * re-enabled with admin commands to the controller. For fabric
+ * controllers we need to disconnect and reconnect the qpair on its
+ * own thread outside of the context of the reset.
+ */
+ if (rc == 0 && ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ /* Reinitialize qpairs */
+ TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
+ if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) {
+ qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
+ rc = -1;
+ continue;
+ }
+ }
+ }
+
+out:
+ if (rc) {
+ nvme_ctrlr_fail(ctrlr, false);
+ }
+ ctrlr->is_resetting = false;
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ if (!ctrlr->cdata.oaes.ns_attribute_notices) {
+ /*
+ * If the controller doesn't support namespace attribute change notices
+ * and the namespace attributes changed (e.g. the number of namespaces),
+ * we need to update the system handling the device after the reset.
+ */
+ nvme_io_msg_ctrlr_update(ctrlr);
+ }
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_set_trid(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_transport_id *trid)
+{
+ int rc = 0;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ if (ctrlr->is_failed == false) {
+ rc = -EPERM;
+ goto out;
+ }
+
+ if (trid->trtype != ctrlr->trid.trtype) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (strncmp(trid->subnqn, ctrlr->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ ctrlr->trid = *trid;
+
+out:
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+static void
+nvme_ctrlr_identify_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("nvme_identify_controller failed!\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ /*
+ * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
+ * controller supports.
+ */
+ ctrlr->max_xfer_size = nvme_transport_ctrlr_get_max_xfer_size(ctrlr);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_xfer_size %u\n", ctrlr->max_xfer_size);
+ if (ctrlr->cdata.mdts > 0) {
+ ctrlr->max_xfer_size = spdk_min(ctrlr->max_xfer_size,
+ ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "MDTS max_xfer_size %u\n", ctrlr->max_xfer_size);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cdata.cntlid);
+ if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ ctrlr->cntlid = ctrlr->cdata.cntlid;
+ } else {
+ /*
+ * Fabrics controllers should already have CNTLID from the Connect command.
+ *
+ * If CNTLID from Connect doesn't match CNTLID in the Identify Controller data,
+ * trust the one from Connect.
+ */
+ if (ctrlr->cntlid != ctrlr->cdata.cntlid) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME,
+ "Identify CNTLID 0x%04" PRIx16 " != Connect CNTLID 0x%04" PRIx16 "\n",
+ ctrlr->cdata.cntlid, ctrlr->cntlid);
+ }
+ }
+
+ if (ctrlr->cdata.sgls.supported) {
+ assert(ctrlr->cdata.sgls.supported != 0x3);
+ ctrlr->flags |= SPDK_NVME_CTRLR_SGL_SUPPORTED;
+ if (ctrlr->cdata.sgls.supported == 0x2) {
+ ctrlr->flags |= SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT;
+ }
+ /*
+ * Use MSDBD to ensure our max_sges doesn't exceed what the
+ * controller supports.
+ */
+ ctrlr->max_sges = nvme_transport_ctrlr_get_max_sges(ctrlr);
+ if (ctrlr->cdata.nvmf_specific.msdbd != 0) {
+ ctrlr->max_sges = spdk_min(ctrlr->cdata.nvmf_specific.msdbd, ctrlr->max_sges);
+ } else {
+ /* A value of 0 indicates no limit. */
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_sges %u\n", ctrlr->max_sges);
+ }
+
+ if (ctrlr->cdata.oacs.security && !(ctrlr->quirks & NVME_QUIRK_OACS_SECURITY)) {
+ ctrlr->flags |= SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "fuses compare and write: %d\n", ctrlr->cdata.fuses.compare_and_write);
+ if (ctrlr->cdata.fuses.compare_and_write) {
+ ctrlr->flags |= SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_NUM_QUEUES,
+ ctrlr->opts.admin_timeout_ms);
+}
+
+static int
+nvme_ctrlr_identify(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY,
+ ctrlr->opts.admin_timeout_ms);
+
+ rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0,
+ &ctrlr->cdata, sizeof(ctrlr->cdata),
+ nvme_ctrlr_identify_done, ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
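+/*
+ * State tracking for retrieval of the Active Namespace ID list, which is
+ * fetched one spdk_nvme_ns_list page (1024 namespace IDs) at a time.
+ */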
+enum nvme_active_ns_state {
+ NVME_ACTIVE_NS_STATE_IDLE,
+ NVME_ACTIVE_NS_STATE_PROCESSING,
+ NVME_ACTIVE_NS_STATE_DONE,
+ NVME_ACTIVE_NS_STATE_ERROR
+};
+
+typedef void (*nvme_active_ns_ctx_deleter)(struct nvme_active_ns_ctx *);
+
+struct nvme_active_ns_ctx {
+ struct spdk_nvme_ctrlr *ctrlr;
+ uint32_t page;
+ uint32_t num_pages;
+ uint32_t next_nsid;
+ uint32_t *new_ns_list;
+ nvme_active_ns_ctx_deleter deleter;
+
+ enum nvme_active_ns_state state;
+};
+
+static struct nvme_active_ns_ctx *
+nvme_active_ns_ctx_create(struct spdk_nvme_ctrlr *ctrlr, nvme_active_ns_ctx_deleter deleter)
+{
+ struct nvme_active_ns_ctx *ctx;
+ uint32_t num_pages = 0;
+ uint32_t *new_ns_list = NULL;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ SPDK_ERRLOG("Failed to allocate nvme_active_ns_ctx!\n");
+ return NULL;
+ }
+
+ if (ctrlr->num_ns) {
+ /* The allocated size must be a multiple of sizeof(struct spdk_nvme_ns_list) */
+ num_pages = (ctrlr->num_ns * sizeof(new_ns_list[0]) - 1) / sizeof(struct spdk_nvme_ns_list) + 1;
+ new_ns_list = spdk_zmalloc(num_pages * sizeof(struct spdk_nvme_ns_list), ctrlr->page_size,
+ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE);
+ if (!new_ns_list) {
+ SPDK_ERRLOG("Failed to allocate active_ns_list!\n");
+ free(ctx);
+ return NULL;
+ }
+ }
+
+ ctx->num_pages = num_pages;
+ ctx->new_ns_list = new_ns_list;
+ ctx->ctrlr = ctrlr;
+ ctx->deleter = deleter;
+
+ return ctx;
+}
+
+static void
+nvme_active_ns_ctx_destroy(struct nvme_active_ns_ctx *ctx)
+{
+ spdk_free(ctx->new_ns_list);
+ free(ctx);
+}
+
+static void
+nvme_ctrlr_identify_active_ns_swap(struct spdk_nvme_ctrlr *ctrlr, uint32_t **new_ns_list)
+{
+ spdk_free(ctrlr->active_ns_list);
+ ctrlr->active_ns_list = *new_ns_list;
+ *new_ns_list = NULL;
+}
+
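+/*
+ * Completion callback for one page of the Active Namespace ID list. If the
+ * last NSID in the page is non-zero and more pages remain, request the next
+ * page starting after that NSID; otherwise the list is complete.
+ */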
+static void
+nvme_ctrlr_identify_active_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_active_ns_ctx *ctx = arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ ctx->state = NVME_ACTIVE_NS_STATE_ERROR;
+ goto out;
+ }
+
+ ctx->next_nsid = ctx->new_ns_list[1024 * ctx->page + 1023];
+ if (ctx->next_nsid == 0 || ++ctx->page == ctx->num_pages) {
+ ctx->state = NVME_ACTIVE_NS_STATE_DONE;
+ goto out;
+ }
+
+ nvme_ctrlr_identify_active_ns_async(ctx);
+ return;
+
+out:
+ if (ctx->deleter) {
+ ctx->deleter(ctx);
+ }
+}
+
+static void
+nvme_ctrlr_identify_active_ns_async(struct nvme_active_ns_ctx *ctx)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ctx->ctrlr;
+ uint32_t i;
+ int rc;
+
+ if (ctrlr->num_ns == 0) {
+ ctx->state = NVME_ACTIVE_NS_STATE_DONE;
+ goto out;
+ }
+
+ /*
+ * If the controller doesn't support the Active Namespace ID list (CNS 0x02),
+ * dummy up an active ns list, i.e. all namespaces report as active.
+ */
+ if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 1, 0) || ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS) {
+ for (i = 0; i < ctrlr->num_ns; i++) {
+ ctx->new_ns_list[i] = i + 1;
+ }
+
+ ctx->state = NVME_ACTIVE_NS_STATE_DONE;
+ goto out;
+ }
+
+ ctx->state = NVME_ACTIVE_NS_STATE_PROCESSING;
+ rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST, 0, ctx->next_nsid,
+ &ctx->new_ns_list[1024 * ctx->page], sizeof(struct spdk_nvme_ns_list),
+ nvme_ctrlr_identify_active_ns_async_done, ctx);
+ if (rc != 0) {
+ ctx->state = NVME_ACTIVE_NS_STATE_ERROR;
+ goto out;
+ }
+
+ return;
+
+out:
+ if (ctx->deleter) {
+ ctx->deleter(ctx);
+ }
+}
+
+static void
+_nvme_active_ns_ctx_deleter(struct nvme_active_ns_ctx *ctx)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ctx->ctrlr;
+
+ if (ctx->state == NVME_ACTIVE_NS_STATE_ERROR) {
+ nvme_ctrlr_destruct_namespaces(ctrlr);
+ nvme_active_ns_ctx_destroy(ctx);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ assert(ctx->state == NVME_ACTIVE_NS_STATE_DONE);
+ nvme_ctrlr_identify_active_ns_swap(ctrlr, &ctx->new_ns_list);
+ nvme_active_ns_ctx_destroy(ctx);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_NS, ctrlr->opts.admin_timeout_ms);
+}
+
+static void
+_nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_active_ns_ctx *ctx;
+
+ ctx = nvme_active_ns_ctx_create(ctrlr, _nvme_active_ns_ctx_deleter);
+ if (!ctx) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS,
+ ctrlr->opts.admin_timeout_ms);
+ nvme_ctrlr_identify_active_ns_async(ctx);
+}
+
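+/*
+ * Synchronous variant: polls the admin queue until the Active Namespace ID
+ * list has been fully retrieved, then swaps it into the controller.
+ */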
+int
+nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_active_ns_ctx *ctx;
+ int rc;
+
+ ctx = nvme_active_ns_ctx_create(ctrlr, NULL);
+ if (!ctx) {
+ return -ENOMEM;
+ }
+
+ nvme_ctrlr_identify_active_ns_async(ctx);
+ while (ctx->state == NVME_ACTIVE_NS_STATE_PROCESSING) {
+ rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ if (rc < 0) {
+ ctx->state = NVME_ACTIVE_NS_STATE_ERROR;
+ break;
+ }
+ }
+
+ if (ctx->state == NVME_ACTIVE_NS_STATE_ERROR) {
+ nvme_active_ns_ctx_destroy(ctx);
+ return -ENXIO;
+ }
+
+ assert(ctx->state == NVME_ACTIVE_NS_STATE_DONE);
+ nvme_ctrlr_identify_active_ns_swap(ctrlr, &ctx->new_ns_list);
+ nvme_active_ns_ctx_destroy(ctx);
+
+ return 0;
+}
+
+static void
+nvme_ctrlr_identify_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg;
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+ uint32_t nsid;
+ int rc;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ } else {
+ nvme_ns_set_identify_data(ns);
+ }
+
+ /* move on to the next active NS */
+ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id);
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ if (ns == NULL) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ID_DESCS,
+ ctrlr->opts.admin_timeout_ms);
+ return;
+ }
+ ns->ctrlr = ctrlr;
+ ns->id = nsid;
+
+ rc = nvme_ctrlr_identify_ns_async(ns);
+ if (rc) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ }
+}
+
+static int
+nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+ struct spdk_nvme_ns_data *nsdata;
+
+ nsdata = &ctrlr->nsdata[ns->id - 1];
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS,
+ ctrlr->opts.admin_timeout_ms);
+ return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id,
+ nsdata, sizeof(*nsdata),
+ nvme_ctrlr_identify_ns_async_done, ns);
+}
+
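+/*
+ * Issue Identify Namespace for the first active namespace; the remaining
+ * namespaces are chained from the completion callback.
+ */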
+static int
+nvme_ctrlr_identify_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint32_t nsid;
+ struct spdk_nvme_ns *ns;
+ int rc;
+
+ nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ if (ns == NULL) {
+ /* No active NS, move on to the next state */
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER,
+ ctrlr->opts.admin_timeout_ms);
+ return 0;
+ }
+
+ ns->ctrlr = ctrlr;
+ ns->id = nsid;
+
+ rc = nvme_ctrlr_identify_ns_async(ns);
+ if (rc) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ }
+
+ return rc;
+}
+
+static void
+nvme_ctrlr_identify_id_desc_async_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg;
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+ uint32_t nsid;
+ int rc;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER,
+ ctrlr->opts.admin_timeout_ms);
+ return;
+ }
+
+ /* move on to the next active NS */
+ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id);
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ if (ns == NULL) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER,
+ ctrlr->opts.admin_timeout_ms);
+ return;
+ }
+
+ rc = nvme_ctrlr_identify_id_desc_async(ns);
+ if (rc) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ }
+}
+
+static int
+nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+
+ memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list));
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS,
+ ctrlr->opts.admin_timeout_ms);
+ return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST,
+ 0, ns->id, ns->id_desc_list, sizeof(ns->id_desc_list),
+ nvme_ctrlr_identify_id_desc_async_done, ns);
+}
+
+static int
+nvme_ctrlr_identify_id_desc_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint32_t nsid;
+ struct spdk_nvme_ns *ns;
+ int rc;
+
+ if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) ||
+ (ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER,
+ ctrlr->opts.admin_timeout_ms);
+ return 0;
+ }
+
+ nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ if (ns == NULL) {
+ /* No active NS, move on to the next state */
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER,
+ ctrlr->opts.admin_timeout_ms);
+ return 0;
+ }
+
+ rc = nvme_ctrlr_identify_id_desc_async(ns);
+ if (rc) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ }
+
+ return rc;
+}
+
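+/*
+ * For fabrics transports, compute the usable in-capsule data size in bytes.
+ * IOCCSZ is reported in 16-byte units and includes the SQE itself.
+ */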
+static void
+nvme_ctrlr_update_nvmf_ioccsz(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA ||
+ ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP ||
+ ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_FC) {
+ if (ctrlr->cdata.nvmf_specific.ioccsz < 4) {
+ SPDK_ERRLOG("Incorrect IOCCSZ %u, the minimum value should be 4\n",
+ ctrlr->cdata.nvmf_specific.ioccsz);
+ ctrlr->cdata.nvmf_specific.ioccsz = 4;
+ assert(0);
+ }
+ ctrlr->ioccsz_bytes = ctrlr->cdata.nvmf_specific.ioccsz * 16 - sizeof(struct spdk_nvme_cmd);
+ ctrlr->icdoff = ctrlr->cdata.nvmf_specific.icdoff;
+ }
+}
+
+static void
+nvme_ctrlr_set_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ uint32_t cq_allocated, sq_allocated, min_allocated, i;
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("Set Features - Number of Queues failed!\n");
+ ctrlr->opts.num_io_queues = 0;
+ } else {
+ /*
+ * Data in cdw0 is 0-based.
+ * Lower 16-bits indicate number of submission queues allocated.
+ * Upper 16-bits indicate number of completion queues allocated.
+ */
+ sq_allocated = (cpl->cdw0 & 0xFFFF) + 1;
+ cq_allocated = (cpl->cdw0 >> 16) + 1;
+
+ /*
+ * For 1:1 queue mapping, set number of allocated queues to be minimum of
+ * submission and completion queues.
+ */
+ min_allocated = spdk_min(sq_allocated, cq_allocated);
+
+ /* Set number of queues to be minimum of requested and actually allocated. */
+ ctrlr->opts.num_io_queues = spdk_min(min_allocated, ctrlr->opts.num_io_queues);
+ }
+
+ ctrlr->free_io_qids = spdk_bit_array_create(ctrlr->opts.num_io_queues + 1);
+ if (ctrlr->free_io_qids == NULL) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ /* Initialize list of free I/O queue IDs. QID 0 is the admin queue. */
+ spdk_bit_array_clear(ctrlr->free_io_qids, 0);
+ for (i = 1; i <= ctrlr->opts.num_io_queues; i++) {
+ spdk_bit_array_set(ctrlr->free_io_qids, i);
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONSTRUCT_NS,
+ ctrlr->opts.admin_timeout_ms);
+}
+
+static int
+nvme_ctrlr_set_num_queues(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ if (ctrlr->opts.num_io_queues > SPDK_NVME_MAX_IO_QUEUES) {
+ SPDK_NOTICELOG("Limiting requested num_io_queues %u to max %d\n",
+ ctrlr->opts.num_io_queues, SPDK_NVME_MAX_IO_QUEUES);
+ ctrlr->opts.num_io_queues = SPDK_NVME_MAX_IO_QUEUES;
+ } else if (ctrlr->opts.num_io_queues < 1) {
+ SPDK_NOTICELOG("Requested num_io_queues 0, increasing to 1\n");
+ ctrlr->opts.num_io_queues = 1;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES,
+ ctrlr->opts.admin_timeout_ms);
+
+ rc = nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->opts.num_io_queues,
+ nvme_ctrlr_set_num_queues_done, ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
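+/*
+ * Completion callback for Get Features - Keep Alive Timer. Record the timeout
+ * the controller reports and schedule keep alive sends at half that interval
+ * (minimum 1 ms).
+ */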
+static void
+nvme_ctrlr_set_keep_alive_timeout_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ uint32_t keep_alive_interval_ms;
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ if ((cpl->status.sct == SPDK_NVME_SCT_GENERIC) &&
+ (cpl->status.sc == SPDK_NVME_SC_INVALID_FIELD)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Keep alive timeout Get Feature is not supported\n");
+ } else {
+ SPDK_ERRLOG("Keep alive timeout Get Feature failed: SC %x SCT %x\n",
+ cpl->status.sc, cpl->status.sct);
+ ctrlr->opts.keep_alive_timeout_ms = 0;
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+ } else {
+ if (ctrlr->opts.keep_alive_timeout_ms != cpl->cdw0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller adjusted keep alive timeout to %u ms\n",
+ cpl->cdw0);
+ }
+
+ ctrlr->opts.keep_alive_timeout_ms = cpl->cdw0;
+ }
+
+ keep_alive_interval_ms = ctrlr->opts.keep_alive_timeout_ms / 2;
+ if (keep_alive_interval_ms == 0) {
+ keep_alive_interval_ms = 1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Sending keep alive every %u ms\n", keep_alive_interval_ms);
+
+ ctrlr->keep_alive_interval_ticks = (keep_alive_interval_ms * spdk_get_ticks_hz()) / UINT64_C(1000);
+
+ /* Schedule the first Keep Alive to be sent as soon as possible. */
+ ctrlr->next_keep_alive_tick = spdk_get_ticks();
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID,
+ ctrlr->opts.admin_timeout_ms);
+}
+
+static int
+nvme_ctrlr_set_keep_alive_timeout(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ if (ctrlr->opts.keep_alive_timeout_ms == 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID,
+ ctrlr->opts.admin_timeout_ms);
+ return 0;
+ }
+
+ if (ctrlr->cdata.kas == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller KAS is 0 - not enabling Keep Alive\n");
+ ctrlr->opts.keep_alive_timeout_ms = 0;
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID,
+ ctrlr->opts.admin_timeout_ms);
+ return 0;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT,
+ ctrlr->opts.admin_timeout_ms);
+
+ /* Retrieve actual keep alive timeout, since the controller may have adjusted it. */
+ rc = spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, 0, NULL, 0,
+ nvme_ctrlr_set_keep_alive_timeout_done, ctrlr);
+ if (rc != 0) {
+ SPDK_ERRLOG("Keep alive timeout Get Feature failed: %d\n", rc);
+ ctrlr->opts.keep_alive_timeout_ms = 0;
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+nvme_ctrlr_set_host_id_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ /*
+ * Treat Set Features - Host ID failure as non-fatal, since the Host ID feature
+ * is optional.
+ */
+ SPDK_WARNLOG("Set Features - Host ID failed: SC 0x%x SCT 0x%x\n",
+ cpl->status.sc, cpl->status.sct);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Set Features - Host ID was successful\n");
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE);
+}
+
+static int
+nvme_ctrlr_set_host_id(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint8_t *host_id;
+ uint32_t host_id_size;
+ int rc;
+
+ if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ /*
+ * NVMe-oF sends the host ID during Connect and doesn't allow
+ * Set Features - Host Identifier after Connect, so we don't need to do anything here.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "NVMe-oF transport - not sending Set Features - Host ID\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ if (ctrlr->cdata.ctratt.host_id_exhid_supported) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 128-bit extended host identifier\n");
+ host_id = ctrlr->opts.extended_host_id;
+ host_id_size = sizeof(ctrlr->opts.extended_host_id);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 64-bit host identifier\n");
+ host_id = ctrlr->opts.host_id;
+ host_id_size = sizeof(ctrlr->opts.host_id);
+ }
+
+ /* If the user specified an all-zeroes host identifier, don't send the command. */
+ if (spdk_mem_all_zero(host_id, host_id_size)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME,
+ "User did not specify host ID - not sending Set Features - Host ID\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ SPDK_LOGDUMP(SPDK_LOG_NVME, "host_id", host_id, host_id_size);
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_HOST_ID,
+ ctrlr->opts.admin_timeout_ms);
+
+ rc = nvme_ctrlr_cmd_set_host_id(ctrlr, host_id, host_id_size, nvme_ctrlr_set_host_id_done, ctrlr);
+ if (rc != 0) {
+ SPDK_ERRLOG("Set Features - Host ID failed: %d\n", rc);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr->ns) {
+ uint32_t i, num_ns = ctrlr->num_ns;
+
+ for (i = 0; i < num_ns; i++) {
+ nvme_ns_destruct(&ctrlr->ns[i]);
+ }
+
+ spdk_free(ctrlr->ns);
+ ctrlr->ns = NULL;
+ ctrlr->num_ns = 0;
+ }
+
+ if (ctrlr->nsdata) {
+ spdk_free(ctrlr->nsdata);
+ ctrlr->nsdata = NULL;
+ }
+
+ spdk_free(ctrlr->active_ns_list);
+ ctrlr->active_ns_list = NULL;
+}
+
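+/*
+ * Reconcile per-namespace state with the current active namespace list:
+ * update or construct namespaces that are active, and destruct namespaces
+ * that are no longer active.
+ */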
+static void
+nvme_ctrlr_update_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint32_t i, nn = ctrlr->cdata.nn;
+ struct spdk_nvme_ns_data *nsdata;
+ bool ns_is_active;
+
+ for (i = 0; i < nn; i++) {
+ struct spdk_nvme_ns *ns = &ctrlr->ns[i];
+ uint32_t nsid = i + 1;
+
+ nsdata = &ctrlr->nsdata[nsid - 1];
+ ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);
+
+ if (nsdata->ncap && ns_is_active) {
+ if (nvme_ns_update(ns) != 0) {
+ SPDK_ERRLOG("Failed to update active NS %u\n", nsid);
+ continue;
+ }
+ }
+
+ if ((nsdata->ncap == 0) && ns_is_active) {
+ if (nvme_ns_construct(ns, nsid, ctrlr) != 0) {
+ continue;
+ }
+ }
+
+ if (nsdata->ncap && !ns_is_active) {
+ nvme_ns_destruct(ns);
+ }
+ }
+}
+
+static int
+nvme_ctrlr_construct_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+ uint32_t nn = ctrlr->cdata.nn;
+
+ /* ctrlr->num_ns may be 0 (startup) or a different number of namespaces (reset),
+ * so check if we need to reallocate.
+ */
+ if (nn != ctrlr->num_ns) {
+ nvme_ctrlr_destruct_namespaces(ctrlr);
+
+ if (nn == 0) {
+ SPDK_WARNLOG("controller has 0 namespaces\n");
+ return 0;
+ }
+
+ ctrlr->ns = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns), 64, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (ctrlr->ns == NULL) {
+ rc = -ENOMEM;
+ goto fail;
+ }
+
+ ctrlr->nsdata = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns_data), 64,
+ NULL, SPDK_ENV_SOCKET_ID_ANY,
+ SPDK_MALLOC_SHARE | SPDK_MALLOC_DMA);
+ if (ctrlr->nsdata == NULL) {
+ rc = -ENOMEM;
+ goto fail;
+ }
+
+ ctrlr->num_ns = nn;
+ }
+
+ return 0;
+
+fail:
+ nvme_ctrlr_destruct_namespaces(ctrlr);
+ return rc;
+}
+
+static void
+nvme_ctrlr_async_event_cb(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_async_event_request *aer = arg;
+ struct spdk_nvme_ctrlr *ctrlr = aer->ctrlr;
+ struct spdk_nvme_ctrlr_process *active_proc;
+ union spdk_nvme_async_event_completion event;
+ int rc;
+
+ if (cpl->status.sct == SPDK_NVME_SCT_GENERIC &&
+ cpl->status.sc == SPDK_NVME_SC_ABORTED_SQ_DELETION) {
+ /*
+ * This is simulated when the controller is being shut down, to
+ * effectively abort outstanding asynchronous event requests
+ * and make sure all memory is freed. Do not repost the
+ * request in this case.
+ */
+ return;
+ }
+
+ if (cpl->status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC &&
+ cpl->status.sc == SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED) {
+ /*
+ * SPDK will only send as many AERs as the device says it supports,
+ * so this status code indicates an out-of-spec device. Do not repost
+ * the request in this case.
+ */
+ SPDK_ERRLOG("Controller appears out-of-spec for asynchronous event request\n"
+ "handling. Do not repost this AER.\n");
+ return;
+ }
+
+ event.raw = cpl->cdw0;
+ if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
+ (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
+ rc = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (rc) {
+ return;
+ }
+ nvme_ctrlr_update_namespaces(ctrlr);
+ nvme_io_msg_ctrlr_update(ctrlr);
+ }
+
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc && active_proc->aer_cb_fn) {
+ active_proc->aer_cb_fn(active_proc->aer_cb_arg, cpl);
+ }
+
+ /* If the ctrlr was removed or in the destruct state, we should not send aer again */
+ if (ctrlr->is_removed || ctrlr->is_destructed) {
+ return;
+ }
+
+ /*
+ * Repost another asynchronous event request to replace the one
+ * that just completed.
+ */
+ if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer)) {
+ /*
+ * We can't do anything to recover from a failure here,
+ * so just print a warning message and leave the AER unsubmitted.
+ */
+ SPDK_ERRLOG("resubmitting AER failed!\n");
+ }
+}
+
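+/* Allocate an admin request with no payload for an Asynchronous Event Request and submit it. */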
+static int
+nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_async_event_request *aer)
+{
+ struct nvme_request *req;
+
+ aer->ctrlr = ctrlr;
+ req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_async_event_cb, aer);
+ aer->req = req;
+ if (req == NULL) {
+ return -1;
+ }
+
+ req->cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST;
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static void
+nvme_ctrlr_configure_aer_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_async_event_request *aer;
+ int rc;
+ uint32_t i;
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_NOTICELOG("nvme_ctrlr_configure_aer failed!\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES,
+ ctrlr->opts.admin_timeout_ms);
+ return;
+ }
+
+ /* aerl is a zero-based value, so we need to add 1 here. */
+ ctrlr->num_aers = spdk_min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl + 1));
+
+ for (i = 0; i < ctrlr->num_aers; i++) {
+ aer = &ctrlr->aer[i];
+ rc = nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+ if (rc) {
+ SPDK_ERRLOG("nvme_ctrlr_construct_and_submit_aer failed!\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES,
+ ctrlr->opts.admin_timeout_ms);
+}
+
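+/*
+ * Configure which asynchronous event types the controller should report;
+ * the initial AERs are submitted from the completion callback.
+ */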
+static int
+nvme_ctrlr_configure_aer(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_feat_async_event_configuration config;
+ int rc;
+
+ config.raw = 0;
+ config.bits.crit_warn.bits.available_spare = 1;
+ config.bits.crit_warn.bits.temperature = 1;
+ config.bits.crit_warn.bits.device_reliability = 1;
+ config.bits.crit_warn.bits.read_only = 1;
+ config.bits.crit_warn.bits.volatile_memory_backup = 1;
+
+ if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 2, 0)) {
+ if (ctrlr->cdata.oaes.ns_attribute_notices) {
+ config.bits.ns_attr_notice = 1;
+ }
+ if (ctrlr->cdata.oaes.fw_activation_notices) {
+ config.bits.fw_activation_notice = 1;
+ }
+ }
+ if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 3, 0) && ctrlr->cdata.lpa.telemetry) {
+ config.bits.telemetry_log_notice = 1;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER,
+ ctrlr->opts.admin_timeout_ms);
+
+ rc = nvme_ctrlr_cmd_set_async_event_config(ctrlr, config,
+ nvme_ctrlr_configure_aer_done,
+ ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+struct spdk_nvme_ctrlr_process *
+nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, pid_t pid)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) {
+ if (active_proc->pid == pid) {
+ return active_proc;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_nvme_ctrlr_process *
+nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return nvme_ctrlr_get_process(ctrlr, getpid());
+}
+
+/**
+ * This function will be called when a process starts using the controller.
+ * 1. For the primary process, it is called when constructing the controller.
+ * 2. For a secondary process, it is called when probing the controller.
+ * Note: the process is only added once; repeated calls for the same process are no-ops.
+ */
+int
+nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle)
+{
+ struct spdk_nvme_ctrlr_process *ctrlr_proc;
+ pid_t pid = getpid();
+
+ /* Check whether the process is already added or not */
+ if (nvme_ctrlr_get_process(ctrlr, pid)) {
+ return 0;
+ }
+
+ /* Initialize the per process properties for this ctrlr */
+ ctrlr_proc = spdk_zmalloc(sizeof(struct spdk_nvme_ctrlr_process),
+ 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (ctrlr_proc == NULL) {
+ SPDK_ERRLOG("failed to allocate memory to track the process props\n");
+
+ return -1;
+ }
+
+ ctrlr_proc->is_primary = spdk_process_is_primary();
+ ctrlr_proc->pid = pid;
+ STAILQ_INIT(&ctrlr_proc->active_reqs);
+ ctrlr_proc->devhandle = devhandle;
+ ctrlr_proc->ref = 0;
+ TAILQ_INIT(&ctrlr_proc->allocated_io_qpairs);
+
+ TAILQ_INSERT_TAIL(&ctrlr->active_procs, ctrlr_proc, tailq);
+
+ return 0;
+}
+
+/**
+ * This function will be called when the process detaches the controller.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_ctrlr_remove_process(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_ctrlr_process *proc)
+{
+ struct spdk_nvme_qpair *qpair, *tmp_qpair;
+
+ assert(STAILQ_EMPTY(&proc->active_reqs));
+
+ TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) {
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ }
+
+ TAILQ_REMOVE(&ctrlr->active_procs, proc, tailq);
+
+ if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ spdk_pci_device_detach(proc->devhandle);
+ }
+
+ spdk_free(proc);
+}
+
+/**
+ * This function will be called when a process has exited unexpectedly,
+ * in order to free any incomplete nvme requests, allocated I/O qpairs,
+ * and allocated memory.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_ctrlr_cleanup_process(struct spdk_nvme_ctrlr_process *proc)
+{
+ struct nvme_request *req, *tmp_req;
+ struct spdk_nvme_qpair *qpair, *tmp_qpair;
+
+ STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
+ STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
+
+ assert(req->pid == proc->pid);
+
+ nvme_free_request(req);
+ }
+
+ TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) {
+ TAILQ_REMOVE(&proc->allocated_io_qpairs, qpair, per_process_tailq);
+
+ /*
+ * The process may have been killed while some qpairs were in their
+ * completion context. Clear that flag here to allow these IO
+ * qpairs to be deleted.
+ */
+ qpair->in_completion_context = 0;
+
+ qpair->no_deletion_notification_needed = 1;
+
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ }
+
+ spdk_free(proc);
+}
+
+/**
+ * This function will be called when destructing the controller.
+ * 1. There are no more admin requests outstanding on this controller.
+ * 2. Clean up any leftover resource allocations whose associated process is gone.
+ */
+void
+nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc, *tmp;
+
+ /* Free all the processes' properties and make sure there are no pending admin I/Os */
+ TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) {
+ TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq);
+
+ assert(STAILQ_EMPTY(&active_proc->active_reqs));
+
+ spdk_free(active_proc);
+ }
+}
+
+/**
+ * This function will be called when any other process attaches or
+ * detaches the controller, in order to clean up any processes that
+ * terminated unexpectedly.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static int
+nvme_ctrlr_remove_inactive_proc(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc, *tmp;
+ int active_proc_count = 0;
+
+ TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) {
+ if ((kill(active_proc->pid, 0) == -1) && (errno == ESRCH)) {
+ SPDK_ERRLOG("process %d terminated unexpected\n", active_proc->pid);
+
+ TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq);
+
+ nvme_ctrlr_cleanup_process(active_proc);
+ } else {
+ active_proc_count++;
+ }
+ }
+
+ return active_proc_count;
+}
+
+void
+nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ nvme_ctrlr_remove_inactive_proc(ctrlr);
+
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ active_proc->ref++;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+void
+nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ int proc_count;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ proc_count = nvme_ctrlr_remove_inactive_proc(ctrlr);
+
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ active_proc->ref--;
+ assert(active_proc->ref >= 0);
+
+ /*
+ * The last active process will be removed at the end of
+ * the destruction of the controller.
+ */
+ if (active_proc->ref == 0 && proc_count != 1) {
+ nvme_ctrlr_remove_process(ctrlr, active_proc);
+ }
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+int
+nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ int ref = 0;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ nvme_ctrlr_remove_inactive_proc(ctrlr);
+
+ TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) {
+ ref += active_proc->ref;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return ref;
+}
+
+/**
+ * Get the PCI device handle which is only visible to its associated process.
+ */
+struct spdk_pci_device *
+nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ struct spdk_pci_device *devhandle = NULL;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ devhandle = active_proc->devhandle;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return devhandle;
+}
+
+/**
+ * This function will be called repeatedly during initialization until the controller is ready.
+ */
+int
+nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_cc_register cc;
+ union spdk_nvme_csts_register csts;
+ uint32_t ready_timeout_in_ms;
+ int rc = 0;
+
+ /*
+ * May need to avoid accessing any register on the target controller
+ * for a while. Return early without touching the state machine.
+ * The sleep_timeout_tsc > 0 check exists for the unit tests.
+ */
+ if ((ctrlr->sleep_timeout_tsc > 0) &&
+ (spdk_get_ticks() <= ctrlr->sleep_timeout_tsc)) {
+ return 0;
+ }
+ ctrlr->sleep_timeout_tsc = 0;
+
+ if (nvme_ctrlr_get_cc(ctrlr, &cc) ||
+ nvme_ctrlr_get_csts(ctrlr, &csts)) {
+ if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE) {
+ /* While a device is resetting, it may be unable to service MMIO reads
+ * temporarily. Allow for this case.
+ */
+ SPDK_ERRLOG("Get registers failed while waiting for CSTS.RDY == 0\n");
+ goto init_timeout;
+ }
+ SPDK_ERRLOG("Failed to read CC and CSTS in state %d\n", ctrlr->state);
+ return -EIO;
+ }
+
+ ready_timeout_in_ms = 500 * ctrlr->cap.bits.to;
+
+ /*
+ * Check if the current initialization step is done or has timed out.
+ */
+ switch (ctrlr->state) {
+ case NVME_CTRLR_STATE_INIT_DELAY:
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, ready_timeout_in_ms);
+ if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_INIT) {
+ /*
+ * Controller may need some delay before it's enabled.
+ *
+ * This is a workaround for an issue where the PCIe-attached NVMe controller
+ * is not ready after VFIO reset. We delay the initialization rather than the
+ * enabling itself, because this is required only for the very first enabling
+ * - directly after a VFIO reset.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Adding 2 second delay before initializing the controller\n");
+ ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2000 * spdk_get_ticks_hz() / 1000);
+ }
+ break;
+
+ case NVME_CTRLR_STATE_INIT:
+ /* Begin the hardware initialization by making sure the controller is disabled. */
+ if (cc.bits.en) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1\n");
+ /*
+ * Controller is currently enabled. We need to disable it to cause a reset.
+ *
+ * If CC.EN = 1 && CSTS.RDY = 0, the controller is in the process of becoming ready.
+ * Wait for the ready bit to be 1 before disabling the controller.
+ */
+ if (csts.bits.rdy == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 0 - waiting for reset to complete\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, ready_timeout_in_ms);
+ return 0;
+ }
+
+ /* CC.EN = 1 && CSTS.RDY == 1, so we can immediately disable the controller. */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n");
+ cc.bits.en = 0;
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("set_cc() failed\n");
+ return -EIO;
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms);
+
+ /*
+ * Wait 2.5 seconds before accessing PCI registers.
+ * Not using sleep() to avoid blocking other controllers' initialization.
+ */
+ if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Applying quirk: delay 2.5 seconds before reading registers\n");
+ ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2500 * spdk_get_ticks_hz() / 1000);
+ }
+ return 0;
+ } else {
+ if (csts.bits.rdy == 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 1 - waiting for shutdown to complete\n");
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms);
+ return 0;
+ }
+ break;
+
+ case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1:
+ if (csts.bits.rdy == 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - disabling controller\n");
+ /* CC.EN = 1 && CSTS.RDY = 1, so we can set CC.EN = 0 now. */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n");
+ cc.bits.en = 0;
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("set_cc() failed\n");
+ return -EIO;
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms);
+ return 0;
+ }
+ break;
+
+ case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0:
+ if (csts.bits.rdy == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 0\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE, ready_timeout_in_ms);
+ /*
+ * Delay 100us before setting CC.EN = 1. Some NVMe SSDs miss CC.EN getting
+ * set to 1 if it is too soon after CSTS.RDY is reported as 0.
+ */
+ spdk_delay_us(100);
+ return 0;
+ }
+ break;
+
+ case NVME_CTRLR_STATE_ENABLE:
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 1\n");
+ rc = nvme_ctrlr_enable(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, ready_timeout_in_ms);
+ return rc;
+
+ case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1:
+ if (csts.bits.rdy == 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - controller is ready\n");
+ /*
+ * The controller has been enabled.
+ * Perform the rest of initialization serially.
+ */
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_RESET_ADMIN_QUEUE,
+ ctrlr->opts.admin_timeout_ms);
+ return 0;
+ }
+ break;
+
+ case NVME_CTRLR_STATE_RESET_ADMIN_QUEUE:
+ nvme_transport_qpair_reset(ctrlr->adminq);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY,
+ ctrlr->opts.admin_timeout_ms);
+ break;
+
+ case NVME_CTRLR_STATE_IDENTIFY:
+ rc = nvme_ctrlr_identify(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_SET_NUM_QUEUES:
+ nvme_ctrlr_update_nvmf_ioccsz(ctrlr);
+ rc = nvme_ctrlr_set_num_queues(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_CONSTRUCT_NS:
+ rc = nvme_ctrlr_construct_namespaces(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS,
+ ctrlr->opts.admin_timeout_ms);
+ break;
+
+ case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS:
+ _nvme_ctrlr_identify_active_ns(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_IDENTIFY_NS:
+ rc = nvme_ctrlr_identify_namespaces(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS:
+ rc = nvme_ctrlr_identify_id_desc_namespaces(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_CONFIGURE_AER:
+ rc = nvme_ctrlr_configure_aer(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES:
+ rc = nvme_ctrlr_set_supported_log_pages(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES,
+ ctrlr->opts.admin_timeout_ms);
+ break;
+
+ case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES:
+ nvme_ctrlr_set_supported_features(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_DB_BUF_CFG,
+ ctrlr->opts.admin_timeout_ms);
+ break;
+
+ case NVME_CTRLR_STATE_SET_DB_BUF_CFG:
+ rc = nvme_ctrlr_set_doorbell_buffer_config(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT:
+ rc = nvme_ctrlr_set_keep_alive_timeout(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_SET_HOST_ID:
+ rc = nvme_ctrlr_set_host_id(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_READY:
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Ctrlr already in ready state\n");
+ return 0;
+
+ case NVME_CTRLR_STATE_ERROR:
+ SPDK_ERRLOG("Ctrlr %s is in error state\n", ctrlr->trid.traddr);
+ return -1;
+
+ default:
+ assert(0);
+ return -1;
+ }
+
+init_timeout:
+ if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE &&
+ spdk_get_ticks() > ctrlr->state_timeout_tsc) {
+ SPDK_ERRLOG("Initialization timed out in state %d\n", ctrlr->state);
+ return -1;
+ }
+
+ return rc;
+}
+
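+/*
+ * Initialize a recursive mutex that is also robust and process-shared where
+ * supported (the robust/pshared attributes are not available on FreeBSD).
+ */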
+int
+nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx)
+{
+ pthread_mutexattr_t attr;
+ int rc = 0;
+
+ if (pthread_mutexattr_init(&attr)) {
+ return -1;
+ }
+ if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) ||
+#ifndef __FreeBSD__
+ pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) ||
+ pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) ||
+#endif
+ pthread_mutex_init(mtx, &attr)) {
+ rc = -1;
+ }
+ pthread_mutexattr_destroy(&attr);
+ return rc;
+}
+
+int
+nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT_DELAY, NVME_TIMEOUT_INFINITE);
+ } else {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE);
+ }
+
+ if (ctrlr->opts.admin_queue_size > SPDK_NVME_ADMIN_QUEUE_MAX_ENTRIES) {
+ SPDK_ERRLOG("admin_queue_size %u exceeds max defined by NVMe spec, use max value\n",
+ ctrlr->opts.admin_queue_size);
+ ctrlr->opts.admin_queue_size = SPDK_NVME_ADMIN_QUEUE_MAX_ENTRIES;
+ }
+
+ if (ctrlr->opts.admin_queue_size < SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES) {
+ SPDK_ERRLOG("admin_queue_size %u is less than minimum defined by NVMe spec, use min value\n",
+ ctrlr->opts.admin_queue_size);
+ ctrlr->opts.admin_queue_size = SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES;
+ }
+
+ ctrlr->flags = 0;
+ ctrlr->free_io_qids = NULL;
+ ctrlr->is_resetting = false;
+ ctrlr->is_failed = false;
+ ctrlr->is_destructed = false;
+
+ TAILQ_INIT(&ctrlr->active_io_qpairs);
+ STAILQ_INIT(&ctrlr->queued_aborts);
+ ctrlr->outstanding_aborts = 0;
+
+ rc = nvme_robust_mutex_init_recursive_shared(&ctrlr->ctrlr_lock);
+ if (rc != 0) {
+ return rc;
+ }
+
+ TAILQ_INIT(&ctrlr->active_procs);
+
+ return rc;
+}
+
+/* This function should be called once at ctrlr initialization to set up constant properties. */
+void
+nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap,
+ const union spdk_nvme_vs_register *vs)
+{
+ ctrlr->cap = *cap;
+ ctrlr->vs = *vs;
+
+ if (ctrlr->cap.bits.ams & SPDK_NVME_CAP_AMS_WRR) {
+ ctrlr->flags |= SPDK_NVME_CTRLR_WRR_SUPPORTED;
+ }
+
+ ctrlr->min_page_size = 1u << (12 + ctrlr->cap.bits.mpsmin);
+
+ /* For now, always select page_size == min_page_size. */
+ ctrlr->page_size = ctrlr->min_page_size;
+
+ ctrlr->opts.io_queue_size = spdk_max(ctrlr->opts.io_queue_size, SPDK_NVME_IO_QUEUE_MIN_ENTRIES);
+ ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, MAX_IO_QUEUE_ENTRIES);
+ ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, ctrlr->cap.bits.mqes + 1u);
+
+ ctrlr->opts.io_queue_requests = spdk_max(ctrlr->opts.io_queue_requests, ctrlr->opts.io_queue_size);
+}
+
+void
+nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr)
+{
+ pthread_mutex_destroy(&ctrlr->ctrlr_lock);
+}
+
+void
+nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_qpair *qpair, *tmp;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Prepare to destruct SSD: %s\n", ctrlr->trid.traddr);
+
+ ctrlr->is_destructed = true;
+
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+
+ nvme_ctrlr_abort_queued_aborts(ctrlr);
+ nvme_transport_admin_qpair_abort_aers(ctrlr->adminq);
+
+ TAILQ_FOREACH_SAFE(qpair, &ctrlr->active_io_qpairs, tailq, tmp) {
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ }
+
+ nvme_ctrlr_free_doorbell_buffer(ctrlr);
+
+ if (ctrlr->opts.no_shn_notification) {
+ SPDK_INFOLOG(SPDK_LOG_NVME, "Disable SSD: %s without shutdown notification\n",
+ ctrlr->trid.traddr);
+ nvme_ctrlr_disable(ctrlr);
+ } else {
+ nvme_ctrlr_shutdown(ctrlr);
+ }
+
+ nvme_ctrlr_destruct_namespaces(ctrlr);
+
+ spdk_bit_array_free(&ctrlr->free_io_qids);
+
+ nvme_transport_ctrlr_destruct(ctrlr);
+}
+
+int
+nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_request *req)
+{
+ return nvme_qpair_submit_request(ctrlr->adminq, req);
+}
+
+static void
+nvme_keep_alive_completion(void *cb_ctx, const struct spdk_nvme_cpl *cpl)
+{
+ /* Do nothing */
+}
+
+/*
+ * Check if we need to send a Keep Alive command.
+ * Caller must hold ctrlr->ctrlr_lock.
+ */
+static void
+nvme_ctrlr_keep_alive(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint64_t now;
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ now = spdk_get_ticks();
+ if (now < ctrlr->next_keep_alive_tick) {
+ return;
+ }
+
+ req = nvme_allocate_request_null(ctrlr->adminq, nvme_keep_alive_completion, NULL);
+ if (req == NULL) {
+ return;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_KEEP_ALIVE;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ if (rc != 0) {
+ SPDK_ERRLOG("Submitting Keep Alive failed\n");
+ }
+
+ ctrlr->next_keep_alive_tick = now + ctrlr->keep_alive_interval_ticks;
+}
+
+int32_t
+spdk_nvme_ctrlr_process_admin_completions(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int32_t num_completions;
+ int32_t rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ if (ctrlr->keep_alive_interval_ticks) {
+ nvme_ctrlr_keep_alive(ctrlr);
+ }
+
+ rc = nvme_io_msg_process(ctrlr);
+ if (rc < 0) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+ }
+ num_completions = rc;
+
+ rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ if (rc < 0) {
+ num_completions = rc;
+ } else {
+ num_completions += rc;
+ }
+
+ return num_completions;
+}
+
+const struct spdk_nvme_ctrlr_data *
+spdk_nvme_ctrlr_get_data(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return &ctrlr->cdata;
+}
+
+union spdk_nvme_csts_register spdk_nvme_ctrlr_get_regs_csts(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_csts_register csts;
+
+ if (nvme_ctrlr_get_csts(ctrlr, &csts)) {
+ csts.raw = 0xFFFFFFFFu;
+ }
+ return csts;
+}
+
+union spdk_nvme_cap_register spdk_nvme_ctrlr_get_regs_cap(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->cap;
+}
+
+union spdk_nvme_vs_register spdk_nvme_ctrlr_get_regs_vs(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->vs;
+}
+
+union spdk_nvme_cmbsz_register spdk_nvme_ctrlr_get_regs_cmbsz(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_cmbsz_register cmbsz;
+
+ if (nvme_ctrlr_get_cmbsz(ctrlr, &cmbsz)) {
+ cmbsz.raw = 0;
+ }
+
+ return cmbsz;
+}
+
+uint32_t
+spdk_nvme_ctrlr_get_num_ns(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->num_ns;
+}
+
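+/*
+ * Binary search the sorted active_ns_list for nsid. Returns the index of the
+ * namespace ID if it is active, or -1 otherwise.
+ */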
+static int32_t
+nvme_ctrlr_active_ns_idx(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ int32_t result = -1;
+
+ if (ctrlr->active_ns_list == NULL || nsid == 0 || nsid > ctrlr->num_ns) {
+ return result;
+ }
+
+ int32_t lower = 0;
+ int32_t upper = ctrlr->num_ns - 1;
+ int32_t mid;
+
+ while (lower <= upper) {
+ mid = lower + (upper - lower) / 2;
+ if (ctrlr->active_ns_list[mid] == nsid) {
+ result = mid;
+ break;
+ } else {
+ if (ctrlr->active_ns_list[mid] != 0 && ctrlr->active_ns_list[mid] < nsid) {
+ lower = mid + 1;
+ } else {
+ upper = mid - 1;
+ }
+
+ }
+ }
+
+ return result;
+}
+
+bool
+spdk_nvme_ctrlr_is_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ return nvme_ctrlr_active_ns_idx(ctrlr, nsid) != -1;
+}
+
+uint32_t
+spdk_nvme_ctrlr_get_first_active_ns(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->active_ns_list ? ctrlr->active_ns_list[0] : 0;
+}
+
+uint32_t
+spdk_nvme_ctrlr_get_next_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t prev_nsid)
+{
+ int32_t nsid_idx = nvme_ctrlr_active_ns_idx(ctrlr, prev_nsid);
+ if (ctrlr->active_ns_list && nsid_idx >= 0 && (uint32_t)nsid_idx < ctrlr->num_ns - 1) {
+ return ctrlr->active_ns_list[nsid_idx + 1];
+ }
+ return 0;
+}
+
+struct spdk_nvme_ns *
+spdk_nvme_ctrlr_get_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ if (nsid < 1 || nsid > ctrlr->num_ns) {
+ return NULL;
+ }
+
+ return &ctrlr->ns[nsid - 1];
+}
+
+struct spdk_pci_device *
+spdk_nvme_ctrlr_get_pci_device(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr == NULL) {
+ return NULL;
+ }
+
+ if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ return NULL;
+ }
+
+ return nvme_ctrlr_proc_get_devhandle(ctrlr);
+}
+
+uint32_t
+spdk_nvme_ctrlr_get_max_xfer_size(const struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->max_xfer_size;
+}
+
+void
+spdk_nvme_ctrlr_register_aer_callback(struct spdk_nvme_ctrlr *ctrlr,
+ spdk_nvme_aer_cb aer_cb_fn,
+ void *aer_cb_arg)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ active_proc->aer_cb_fn = aer_cb_fn;
+ active_proc->aer_cb_arg = aer_cb_arg;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+void
+spdk_nvme_ctrlr_register_timeout_callback(struct spdk_nvme_ctrlr *ctrlr,
+ uint64_t timeout_us, spdk_nvme_timeout_cb cb_fn, void *cb_arg)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ active_proc->timeout_ticks = timeout_us * spdk_get_ticks_hz() / 1000000ULL;
+ active_proc->timeout_cb_fn = cb_fn;
+ active_proc->timeout_cb_arg = cb_arg;
+ }
+
+ ctrlr->timeout_enabled = true;
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+bool
+spdk_nvme_ctrlr_is_log_page_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page)
+{
+ /* No bounds check necessary, since log_page is uint8_t and log_page_supported has 256 entries */
+ SPDK_STATIC_ASSERT(sizeof(ctrlr->log_page_supported) == 256, "log_page_supported size mismatch");
+ return ctrlr->log_page_supported[log_page];
+}
+
+bool
+spdk_nvme_ctrlr_is_feature_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature_code)
+{
+ /* No bounds check necessary, since feature_code is uint8_t and feature_supported has 256 entries */
+ SPDK_STATIC_ASSERT(sizeof(ctrlr->feature_supported) == 256, "feature_supported size mismatch");
+ return ctrlr->feature_supported[feature_code];
+}
+
+int
+spdk_nvme_ctrlr_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload)
+{
+ struct nvme_completion_poll_status *status;
+ int res;
+ struct spdk_nvme_ns *ns;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ res = nvme_ctrlr_cmd_attach_ns(ctrlr, nsid, payload,
+ nvme_completion_poll_cb, status);
+ if (res) {
+ free(status);
+ return res;
+ }
+ if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_attach_ns failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -ENXIO;
+ }
+ free(status);
+
+ res = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (res) {
+ return res;
+ }
+
+ ns = &ctrlr->ns[nsid - 1];
+ return nvme_ns_construct(ns, nsid, ctrlr);
+}
+
+int
+spdk_nvme_ctrlr_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload)
+{
+ struct nvme_completion_poll_status *status;
+ int res;
+ struct spdk_nvme_ns *ns;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ res = nvme_ctrlr_cmd_detach_ns(ctrlr, nsid, payload,
+ nvme_completion_poll_cb, status);
+ if (res) {
+ free(status);
+ return res;
+ }
+ if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_detach_ns failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -ENXIO;
+ }
+ free(status);
+
+ res = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (res) {
+ return res;
+ }
+
+ ns = &ctrlr->ns[nsid - 1];
+ /* Inactive NS */
+ nvme_ns_destruct(ns);
+
+ return 0;
+}
+
+uint32_t
+spdk_nvme_ctrlr_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload)
+{
+ struct nvme_completion_poll_status *status;
+ int res;
+ uint32_t nsid;
+ struct spdk_nvme_ns *ns;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return 0;
+ }
+
+ res = nvme_ctrlr_cmd_create_ns(ctrlr, payload, nvme_completion_poll_cb, status);
+ if (res) {
+ free(status);
+ return 0;
+ }
+ if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_create_ns failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return 0;
+ }
+
+ nsid = status->cpl.cdw0;
+ ns = &ctrlr->ns[nsid - 1];
+ free(status);
+ /* Inactive NS */
+ res = nvme_ns_construct(ns, nsid, ctrlr);
+ if (res) {
+ return 0;
+ }
+
+ /* Return the namespace ID that was created */
+ return nsid;
+}
+
+int
+spdk_nvme_ctrlr_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ struct nvme_completion_poll_status *status;
+ int res;
+ struct spdk_nvme_ns *ns;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ res = nvme_ctrlr_cmd_delete_ns(ctrlr, nsid, nvme_completion_poll_cb, status);
+ if (res) {
+ free(status);
+ return res;
+ }
+ if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_delete_ns failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -ENXIO;
+ }
+ free(status);
+
+ res = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (res) {
+ return res;
+ }
+
+ ns = &ctrlr->ns[nsid - 1];
+ nvme_ns_destruct(ns);
+
+ return 0;
+}
+
+int
+spdk_nvme_ctrlr_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_format *format)
+{
+ struct nvme_completion_poll_status *status;
+ int res;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ res = nvme_ctrlr_cmd_format(ctrlr, nsid, format, nvme_completion_poll_cb,
+ status);
+ if (res) {
+ free(status);
+ return res;
+ }
+ if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_format failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -ENXIO;
+ }
+ free(status);
+
+ return spdk_nvme_ctrlr_reset(ctrlr);
+}
+
+int
+spdk_nvme_ctrlr_update_firmware(struct spdk_nvme_ctrlr *ctrlr, void *payload, uint32_t size,
+ int slot, enum spdk_nvme_fw_commit_action commit_action, struct spdk_nvme_status *completion_status)
+{
+ struct spdk_nvme_fw_commit fw_commit;
+ struct nvme_completion_poll_status *status;
+ int res;
+ unsigned int size_remaining;
+ unsigned int offset;
+ unsigned int transfer;
+ void *p;
+
+ if (!completion_status) {
+ return -EINVAL;
+ }
+ memset(completion_status, 0, sizeof(struct spdk_nvme_status));
+ if (size % 4) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid size!\n");
+ return -1;
+ }
+
+ /* Only SPDK_NVME_FW_COMMIT_REPLACE_IMG and
+ * SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG are currently supported.
+ */
+ if ((commit_action != SPDK_NVME_FW_COMMIT_REPLACE_IMG) &&
+ (commit_action != SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid command!\n");
+ return -1;
+ }
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ /* Firmware download */
+ size_remaining = size;
+ offset = 0;
+ p = payload;
+
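+ /* Transfer the image in chunks of at most min_page_size bytes; each chunk is a
+ * separate Firmware Image Download admin command and is completed synchronously
+ * before the next chunk is sent.
+ */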
+ while (size_remaining > 0) {
+ transfer = spdk_min(size_remaining, ctrlr->min_page_size);
+
+ memset(status, 0, sizeof(*status));
+ res = nvme_ctrlr_cmd_fw_image_download(ctrlr, transfer, offset, p,
+ nvme_completion_poll_cb,
+ status);
+ if (res) {
+ free(status);
+ return res;
+ }
+
+ if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_fw_image_download failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -ENXIO;
+ }
+ p += transfer;
+ offset += transfer;
+ size_remaining -= transfer;
+ }
+
+ /* Firmware commit */
+ memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit));
+ fw_commit.fs = slot;
+ fw_commit.ca = commit_action;
+
+ memset(status, 0, sizeof(*status));
+ res = nvme_ctrlr_cmd_fw_commit(ctrlr, &fw_commit, nvme_completion_poll_cb,
+ status);
+ if (res) {
+ free(status);
+ return res;
+ }
+
+ res = nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock);
+
+ memcpy(completion_status, &status->cpl.status, sizeof(struct spdk_nvme_status));
+
+ if (!status->timed_out) {
+ free(status);
+ }
+
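+ /* A FIRMWARE_REQ_NVM_RESET status is not an error: the controller reset performed
+ * below activates the new image. A required conventional reset cannot be issued
+ * from here, so it is only reported and the call fails; any other error status
+ * also fails the update.
+ */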
+ if (res) {
+ if (completion_status->sct != SPDK_NVME_SCT_COMMAND_SPECIFIC ||
+ completion_status->sc != SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET) {
+ if (completion_status->sct == SPDK_NVME_SCT_COMMAND_SPECIFIC &&
+ completion_status->sc == SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET) {
+ SPDK_NOTICELOG("firmware activation requires a conventional reset to be performed.\n");
+ } else {
+ SPDK_ERRLOG("nvme_ctrlr_cmd_fw_commit failed!\n");
+ }
+ return -ENXIO;
+ }
+ }
+
+ return spdk_nvme_ctrlr_reset(ctrlr);
+}
+
+int
+spdk_nvme_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc, size;
+ union spdk_nvme_cmbsz_register cmbsz;
+
+ cmbsz = spdk_nvme_ctrlr_get_regs_cmbsz(ctrlr);
+
+ if (cmbsz.bits.rds == 0 || cmbsz.bits.wds == 0) {
+ return -ENOTSUP;
+ }
+
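+ /* CMBSZ.SZ is expressed in units selected by CMBSZ.SZU: 4 KiB << (SZU * 4),
+ * i.e. 4 KiB, 64 KiB, 1 MiB, ...
+ */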
+ size = cmbsz.bits.sz * (0x1000 << (cmbsz.bits.szu * 4));
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ rc = nvme_transport_ctrlr_reserve_cmb(ctrlr);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ if (rc < 0) {
+ return rc;
+ }
+
+ return size;
+}
+
+void *
+spdk_nvme_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size)
+{
+ void *buf;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ buf = nvme_transport_ctrlr_map_cmb(ctrlr, size);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return buf;
+}
+
+void
+spdk_nvme_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ nvme_transport_ctrlr_unmap_cmb(ctrlr);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+bool
+spdk_nvme_ctrlr_is_discovery(struct spdk_nvme_ctrlr *ctrlr)
+{
+ assert(ctrlr);
+
+ return !strncmp(ctrlr->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN,
+ strlen(SPDK_NVMF_DISCOVERY_NQN));
+}
+
+int
+spdk_nvme_ctrlr_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+ uint16_t spsp, uint8_t nssf, void *payload, size_t size)
+{
+ struct nvme_completion_poll_status *status;
+ int res;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ res = spdk_nvme_ctrlr_cmd_security_receive(ctrlr, secp, spsp, nssf, payload, size,
+ nvme_completion_poll_cb, status);
+ if (res) {
+ free(status);
+ return res;
+ }
+ if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_security_receive failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -ENXIO;
+ }
+ free(status);
+
+ return 0;
+}
+
+int
+spdk_nvme_ctrlr_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+ uint16_t spsp, uint8_t nssf, void *payload, size_t size)
+{
+ struct nvme_completion_poll_status *status;
+ int res;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ res = spdk_nvme_ctrlr_cmd_security_send(ctrlr, secp, spsp, nssf, payload, size,
+ nvme_completion_poll_cb,
+ status);
+ if (res) {
+ free(status);
+ return res;
+ }
+ if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_security_send failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -ENXIO;
+ }
+
+ free(status);
+
+ return 0;
+}
+
+uint64_t
+spdk_nvme_ctrlr_get_flags(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->flags;
+}
+
+const struct spdk_nvme_transport_id *
+spdk_nvme_ctrlr_get_transport_id(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return &ctrlr->trid;
+}
+
+/* FIXME need to specify max number of iovs */
+int
+spdk_nvme_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs,
+ uint32_t len, size_t mps,
+ void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len))
+{
+ uint64_t prp1, prp2;
+ void *vva;
+ uint32_t i;
+ uint32_t residue_len, nents;
+ uint64_t *prp_list;
+ int iovcnt;
+
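+ /* Each PRP entry describes a physically contiguous region of at most mps bytes.
+ * PRP1 covers the first (possibly unaligned) region; PRP2 is either a second data
+ * pointer (two pages or less in total) or the address of a PRP list.
+ */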
+ prp1 = cmd->dptr.prp.prp1;
+ prp2 = cmd->dptr.prp.prp2;
+
+ /* PRP1 may start with an unaligned page address */
+ residue_len = mps - (prp1 % mps);
+ residue_len = spdk_min(len, residue_len);
+
+ vva = gpa_to_vva(prv, prp1, residue_len);
+ if (spdk_unlikely(vva == NULL)) {
+ SPDK_ERRLOG("GPA to VVA failed\n");
+ return -1;
+ }
+ iovs[0].iov_base = vva;
+ iovs[0].iov_len = residue_len;
+ len -= residue_len;
+
+ if (len) {
+ if (spdk_unlikely(prp2 == 0)) {
+ SPDK_ERRLOG("no PRP2, %u remaining\n", len);
+ return -1;
+ }
+
+ if (len <= mps) {
+ /* 2 PRP used */
+ iovcnt = 2;
+ vva = gpa_to_vva(prv, prp2, len);
+ if (spdk_unlikely(vva == NULL)) {
+ SPDK_ERRLOG("no VVA for %#lx, len=%#x\n",
+ prp2, len);
+ return -1;
+ }
+ iovs[1].iov_base = vva;
+ iovs[1].iov_len = len;
+ } else {
+ /* PRP list used */
+ nents = (len + mps - 1) / mps;
+ vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list));
+ if (spdk_unlikely(vva == NULL)) {
+ SPDK_ERRLOG("no VVA for %#lx, nents=%#x\n",
+ prp2, nents);
+ return -1;
+ }
+ prp_list = vva;
+ i = 0;
+ while (len != 0) {
+ residue_len = spdk_min(len, mps);
+ vva = gpa_to_vva(prv, prp_list[i], residue_len);
+ if (spdk_unlikely(vva == NULL)) {
+ SPDK_ERRLOG("no VVA for %#lx, residue_len=%#x\n",
+ prp_list[i], residue_len);
+ return -1;
+ }
+ iovs[i + 1].iov_base = vva;
+ iovs[i + 1].iov_len = residue_len;
+ len -= residue_len;
+ i++;
+ }
+ iovcnt = i + 1;
+ }
+ } else {
+ /* 1 PRP used */
+ iovcnt = 1;
+ }
+
+ return iovcnt;
+}
diff --git a/src/spdk/lib/nvme/nvme_ctrlr_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c
new file mode 100644
index 000000000..9b16c8d6f
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c
@@ -0,0 +1,966 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+int
+spdk_nvme_ctrlr_io_cmd_raw_no_payload_build(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_cmd *cmd,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ return -EINVAL;
+ }
+
+ memset(&payload, 0, sizeof(payload));
+ req = nvme_allocate_request(qpair, &payload, 0, 0, cb_fn, cb_arg);
+
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ memcpy(&req->cmd, cmd, sizeof(req->cmd));
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ctrlr_cmd_io_raw(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_cmd *cmd,
+ void *buf, uint32_t len,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+
+ req = nvme_allocate_request_contig(qpair, buf, len, cb_fn, cb_arg);
+
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ memcpy(&req->cmd, cmd, sizeof(req->cmd));
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ctrlr_cmd_io_raw_with_md(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_cmd *cmd,
+ void *buf, uint32_t len, void *md_buf,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+ uint32_t md_len = 0;
+
+ payload = NVME_PAYLOAD_CONTIG(buf, md_buf);
+
+ /* Calculate metadata length */
+ if (md_buf) {
+ struct spdk_nvme_ns *ns = &ctrlr->ns[cmd->nsid - 1];
+
+ assert(ns->sector_size != 0);
+ md_len = len / ns->sector_size * ns->md_size;
+ }
+
+ req = nvme_allocate_request(qpair, &payload, len, md_len, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ memcpy(&req->cmd, cmd, sizeof(req->cmd));
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ctrlr_cmd_admin_raw(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_cmd *cmd,
+ void *buf, uint32_t len,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_contig(ctrlr->adminq, buf, len, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ memcpy(&req->cmd, cmd, sizeof(req->cmd));
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, uint8_t cns, uint16_t cntid, uint32_t nsid,
+ void *payload, size_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, payload_size,
+ cb_fn, cb_arg, false);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_IDENTIFY;
+ cmd->cdw10_bits.identify.cns = cns;
+ cmd->cdw10_bits.identify.cntid = cntid;
+ cmd->nsid = nsid;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+int
+nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, sizeof(struct spdk_nvme_ctrlr_list),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT;
+ cmd->nsid = nsid;
+ cmd->cdw10_bits.ns_attach.sel = SPDK_NVME_NS_CTRLR_ATTACH;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, sizeof(struct spdk_nvme_ctrlr_list),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT;
+ cmd->nsid = nsid;
+ cmd->cdw10_bits.ns_attach.sel = SPDK_NVME_NS_CTRLR_DETACH;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, sizeof(struct spdk_nvme_ns_data),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT;
+ cmd->cdw10_bits.ns_manage.sel = SPDK_NVME_NS_MANAGEMENT_CREATE;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT;
+ cmd->cdw10_bits.ns_manage.sel = SPDK_NVME_NS_MANAGEMENT_DELETE;
+ cmd->nsid = nsid;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, uint64_t prp1, uint64_t prp2,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG;
+ cmd->dptr.prp.prp1 = prp1;
+ cmd->dptr.prp.prp2 = prp2;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, struct spdk_nvme_format *format,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_FORMAT_NVM;
+ cmd->nsid = nsid;
+ memcpy(&cmd->cdw10, format, sizeof(uint32_t));
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_set_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature,
+ uint32_t cdw11, uint32_t cdw12, void *payload, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg,
+ true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SET_FEATURES;
+ cmd->cdw10_bits.set_features.fid = feature;
+ cmd->cdw11 = cdw11;
+ cmd->cdw12 = cdw12;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature,
+ uint32_t cdw11, void *payload, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg,
+ false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_GET_FEATURES;
+ cmd->cdw10_bits.get_features.fid = feature;
+ cmd->cdw11 = cdw11;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_get_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature,
+ uint32_t cdw11, void *payload,
+ uint32_t payload_size, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg, uint32_t ns_id)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg,
+ false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_GET_FEATURES;
+ cmd->cdw10_bits.get_features.fid = feature;
+ cmd->cdw11 = cdw11;
+ cmd->nsid = ns_id;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int spdk_nvme_ctrlr_cmd_set_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature,
+ uint32_t cdw11, uint32_t cdw12, void *payload,
+ uint32_t payload_size, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg, uint32_t ns_id)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg,
+ true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SET_FEATURES;
+ cmd->cdw10_bits.set_features.fid = feature;
+ cmd->cdw11 = cdw11;
+ cmd->cdw12 = cdw12;
+ cmd->nsid = ns_id;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ union spdk_nvme_feat_number_of_queues feat_num_queues;
+
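+ /* NSQR and NCQR are zero-based counts of the requested I/O submission and completion queues. */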
+ feat_num_queues.raw = 0;
+ feat_num_queues.bits.nsqr = num_queues - 1;
+ feat_num_queues.bits.ncqr = num_queues - 1;
+
+ return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, feat_num_queues.raw,
+ 0,
+ NULL, 0, cb_fn, cb_arg);
+}
+
+int
+nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ return spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, 0, NULL, 0,
+ cb_fn, cb_arg);
+}
+
+int
+nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr,
+ union spdk_nvme_feat_async_event_configuration config, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg)
+{
+ uint32_t cdw11;
+
+ cdw11 = config.raw;
+ return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, 0,
+ NULL, 0,
+ cb_fn, cb_arg);
+}
+
+int
+nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ union spdk_nvme_feat_host_identifier feat_host_identifier;
+
+ feat_host_identifier.raw = 0;
+ if (host_id_size == 16) {
+ /* 128-bit extended host identifier */
+ feat_host_identifier.bits.exhid = 1;
+ } else if (host_id_size == 8) {
+ /* 64-bit host identifier */
+ feat_host_identifier.bits.exhid = 0;
+ } else {
+ SPDK_ERRLOG("Invalid host ID size %u\n", host_id_size);
+ return -EINVAL;
+ }
+
+ return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_HOST_IDENTIFIER,
+ feat_host_identifier.raw, 0,
+ host_id, host_id_size, cb_fn, cb_arg);
+}
+
+int
+spdk_nvme_ctrlr_cmd_get_log_page_ext(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page,
+ uint32_t nsid, void *payload, uint32_t payload_size,
+ uint64_t offset, uint32_t cdw10,
+ uint32_t cdw11, uint32_t cdw14,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ uint32_t numd, numdl, numdu;
+ uint32_t lpol, lpou;
+ int rc;
+
+ if (payload_size == 0) {
+ return -EINVAL;
+ }
+
+ if (offset & 3) {
+ return -EINVAL;
+ }
+
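+ /* NUMD is a zero-based dword count split across CDW10 (NUMDL) and CDW11 (NUMDU);
+ * the dword-aligned offset is split into LPOL (CDW12) and LPOU (CDW13).
+ */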
+ numd = payload_size / sizeof(uint32_t) - 1u;
+ numdl = numd & 0xFFFFu;
+ numdu = (numd >> 16) & 0xFFFFu;
+
+ lpol = (uint32_t)offset;
+ lpou = (uint32_t)(offset >> 32);
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ if (offset && !ctrlr->cdata.lpa.edlp) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, payload_size, cb_fn, cb_arg, false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_GET_LOG_PAGE;
+ cmd->nsid = nsid;
+ cmd->cdw10 = cdw10;
+ cmd->cdw10_bits.get_log_page.numdl = numdl;
+ cmd->cdw10_bits.get_log_page.lid = log_page;
+
+ cmd->cdw11 = cdw11;
+ cmd->cdw11_bits.get_log_page.numdu = numdu;
+ cmd->cdw12 = lpol;
+ cmd->cdw13 = lpou;
+ cmd->cdw14 = cdw14;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_get_log_page(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page,
+ uint32_t nsid, void *payload, uint32_t payload_size,
+ uint64_t offset, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ return spdk_nvme_ctrlr_cmd_get_log_page_ext(ctrlr, log_page, nsid, payload,
+ payload_size, offset, 0, 0, 0, cb_fn, cb_arg);
+}
+
+static void
+nvme_ctrlr_retry_queued_abort(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_request *next, *tmp;
+ int rc;
+
+ if (ctrlr->is_resetting || ctrlr->is_destructed) {
+ return;
+ }
+
+ STAILQ_FOREACH_SAFE(next, &ctrlr->queued_aborts, stailq, tmp) {
+ STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq);
+ ctrlr->outstanding_aborts++;
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, next);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to submit queued abort.\n");
+ memset(&next->cpl, 0, sizeof(next->cpl));
+ next->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ next->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ next->cpl.status.dnr = 1;
+ nvme_complete_request(next->cb_fn, next->cb_arg, next->qpair, next, &next->cpl);
+ nvme_free_request(next);
+ } else {
+ /* If the first abort succeeds, stop iterating. */
+ break;
+ }
+ }
+}
+
+static int
+_nvme_ctrlr_submit_abort_request(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_request *req)
+{
+ /* ACL is a zero-based value: allow at most ACL + 1 outstanding aborts and queue the rest. */
+ if (ctrlr->outstanding_aborts >= ctrlr->cdata.acl + 1U) {
+ STAILQ_INSERT_TAIL(&ctrlr->queued_aborts, req, stailq);
+ return 0;
+ } else {
+ ctrlr->outstanding_aborts++;
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+ }
+}
+
+static void
+nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_request *req = ctx;
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ ctrlr = req->qpair->ctrlr;
+
+ ctrlr->outstanding_aborts--;
+ nvme_ctrlr_retry_queued_abort(ctrlr);
+
+ req->user_cb_fn(req->user_cb_arg, cpl);
+}
+
+int
+spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ uint16_t cid, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ int rc;
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ if (qpair == NULL) {
+ qpair = ctrlr->adminq;
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_cmd_abort_cpl, NULL);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
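+ /* Pass the request itself as the completion context so that nvme_ctrlr_cmd_abort_cpl
+ * can update the outstanding abort count before invoking the caller's callback.
+ */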
+ req->cb_arg = req;
+ req->user_cb_fn = cb_fn;
+ req->user_cb_arg = cb_arg;
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_ABORT;
+ cmd->cdw10_bits.abort.sqid = qpair->id;
+ cmd->cdw10_bits.abort.cid = cid;
+
+ rc = _nvme_ctrlr_submit_abort_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+static void
+nvme_complete_abort_request(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_request *req = ctx;
+ struct nvme_request *parent = req->parent;
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ ctrlr = req->qpair->ctrlr;
+
+ ctrlr->outstanding_aborts--;
+ nvme_ctrlr_retry_queued_abort(ctrlr);
+
+ nvme_request_remove_child(parent, req);
+
+ if (!spdk_nvme_cpl_is_abort_success(cpl)) {
+ parent->parent_status.cdw0 |= 1U;
+ }
+
+ if (parent->num_children == 0) {
+ nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair,
+ parent, &parent->parent_status);
+ nvme_free_request(parent);
+ }
+}
+
+static int
+nvme_request_add_abort(struct nvme_request *req, void *arg)
+{
+ struct nvme_request *parent = arg;
+ struct nvme_request *child;
+ void *cmd_cb_arg;
+
+ cmd_cb_arg = parent->user_cb_arg;
+
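+ /* A request matches if its callback context (or that of its parent, for split
+ * requests) equals the context the caller asked to abort.
+ */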
+ if (req->cb_arg != cmd_cb_arg &&
+ (req->parent == NULL || req->parent->cb_arg != cmd_cb_arg)) {
+ return 0;
+ }
+
+ child = nvme_allocate_request_null(parent->qpair->ctrlr->adminq,
+ nvme_complete_abort_request, NULL);
+ if (child == NULL) {
+ return -ENOMEM;
+ }
+
+ child->cb_arg = child;
+
+ child->cmd.opc = SPDK_NVME_OPC_ABORT;
+ /* Copy SQID from the parent. */
+ child->cmd.cdw10_bits.abort.sqid = parent->cmd.cdw10_bits.abort.sqid;
+ child->cmd.cdw10_bits.abort.cid = req->cmd.cid;
+
+ child->parent = parent;
+
+ TAILQ_INSERT_TAIL(&parent->children, child, child_tailq);
+ parent->num_children++;
+
+ return 0;
+}
+
+int
+spdk_nvme_ctrlr_cmd_abort_ext(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ void *cmd_cb_arg,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ int rc = 0;
+ struct nvme_request *parent, *child, *tmp;
+ bool child_failed = false;
+ int aborted = 0;
+
+ if (cmd_cb_arg == NULL) {
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&ctrlr->ctrlr_lock);
+
+ if (qpair == NULL) {
+ qpair = ctrlr->adminq;
+ }
+
+ parent = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (parent == NULL) {
+ pthread_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return -ENOMEM;
+ }
+
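+ /* The parent request is never submitted itself; it only aggregates one child Abort
+ * command per matching outstanding request and is completed once all children finish.
+ */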
+ TAILQ_INIT(&parent->children);
+ parent->num_children = 0;
+
+ parent->cmd.opc = SPDK_NVME_OPC_ABORT;
+ memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl));
+
+ /* Hold SQID that the requests to abort are associated with.
+ * This will be copied to the children.
+ *
+ * CID is not set here because the parent is not submitted directly
+ * and CID is not determined until request to abort is found.
+ */
+ parent->cmd.cdw10_bits.abort.sqid = qpair->id;
+
+ /* This is used to find request to abort. */
+ parent->user_cb_arg = cmd_cb_arg;
+
+ /* Add an abort request for each outstanding request which has cmd_cb_arg
+ * as its callback context.
+ */
+ rc = nvme_transport_qpair_iterate_requests(qpair, nvme_request_add_abort, parent);
+ if (rc != 0) {
+ /* Free abort requests already added. */
+ child_failed = true;
+ }
+
+ TAILQ_FOREACH_SAFE(child, &parent->children, child_tailq, tmp) {
+ if (spdk_likely(!child_failed)) {
+ rc = _nvme_ctrlr_submit_abort_request(ctrlr, child);
+ if (spdk_unlikely(rc != 0)) {
+ child_failed = true;
+ }
+ } else {
+ /* Free remaining abort requests. */
+ nvme_request_remove_child(parent, child);
+ nvme_free_request(child);
+ }
+ }
+
+ if (spdk_likely(!child_failed)) {
+ /* There is no error so far. Either the abort requests were submitted
+ * successfully or there was no outstanding request to abort.
+ *
+ * Next, abort any queued requests that have cmd_cb_arg as their
+ * callback context.
+ */
+ aborted = nvme_qpair_abort_queued_reqs(qpair, cmd_cb_arg);
+ if (parent->num_children == 0) {
+ /* There was no outstanding request to abort. */
+ if (aborted > 0) {
+ /* The queued requests were successfully aborted. Hence
+ * complete the parent request with success synchronously.
+ */
+ nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair,
+ parent, &parent->parent_status);
+ nvme_free_request(parent);
+ } else {
+ /* There was no queued request to abort. */
+ rc = -ENOENT;
+ }
+ }
+ } else {
+ /* Failed to add or submit abort request. */
+ if (parent->num_children != 0) {
+ /* Return success since we must wait for those children
+ * to complete but set the parent request to failure.
+ */
+ parent->parent_status.cdw0 |= 1U;
+ rc = 0;
+ }
+ }
+
+ if (rc != 0) {
+ nvme_free_request(parent);
+ }
+
+ pthread_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr,
+ const struct spdk_nvme_fw_commit *fw_commit,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_FIRMWARE_COMMIT;
+ memcpy(&cmd->cdw10, fw_commit, sizeof(uint32_t));
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+
+}
+
+int
+nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t size, uint32_t offset, void *payload,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, size, cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD;
+ cmd->cdw10 = (size >> 2) - 1;
+ cmd->cdw11 = offset >> 2;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+ uint16_t spsp, uint8_t nssf, void *payload,
+ uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size,
+ cb_fn, cb_arg, false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SECURITY_RECEIVE;
+ cmd->cdw10_bits.sec_send_recv.nssf = nssf;
+ cmd->cdw10_bits.sec_send_recv.spsp0 = (uint8_t)spsp;
+ cmd->cdw10_bits.sec_send_recv.spsp1 = (uint8_t)(spsp >> 8);
+ cmd->cdw10_bits.sec_send_recv.secp = secp;
+ cmd->cdw11 = payload_size;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+ uint16_t spsp, uint8_t nssf, void *payload,
+ uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size,
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SECURITY_SEND;
+ cmd->cdw10_bits.sec_send_recv.nssf = nssf;
+ cmd->cdw10_bits.sec_send_recv.spsp0 = (uint8_t)spsp;
+ cmd->cdw10_bits.sec_send_recv.spsp1 = (uint8_t)(spsp >> 8);
+ cmd->cdw10_bits.sec_send_recv.secp = secp;
+ cmd->cdw11 = payload_size;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_sanitize(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_sanitize *sanitize, uint32_t cdw11,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SANITIZE;
+ cmd->nsid = nsid;
+ cmd->cdw11 = cdw11;
+ memcpy(&cmd->cdw10, sanitize, sizeof(cmd->cdw10));
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
diff --git a/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c
new file mode 100644
index 000000000..2eba219ce
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c
@@ -0,0 +1,88 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/nvme_ocssd.h"
+#include "nvme_internal.h"
+
+bool
+spdk_nvme_ctrlr_is_ocssd_supported(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr->quirks & NVME_QUIRK_OCSSD) {
+ /* TODO: There isn't a standardized way to identify an Open-Channel SSD;
+ * different vendors may use different conditions.
+ */
+
+ /*
+ * The current QEMU Open-Channel device is identified via nsdata->vs[0];
+ * check vs[0] of the first namespace here.
+ */
+ if (ctrlr->cdata.vid == SPDK_PCI_VID_CNEXLABS) {
+ if (ctrlr->num_ns && ctrlr->nsdata[0].vendor_specific[0] == 0x1) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+int
+spdk_nvme_ocssd_ctrlr_cmd_geometry(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ void *payload, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ if (!payload || (payload_size != sizeof(struct spdk_ocssd_geometry_data))) {
+ return -EINVAL;
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, payload_size, cb_fn, cb_arg, false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_OCSSD_OPC_GEOMETRY;
+ cmd->nsid = nsid;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
diff --git a/src/spdk/lib/nvme/nvme_cuse.c b/src/spdk/lib/nvme/nvme_cuse.c
new file mode 100644
index 000000000..9a5ee1f0d
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_cuse.c
@@ -0,0 +1,1115 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define FUSE_USE_VERSION 31
+
+#include <fuse3/cuse_lowlevel.h>
+
+#include <linux/nvme_ioctl.h>
+#include <linux/fs.h>
+
+#include "nvme_internal.h"
+#include "nvme_io_msg.h"
+#include "nvme_cuse.h"
+
+struct cuse_device {
+ bool is_started;
+
+ char dev_name[128];
+ uint32_t index;
+ int claim_fd;
+ char lock_name[64];
+
+ struct spdk_nvme_ctrlr *ctrlr; /**< NVMe controller */
+ uint32_t nsid; /**< NVMe namespace ID, or 0 */
+
+ pthread_t tid;
+ struct fuse_session *session;
+
+ struct cuse_device *ctrlr_device;
+ struct cuse_device *ns_devices; /**< Array of cuse ns devices */
+
+ TAILQ_ENTRY(cuse_device) tailq;
+};
+
+static pthread_mutex_t g_cuse_mtx = PTHREAD_MUTEX_INITIALIZER;
+static TAILQ_HEAD(, cuse_device) g_ctrlr_ctx_head = TAILQ_HEAD_INITIALIZER(g_ctrlr_ctx_head);
+static struct spdk_bit_array *g_ctrlr_started;
+
+struct cuse_io_ctx {
+ struct spdk_nvme_cmd nvme_cmd;
+ enum spdk_nvme_data_transfer data_transfer;
+
+ uint64_t lba;
+ uint32_t lba_count;
+
+ void *data;
+ int data_len;
+
+ fuse_req_t req;
+};
+
+static void
+cuse_io_ctx_free(struct cuse_io_ctx *ctx)
+{
+ spdk_free(ctx->data);
+ free(ctx);
+}
+
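+ /* For unrestricted CUSE ioctls the kernel does not know the output buffer size up
+ * front; when out_bufsz is 0, describe the expected buffer and ask the kernel to
+ * retry the ioctl with it.
+ */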
+#define FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, val) \
+ if (out_bufsz == 0) { \
+ struct iovec out_iov; \
+ out_iov.iov_base = (void *)arg; \
+ out_iov.iov_len = sizeof(val); \
+ fuse_reply_ioctl_retry(req, NULL, 0, &out_iov, 1); \
+ return; \
+ }
+
+static void
+cuse_nvme_admin_cmd_cb(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct cuse_io_ctx *ctx = arg;
+ struct iovec out_iov[2];
+ struct spdk_nvme_cpl _cpl;
+
+ if (ctx->data_transfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+ fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0);
+ } else {
+ memcpy(&_cpl, cpl, sizeof(struct spdk_nvme_cpl));
+
+ out_iov[0].iov_base = &_cpl.cdw0;
+ out_iov[0].iov_len = sizeof(_cpl.cdw0);
+
+ if (ctx->data_len > 0) {
+ out_iov[1].iov_base = ctx->data;
+ out_iov[1].iov_len = ctx->data_len;
+ fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, out_iov, 2);
+ } else {
+ fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, out_iov, 1);
+ }
+ }
+
+ cuse_io_ctx_free(ctx);
+}
+
+static void
+cuse_nvme_admin_cmd_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
+{
+ int rc;
+ struct cuse_io_ctx *ctx = arg;
+
+ rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &ctx->nvme_cmd, ctx->data, ctx->data_len,
+ cuse_nvme_admin_cmd_cb, (void *)ctx);
+ if (rc < 0) {
+ fuse_reply_err(ctx->req, EINVAL);
+ cuse_io_ctx_free(ctx);
+ }
+}
+
+static void
+cuse_nvme_admin_cmd_send(fuse_req_t req, struct nvme_admin_cmd *admin_cmd,
+ const void *data)
+{
+ struct cuse_io_ctx *ctx;
+ struct cuse_device *cuse_device = fuse_req_userdata(req);
+ int rv;
+
+ ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx));
+ if (!ctx) {
+ SPDK_ERRLOG("Cannot allocate memory for cuse_io_ctx\n");
+ fuse_reply_err(req, ENOMEM);
+ return;
+ }
+
+ ctx->req = req;
+ ctx->data_transfer = spdk_nvme_opc_get_data_transfer(admin_cmd->opcode);
+
+ memset(&ctx->nvme_cmd, 0, sizeof(ctx->nvme_cmd));
+ ctx->nvme_cmd.opc = admin_cmd->opcode;
+ ctx->nvme_cmd.nsid = admin_cmd->nsid;
+ ctx->nvme_cmd.cdw10 = admin_cmd->cdw10;
+ ctx->nvme_cmd.cdw11 = admin_cmd->cdw11;
+ ctx->nvme_cmd.cdw12 = admin_cmd->cdw12;
+ ctx->nvme_cmd.cdw13 = admin_cmd->cdw13;
+ ctx->nvme_cmd.cdw14 = admin_cmd->cdw14;
+ ctx->nvme_cmd.cdw15 = admin_cmd->cdw15;
+
+ ctx->data_len = admin_cmd->data_len;
+
+ if (ctx->data_len > 0) {
+ ctx->data = spdk_malloc(ctx->data_len, 0, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!ctx->data) {
+ SPDK_ERRLOG("Cannot allocate memory for data\n");
+ fuse_reply_err(req, ENOMEM);
+ free(ctx);
+ return;
+ }
+ if (data != NULL) {
+ memcpy(ctx->data, data, ctx->data_len);
+ }
+ }
+
+ rv = nvme_io_msg_send(cuse_device->ctrlr, 0, cuse_nvme_admin_cmd_execute, ctx);
+ if (rv) {
+ SPDK_ERRLOG("Cannot send io msg to the controller\n");
+ fuse_reply_err(req, -rv);
+ cuse_io_ctx_free(ctx);
+ return;
+ }
+}
+
+static void
+cuse_nvme_admin_cmd(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ struct nvme_admin_cmd *admin_cmd;
+ struct iovec in_iov[2], out_iov[2];
+
+ in_iov[0].iov_base = (void *)arg;
+ in_iov[0].iov_len = sizeof(*admin_cmd);
+ if (in_bufsz == 0) {
+ fuse_reply_ioctl_retry(req, in_iov, 1, NULL, 0);
+ return;
+ }
+
+ admin_cmd = (struct nvme_admin_cmd *)in_buf;
+
+ switch (spdk_nvme_opc_get_data_transfer(admin_cmd->opcode)) {
+ case SPDK_NVME_DATA_NONE:
+ SPDK_ERRLOG("SPDK_NVME_DATA_NONE not implemented\n");
+ fuse_reply_err(req, EINVAL);
+ return;
+ case SPDK_NVME_DATA_HOST_TO_CONTROLLER:
+ if (admin_cmd->addr != 0) {
+ in_iov[1].iov_base = (void *)admin_cmd->addr;
+ in_iov[1].iov_len = admin_cmd->data_len;
+ if (in_bufsz == sizeof(*admin_cmd)) {
+ fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0);
+ return;
+ }
+ cuse_nvme_admin_cmd_send(req, admin_cmd, in_buf + sizeof(*admin_cmd));
+ } else {
+ cuse_nvme_admin_cmd_send(req, admin_cmd, NULL);
+ }
+ return;
+ case SPDK_NVME_DATA_CONTROLLER_TO_HOST:
+ if (out_bufsz == 0) {
+ out_iov[0].iov_base = &((struct nvme_admin_cmd *)arg)->result;
+ out_iov[0].iov_len = sizeof(uint32_t);
+ if (admin_cmd->data_len > 0) {
+ out_iov[1].iov_base = (void *)admin_cmd->addr;
+ out_iov[1].iov_len = admin_cmd->data_len;
+ fuse_reply_ioctl_retry(req, in_iov, 1, out_iov, 2);
+ } else {
+ fuse_reply_ioctl_retry(req, in_iov, 1, out_iov, 1);
+ }
+ return;
+ }
+
+ cuse_nvme_admin_cmd_send(req, admin_cmd, NULL);
+
+ return;
+ case SPDK_NVME_DATA_BIDIRECTIONAL:
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+}
+
+static void
+cuse_nvme_reset_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
+{
+ int rc;
+ fuse_req_t req = arg;
+
+ rc = spdk_nvme_ctrlr_reset(ctrlr);
+ if (rc) {
+ fuse_reply_err(req, rc);
+ return;
+ }
+
+ fuse_reply_ioctl_iov(req, 0, NULL, 0);
+}
+
+static void
+cuse_nvme_reset(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ int rv;
+ struct cuse_device *cuse_device = fuse_req_userdata(req);
+
+ if (cuse_device->nsid) {
+ SPDK_ERRLOG("Namespace reset not supported\n");
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ rv = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_reset_execute, (void *)req);
+ if (rv) {
+ SPDK_ERRLOG("Cannot send reset\n");
+ fuse_reply_err(req, EINVAL);
+ }
+}
+
+/*****************************************************************************
+ * Namespace IO requests
+ */
+
+static void
+cuse_nvme_submit_io_write_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref;
+
+ fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0);
+
+ cuse_io_ctx_free(ctx);
+}
+
+static void
+cuse_nvme_submit_io_write_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
+{
+ int rc;
+ struct cuse_io_ctx *ctx = arg;
+ struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+
+ rc = spdk_nvme_ns_cmd_write(ns, ctrlr->external_io_msgs_qpair, ctx->data,
+ ctx->lba, /* LBA start */
+ ctx->lba_count, /* number of LBAs */
+ cuse_nvme_submit_io_write_done, ctx, 0);
+
+ if (rc != 0) {
+ SPDK_ERRLOG("write failed: rc = %d\n", rc);
+ fuse_reply_err(ctx->req, rc);
+ cuse_io_ctx_free(ctx);
+ return;
+ }
+}
+
+static void
+cuse_nvme_submit_io_write(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ const struct nvme_user_io *user_io = in_buf;
+ struct cuse_io_ctx *ctx;
+ struct spdk_nvme_ns *ns;
+ uint32_t block_size;
+ int rc;
+ struct cuse_device *cuse_device = fuse_req_userdata(req);
+
+ ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx));
+ if (!ctx) {
+ SPDK_ERRLOG("Cannot allocate memory for context\n");
+ fuse_reply_err(req, ENOMEM);
+ return;
+ }
+
+ ctx->req = req;
+
+ ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
+ block_size = spdk_nvme_ns_get_sector_size(ns);
+
+ ctx->lba = user_io->slba;
+ ctx->lba_count = user_io->nblocks + 1;
+ ctx->data_len = ctx->lba_count * block_size;
+
+ ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (ctx->data == NULL) {
+ SPDK_ERRLOG("Write buffer allocation failed\n");
+ fuse_reply_err(ctx->req, ENOMEM);
+ free(ctx);
+ return;
+ }
+
+ memcpy(ctx->data, in_buf + sizeof(*user_io), ctx->data_len);
+
+ rc = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_submit_io_write_cb,
+ ctx);
+ if (rc < 0) {
+ SPDK_ERRLOG("Cannot send write io\n");
+ fuse_reply_err(ctx->req, rc);
+ cuse_io_ctx_free(ctx);
+ }
+}
+
+static void
+cuse_nvme_submit_io_read_done(void *ref, const struct spdk_nvme_cpl *cpl)
+{
+ struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref;
+ struct iovec iov;
+
+ iov.iov_base = ctx->data;
+ iov.iov_len = ctx->data_len;
+
+ fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, &iov, 1);
+
+ cuse_io_ctx_free(ctx);
+}
+
+static void
+cuse_nvme_submit_io_read_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
+{
+ int rc;
+ struct cuse_io_ctx *ctx = arg;
+ struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+
+ rc = spdk_nvme_ns_cmd_read(ns, ctrlr->external_io_msgs_qpair, ctx->data,
+ ctx->lba, /* LBA start */
+ ctx->lba_count, /* number of LBAs */
+ cuse_nvme_submit_io_read_done, ctx, 0);
+
+ if (rc != 0) {
+ SPDK_ERRLOG("read failed: rc = %d\n", rc);
+ fuse_reply_err(ctx->req, rc);
+ cuse_io_ctx_free(ctx);
+ return;
+ }
+}
+
+static void
+cuse_nvme_submit_io_read(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ int rc;
+ struct cuse_io_ctx *ctx;
+ const struct nvme_user_io *user_io = in_buf;
+ struct cuse_device *cuse_device = fuse_req_userdata(req);
+ struct spdk_nvme_ns *ns;
+ uint32_t block_size;
+
+ ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx));
+ if (!ctx) {
+ SPDK_ERRLOG("Cannot allocate memory for context\n");
+ fuse_reply_err(req, ENOMEM);
+ return;
+ }
+
+ ctx->req = req;
+ ctx->lba = user_io->slba;
+ ctx->lba_count = user_io->nblocks + 1; /* nblocks is a zero-based value */
+
+ ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
+ block_size = spdk_nvme_ns_get_sector_size(ns);
+
+ ctx->data_len = ctx->lba_count * block_size;
+ ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (ctx->data == NULL) {
+ SPDK_ERRLOG("Read buffer allocation failed\n");
+ fuse_reply_err(ctx->req, ENOMEM);
+ free(ctx);
+ return;
+ }
+
+ rc = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_submit_io_read_cb, ctx);
+ if (rc < 0) {
+ SPDK_ERRLOG("Cannot send read io\n");
+ fuse_reply_err(ctx->req, rc);
+ cuse_io_ctx_free(ctx);
+ }
+}
+
+
+static void
+cuse_nvme_submit_io(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ const struct nvme_user_io *user_io;
+ struct iovec in_iov[2], out_iov;
+
+ in_iov[0].iov_base = (void *)arg;
+ in_iov[0].iov_len = sizeof(*user_io);
+ if (in_bufsz == 0) {
+ fuse_reply_ioctl_retry(req, in_iov, 1, NULL, 0);
+ return;
+ }
+
+ user_io = in_buf;
+
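+ /* The retry iovecs below assume 512-byte sectors when describing the data buffer;
+ * the actual transfer length is recomputed from the namespace sector size in the
+ * read/write handlers.
+ */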
+ switch (user_io->opcode) {
+ case SPDK_NVME_OPC_READ:
+ out_iov.iov_base = (void *)user_io->addr;
+ out_iov.iov_len = (user_io->nblocks + 1) * 512;
+ if (out_bufsz == 0) {
+ fuse_reply_ioctl_retry(req, in_iov, 1, &out_iov, 1);
+ return;
+ }
+
+ cuse_nvme_submit_io_read(req, cmd, arg, fi, flags, in_buf,
+ in_bufsz, out_bufsz);
+ break;
+ case SPDK_NVME_OPC_WRITE:
+ in_iov[1].iov_base = (void *)user_io->addr;
+ in_iov[1].iov_len = (user_io->nblocks + 1) * 512;
+ if (in_bufsz == sizeof(*user_io)) {
+ fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0);
+ return;
+ }
+
+ cuse_nvme_submit_io_write(req, cmd, arg, fi, flags, in_buf,
+ in_bufsz, out_bufsz);
+
+ break;
+ default:
+ SPDK_ERRLOG("SUBMIT_IO: opc:%d not valid\n", user_io->opcode);
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+}
+
+/*****************************************************************************
+ * Other namespace IOCTLs
+ */
+static void
+cuse_blkgetsize64(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ uint64_t size;
+ struct spdk_nvme_ns *ns;
+ struct cuse_device *cuse_device = fuse_req_userdata(req);
+
+ FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size);
+
+ ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
+ size = spdk_nvme_ns_get_num_sectors(ns);
+ fuse_reply_ioctl(req, 0, &size, sizeof(size));
+}
+
+static void
+cuse_blkpbszget(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ int pbsz;
+ struct spdk_nvme_ns *ns;
+ struct cuse_device *cuse_device = fuse_req_userdata(req);
+
+ FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, pbsz);
+
+ ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
+ pbsz = spdk_nvme_ns_get_sector_size(ns);
+ fuse_reply_ioctl(req, 0, &pbsz, sizeof(pbsz));
+}
+
+static void
+cuse_blkgetsize(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ long size;
+ struct spdk_nvme_ns *ns;
+ struct cuse_device *cuse_device = fuse_req_userdata(req);
+
+ FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size);
+
+ ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid);
+
+ /* BLKGETSIZE returns the device size as a number of 512-byte blocks */
+ size = spdk_nvme_ns_get_num_sectors(ns) * spdk_nvme_ns_get_sector_size(ns) / 512;
+ fuse_reply_ioctl(req, 0, &size, sizeof(size));
+}
+
+static void
+cuse_getid(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ struct cuse_device *cuse_device = fuse_req_userdata(req);
+
+ fuse_reply_ioctl(req, cuse_device->nsid, NULL, 0);
+}
+
+static void
+cuse_ctrlr_ioctl(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ if (flags & FUSE_IOCTL_COMPAT) {
+ fuse_reply_err(req, ENOSYS);
+ return;
+ }
+
+ switch (cmd) {
+ case NVME_IOCTL_ADMIN_CMD:
+ cuse_nvme_admin_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
+ break;
+
+ case NVME_IOCTL_RESET:
+ cuse_nvme_reset(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
+ break;
+
+ default:
+ SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd);
+ fuse_reply_err(req, EINVAL);
+ }
+}
+
+static void
+cuse_ns_ioctl(fuse_req_t req, int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags,
+ const void *in_buf, size_t in_bufsz, size_t out_bufsz)
+{
+ if (flags & FUSE_IOCTL_COMPAT) {
+ fuse_reply_err(req, ENOSYS);
+ return;
+ }
+
+ switch (cmd) {
+ case NVME_IOCTL_ADMIN_CMD:
+ cuse_nvme_admin_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
+ break;
+
+ case NVME_IOCTL_SUBMIT_IO:
+ cuse_nvme_submit_io(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
+ break;
+
+ case NVME_IOCTL_ID:
+ cuse_getid(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
+ break;
+
+ case BLKPBSZGET:
+ cuse_blkpbszget(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
+ break;
+
+ case BLKGETSIZE:
+ /* Returns the device size as a number of 512-byte blocks (returns pointer to long) */
+ cuse_blkgetsize(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
+ break;
+
+ case BLKGETSIZE64:
+ /* Returns the device size in sectors (returns pointer to uint64_t) */
+ cuse_blkgetsize64(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz);
+ break;
+
+ default:
+ SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd);
+ fuse_reply_err(req, EINVAL);
+ }
+}
+
+/*****************************************************************************
+ * CUSE threads initialization.
+ */
+
+static void cuse_open(fuse_req_t req, struct fuse_file_info *fi)
+{
+ fuse_reply_open(req, fi);
+}
+
+static const struct cuse_lowlevel_ops cuse_ctrlr_clop = {
+ .open = cuse_open,
+ .ioctl = cuse_ctrlr_ioctl,
+};
+
+static const struct cuse_lowlevel_ops cuse_ns_clop = {
+ .open = cuse_open,
+ .ioctl = cuse_ns_ioctl,
+};
+
+static void *
+cuse_thread(void *arg)
+{
+ struct cuse_device *cuse_device = arg;
+ char *cuse_argv[] = { "cuse", "-f" };
+ int cuse_argc = SPDK_COUNTOF(cuse_argv);
+ char devname_arg[128 + 8];
+ const char *dev_info_argv[] = { devname_arg };
+ struct cuse_info ci;
+ int multithreaded;
+ int rc;
+ struct fuse_buf buf = { .mem = NULL };
+ struct pollfd fds;
+ int timeout_msecs = 500;
+
+ spdk_unaffinitize_thread();
+
+ snprintf(devname_arg, sizeof(devname_arg), "DEVNAME=%s", cuse_device->dev_name);
+
+ memset(&ci, 0, sizeof(ci));
+ ci.dev_info_argc = 1;
+ ci.dev_info_argv = dev_info_argv;
+ ci.flags = CUSE_UNRESTRICTED_IOCTL;
+
+ if (cuse_device->nsid) {
+ cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ns_clop,
+ &multithreaded, cuse_device);
+ } else {
+ cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ctrlr_clop,
+ &multithreaded, cuse_device);
+ }
+ if (!cuse_device->session) {
+ SPDK_ERRLOG("Cannot create cuse session\n");
+ goto err;
+ }
+
+ SPDK_NOTICELOG("fuse session for device %s created\n", cuse_device->dev_name);
+
+ /* Receive and process fuse requests */
+ fds.fd = fuse_session_fd(cuse_device->session);
+ fds.events = POLLIN;
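+ /* Poll with a timeout so the loop can notice fuse_session_exit() being called from
+ * another thread during namespace/controller stop.
+ */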
+ while (!fuse_session_exited(cuse_device->session)) {
+ rc = poll(&fds, 1, timeout_msecs);
+ if (rc <= 0) {
+ continue;
+ }
+ rc = fuse_session_receive_buf(cuse_device->session, &buf);
+ if (rc > 0) {
+ fuse_session_process_buf(cuse_device->session, &buf);
+ }
+ }
+ free(buf.mem);
+ fuse_session_reset(cuse_device->session);
+ cuse_lowlevel_teardown(cuse_device->session);
+err:
+ pthread_exit(NULL);
+}
+
+/*****************************************************************************
+ * CUSE devices management
+ */
+
+static int
+cuse_nvme_ns_start(struct cuse_device *ctrlr_device, uint32_t nsid)
+{
+ struct cuse_device *ns_device;
+ int rv;
+
+ ns_device = &ctrlr_device->ns_devices[nsid - 1];
+ if (ns_device->is_started) {
+ return 0;
+ }
+
+ ns_device->ctrlr = ctrlr_device->ctrlr;
+ ns_device->ctrlr_device = ctrlr_device;
+ ns_device->nsid = nsid;
+ rv = snprintf(ns_device->dev_name, sizeof(ns_device->dev_name), "%sn%d",
+ ctrlr_device->dev_name, ns_device->nsid);
+ if (rv < 0) {
+ SPDK_ERRLOG("Device name too long.\n");
+ /* ns_device points into ctrlr_device->ns_devices[], so there is nothing to free here. */
+ return -ENAMETOOLONG;
+ }
+
+ rv = pthread_create(&ns_device->tid, NULL, cuse_thread, ns_device);
+ if (rv != 0) {
+ SPDK_ERRLOG("pthread_create failed\n");
+ return -rv;
+ }
+
+ ns_device->is_started = true;
+
+ return 0;
+}
+
+static void
+cuse_nvme_ns_stop(struct cuse_device *ctrlr_device, uint32_t nsid)
+{
+ struct cuse_device *ns_device;
+
+ ns_device = &ctrlr_device->ns_devices[nsid - 1];
+ if (!ns_device->is_started) {
+ return;
+ }
+
+ fuse_session_exit(ns_device->session);
+ pthread_join(ns_device->tid, NULL);
+ ns_device->is_started = false;
+}
+
+static int
+nvme_cuse_claim(struct cuse_device *ctrlr_device, uint32_t index)
+{
+ int dev_fd;
+ int pid;
+ void *dev_map;
+ struct flock cusedev_lock = {
+ .l_type = F_WRLCK,
+ .l_whence = SEEK_SET,
+ .l_start = 0,
+ .l_len = 0,
+ };
+
+ snprintf(ctrlr_device->lock_name, sizeof(ctrlr_device->lock_name),
+ "/tmp/spdk_nvme_cuse_lock_%" PRIu32, index);
+
+ dev_fd = open(ctrlr_device->lock_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+ if (dev_fd == -1) {
+ SPDK_ERRLOG("could not open %s\n", ctrlr_device->lock_name);
+ return -errno;
+ }
+
+ if (ftruncate(dev_fd, sizeof(int)) != 0) {
+ SPDK_ERRLOG("could not truncate %s\n", ctrlr_device->lock_name);
+ close(dev_fd);
+ return -errno;
+ }
+
+ dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
+ MAP_SHARED, dev_fd, 0);
+ if (dev_map == MAP_FAILED) {
+ SPDK_ERRLOG("could not mmap dev %s (%d)\n", ctrlr_device->lock_name, errno);
+ close(dev_fd);
+ return -errno;
+ }
+
+ if (fcntl(dev_fd, F_SETLK, &cusedev_lock) != 0) {
+ pid = *(int *)dev_map;
+ SPDK_ERRLOG("Cannot create lock on device %s, probably"
+ " process %d has claimed it\n", ctrlr_device->lock_name, pid);
+ munmap(dev_map, sizeof(int));
+ close(dev_fd);
+ /* F_SETLK returns unspecified errnos, normalize them */
+ return -EACCES;
+ }
+
+ *(int *)dev_map = (int)getpid();
+ munmap(dev_map, sizeof(int));
+ ctrlr_device->claim_fd = dev_fd;
+ ctrlr_device->index = index;
+ /* Keep dev_fd open to maintain the lock. */
+ return 0;
+}
+
+static void
+nvme_cuse_unclaim(struct cuse_device *ctrlr_device)
+{
+ close(ctrlr_device->claim_fd);
+ ctrlr_device->claim_fd = -1;
+ unlink(ctrlr_device->lock_name);
+}
+
+static void
+cuse_nvme_ctrlr_stop(struct cuse_device *ctrlr_device)
+{
+ uint32_t i;
+ uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr_device->ctrlr);
+
+ for (i = 1; i <= num_ns; i++) {
+ cuse_nvme_ns_stop(ctrlr_device, i);
+ }
+
+ fuse_session_exit(ctrlr_device->session);
+ pthread_join(ctrlr_device->tid, NULL);
+ TAILQ_REMOVE(&g_ctrlr_ctx_head, ctrlr_device, tailq);
+ spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index);
+ if (spdk_bit_array_count_set(g_ctrlr_started) == 0) {
+ spdk_bit_array_free(&g_ctrlr_started);
+ }
+ nvme_cuse_unclaim(ctrlr_device);
+ free(ctrlr_device->ns_devices);
+ free(ctrlr_device);
+}
+
+static int
+cuse_nvme_ctrlr_update_namespaces(struct cuse_device *ctrlr_device)
+{
+ uint32_t nsid;
+ uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr_device->ctrlr);
+
+ for (nsid = 1; nsid <= num_ns; nsid++) {
+ if (!spdk_nvme_ctrlr_is_active_ns(ctrlr_device->ctrlr, nsid)) {
+ cuse_nvme_ns_stop(ctrlr_device, nsid);
+ continue;
+ }
+
+ if (cuse_nvme_ns_start(ctrlr_device, nsid) < 0) {
+ SPDK_ERRLOG("Cannot start CUSE namespace device.");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+nvme_cuse_start(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rv = 0;
+ struct cuse_device *ctrlr_device;
+ uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
+
+ SPDK_NOTICELOG("Creating cuse device for controller\n");
+
+ if (g_ctrlr_started == NULL) {
+ g_ctrlr_started = spdk_bit_array_create(128);
+ if (g_ctrlr_started == NULL) {
+ SPDK_ERRLOG("Cannot create bit array\n");
+ return -ENOMEM;
+ }
+ }
+
+ ctrlr_device = (struct cuse_device *)calloc(1, sizeof(struct cuse_device));
+ if (!ctrlr_device) {
+ SPDK_ERRLOG("Cannot allocate memory for ctrlr_device.");
+ rv = -ENOMEM;
+ goto err2;
+ }
+
+ ctrlr_device->ctrlr = ctrlr;
+
+ /* Check if device already exists, if not increment index until success */
+ ctrlr_device->index = 0;
+ while (1) {
+ ctrlr_device->index = spdk_bit_array_find_first_clear(g_ctrlr_started, ctrlr_device->index);
+ if (ctrlr_device->index == UINT32_MAX) {
+ SPDK_ERRLOG("Too many registered controllers\n");
+ rv = -ENOSPC;
+ goto err2;
+ }
+
+ if (nvme_cuse_claim(ctrlr_device, ctrlr_device->index) == 0) {
+ break;
+ }
+ ctrlr_device->index++;
+ }
+ spdk_bit_array_set(g_ctrlr_started, ctrlr_device->index);
+ snprintf(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name), "spdk/nvme%d",
+ ctrlr_device->index);
+
+ rv = pthread_create(&ctrlr_device->tid, NULL, cuse_thread, ctrlr_device);
+ if (rv != 0) {
+ SPDK_ERRLOG("pthread_create failed\n");
+ rv = -rv;
+ goto err3;
+ }
+ TAILQ_INSERT_TAIL(&g_ctrlr_ctx_head, ctrlr_device, tailq);
+
+ ctrlr_device->ns_devices = (struct cuse_device *)calloc(num_ns, sizeof(struct cuse_device));
+ /* Start all active namespaces */
+ if (cuse_nvme_ctrlr_update_namespaces(ctrlr_device) < 0) {
+ SPDK_ERRLOG("Cannot start CUSE namespace devices.");
+ cuse_nvme_ctrlr_stop(ctrlr_device);
+ /* cuse_nvme_ctrlr_stop() already released ctrlr_device and cleared its index bit,
+ * so do not fall through to the err3/err2 cleanup below. */
+ return -1;
+ }
+
+ return 0;
+
+err3:
+ spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index);
+err2:
+ free(ctrlr_device);
+ if (spdk_bit_array_count_set(g_ctrlr_started) == 0) {
+ spdk_bit_array_free(&g_ctrlr_started);
+ }
+ return rv;
+}
+
+static struct cuse_device *
+nvme_cuse_get_cuse_ctrlr_device(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct cuse_device *ctrlr_device = NULL;
+
+ TAILQ_FOREACH(ctrlr_device, &g_ctrlr_ctx_head, tailq) {
+ if (ctrlr_device->ctrlr == ctrlr) {
+ break;
+ }
+ }
+
+ return ctrlr_device;
+}
+
+static struct cuse_device *
+nvme_cuse_get_cuse_ns_device(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ struct cuse_device *ctrlr_device = NULL;
+ uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr);
+
+ if (nsid < 1 || nsid > num_ns) {
+ return NULL;
+ }
+
+ ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
+ if (!ctrlr_device) {
+ return NULL;
+ }
+
+ if (!ctrlr_device->ns_devices[nsid - 1].is_started) {
+ return NULL;
+ }
+
+ return &ctrlr_device->ns_devices[nsid - 1];
+}
+
+static void
+nvme_cuse_stop(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct cuse_device *ctrlr_device;
+
+ pthread_mutex_lock(&g_cuse_mtx);
+
+ ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
+ if (!ctrlr_device) {
+ SPDK_ERRLOG("Cannot find associated CUSE device\n");
+ pthread_mutex_unlock(&g_cuse_mtx);
+ return;
+ }
+
+ cuse_nvme_ctrlr_stop(ctrlr_device);
+
+ pthread_mutex_unlock(&g_cuse_mtx);
+}
+
+static void
+nvme_cuse_update(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct cuse_device *ctrlr_device;
+
+ pthread_mutex_lock(&g_cuse_mtx);
+
+ ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
+ if (!ctrlr_device) {
+ pthread_mutex_unlock(&g_cuse_mtx);
+ return;
+ }
+
+ cuse_nvme_ctrlr_update_namespaces(ctrlr_device);
+
+ pthread_mutex_unlock(&g_cuse_mtx);
+}
+
+static struct nvme_io_msg_producer cuse_nvme_io_msg_producer = {
+ .name = "cuse",
+ .stop = nvme_cuse_stop,
+ .update = nvme_cuse_update,
+};
+
+int
+spdk_nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ rc = nvme_io_msg_ctrlr_register(ctrlr, &cuse_nvme_io_msg_producer);
+ if (rc) {
+ return rc;
+ }
+
+ pthread_mutex_lock(&g_cuse_mtx);
+
+ rc = nvme_cuse_start(ctrlr);
+ if (rc) {
+ nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer);
+ }
+
+ pthread_mutex_unlock(&g_cuse_mtx);
+
+ return rc;
+}
+
+int
+spdk_nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct cuse_device *ctrlr_device;
+
+ pthread_mutex_lock(&g_cuse_mtx);
+
+ ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
+ if (!ctrlr_device) {
+ SPDK_ERRLOG("Cannot find associated CUSE device\n");
+ pthread_mutex_unlock(&g_cuse_mtx);
+ return -ENODEV;
+ }
+
+ cuse_nvme_ctrlr_stop(ctrlr_device);
+
+ pthread_mutex_unlock(&g_cuse_mtx);
+
+ nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer);
+
+ return 0;
+}
+
+void
+spdk_nvme_cuse_update_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ nvme_cuse_update(ctrlr);
+}
+
+int
+spdk_nvme_cuse_get_ctrlr_name(struct spdk_nvme_ctrlr *ctrlr, char *name, size_t *size)
+{
+ struct cuse_device *ctrlr_device;
+ size_t req_len;
+
+ pthread_mutex_lock(&g_cuse_mtx);
+
+ ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr);
+ if (!ctrlr_device) {
+ pthread_mutex_unlock(&g_cuse_mtx);
+ return -ENODEV;
+ }
+
+ req_len = strnlen(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name));
+ if (*size < req_len) {
+ *size = req_len;
+ pthread_mutex_unlock(&g_cuse_mtx);
+ return -ENOSPC;
+ }
+ snprintf(name, req_len + 1, "%s", ctrlr_device->dev_name);
+
+ pthread_mutex_unlock(&g_cuse_mtx);
+
+ return 0;
+}
+
+int
+spdk_nvme_cuse_get_ns_name(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, char *name, size_t *size)
+{
+ struct cuse_device *ns_device;
+ size_t req_len;
+
+ pthread_mutex_lock(&g_cuse_mtx);
+
+ ns_device = nvme_cuse_get_cuse_ns_device(ctrlr, nsid);
+ if (!ns_device) {
+ pthread_mutex_unlock(&g_cuse_mtx);
+ return -ENODEV;
+ }
+
+ req_len = strnlen(ns_device->dev_name, sizeof(ns_device->dev_name));
+ if (*size < req_len) {
+ *size = req_len;
+ pthread_mutex_unlock(&g_cuse_mtx);
+ return -ENOSPC;
+ }
+ snprintf(name, req_len + 1, "%s", ns_device->dev_name);
+
+ pthread_mutex_unlock(&g_cuse_mtx);
+
+ return 0;
+}
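+
+/*
+ * Editor's note: illustrative usage sketch, not part of the original patch.
+ * It demonstrates the two-call pattern implied by the API above: when the
+ * caller's buffer is too small the function returns -ENOSPC and reports the
+ * required length through *size. The controller is assumed to have been
+ * registered with spdk_nvme_cuse_register() already; the buffer size and the
+ * /dev/<name> path convention are assumptions of this sketch.
+ */
+static inline void
+nvme_cuse_example_print_ctrlr_name(struct spdk_nvme_ctrlr *ctrlr)
+{
+ char name[128];
+ size_t size = sizeof(name);
+ int rc;
+
+ rc = spdk_nvme_cuse_get_ctrlr_name(ctrlr, name, &size);
+ if (rc == 0) {
+  SPDK_NOTICELOG("CUSE device node: /dev/%s\n", name);
+ } else if (rc == -ENOSPC) {
+  SPDK_NOTICELOG("Controller name needs a buffer of %zu bytes\n", size);
+ } else {
+  SPDK_NOTICELOG("No CUSE device registered for this controller\n");
+ }
+}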
diff --git a/src/spdk/lib/nvme/nvme_cuse.h b/src/spdk/lib/nvme/nvme_cuse.h
new file mode 100644
index 000000000..92b475190
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_cuse.h
@@ -0,0 +1,42 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NVME_CUSE_H__
+#define __NVME_CUSE_H__
+
+#include "spdk/nvme.h"
+
+int nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr, const char *dev_path);
+void nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr);
+
+#endif /* __NVME_CUSE_H__ */
diff --git a/src/spdk/lib/nvme/nvme_fabric.c b/src/spdk/lib/nvme/nvme_fabric.c
new file mode 100644
index 000000000..9fff20873
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_fabric.c
@@ -0,0 +1,475 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe over Fabrics transport-independent functions
+ */
+
+#include "nvme_internal.h"
+
+#include "spdk/endian.h"
+#include "spdk/string.h"
+
+static int
+nvme_fabric_prop_set_cmd(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t offset, uint8_t size, uint64_t value)
+{
+ struct spdk_nvmf_fabric_prop_set_cmd cmd = {};
+ struct nvme_completion_poll_status *status;
+ int rc;
+
+ assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8);
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ cmd.opcode = SPDK_NVME_OPC_FABRIC;
+ cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
+ cmd.ofst = offset;
+ cmd.attrib.size = size;
+ cmd.value.u64 = value;
+
+ rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd,
+ NULL, 0,
+ nvme_completion_poll_cb, status);
+ if (rc < 0) {
+ free(status);
+ return rc;
+ }
+
+ if (nvme_wait_for_completion(ctrlr->adminq, status)) {
+ if (!status->timed_out) {
+ free(status);
+ }
+ SPDK_ERRLOG("Property Set failed\n");
+ return -1;
+ }
+ free(status);
+
+ return 0;
+}
+
+static int
+nvme_fabric_prop_get_cmd(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t offset, uint8_t size, uint64_t *value)
+{
+ struct spdk_nvmf_fabric_prop_set_cmd cmd = {};
+ struct nvme_completion_poll_status *status;
+ struct spdk_nvmf_fabric_prop_get_rsp *response;
+ int rc;
+
+ assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8);
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ cmd.opcode = SPDK_NVME_OPC_FABRIC;
+ cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
+ cmd.ofst = offset;
+ cmd.attrib.size = size;
+
+ rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd,
+ NULL, 0, nvme_completion_poll_cb,
+ status);
+ if (rc < 0) {
+ free(status);
+ return rc;
+ }
+
+ if (nvme_wait_for_completion(ctrlr->adminq, status)) {
+ if (!status->timed_out) {
+ free(status);
+ }
+ SPDK_ERRLOG("Property Get failed\n");
+ return -1;
+ }
+
+ response = (struct spdk_nvmf_fabric_prop_get_rsp *)&status->cpl;
+
+ if (size == SPDK_NVMF_PROP_SIZE_4) {
+ *value = response->value.u32.low;
+ } else {
+ *value = response->value.u64;
+ }
+
+ free(status);
+
+ return 0;
+}
+
+int
+nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
+{
+ return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, value);
+}
+
+int
+nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
+{
+ return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value);
+}
+
+int
+nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
+{
+ uint64_t tmp_value;
+ int rc;
+ rc = nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, &tmp_value);
+
+ if (!rc) {
+ *value = (uint32_t)tmp_value;
+ }
+ return rc;
+}
+
+int
+nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
+{
+ return nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value);
+}
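+
+/*
+ * Editor's note: illustrative sketch, not part of the original patch. It shows
+ * how the wrappers above are typically used: a controller register is addressed
+ * by its byte offset within struct spdk_nvme_registers, the same offsetof()
+ * pattern applied to cc.raw further down in this file. CSTS is only an example;
+ * any 4-byte property can be read the same way.
+ */
+static inline int
+nvme_fabric_example_read_csts(struct spdk_nvme_ctrlr *ctrlr,
+  union spdk_nvme_csts_register *csts)
+{
+ return nvme_fabric_ctrlr_get_reg_4(ctrlr,
+   offsetof(struct spdk_nvme_registers, csts.raw),
+   &csts->raw);
+}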
+
+static void
+nvme_fabric_discover_probe(struct spdk_nvmf_discovery_log_page_entry *entry,
+ struct spdk_nvme_probe_ctx *probe_ctx,
+ int discover_priority)
+{
+ struct spdk_nvme_transport_id trid;
+ uint8_t *end;
+ size_t len;
+
+ memset(&trid, 0, sizeof(trid));
+
+ if (entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
+ SPDK_WARNLOG("Skipping unsupported discovery service referral\n");
+ return;
+ } else if (entry->subtype != SPDK_NVMF_SUBTYPE_NVME) {
+ SPDK_WARNLOG("Skipping unknown subtype %u\n", entry->subtype);
+ return;
+ }
+
+ trid.trtype = entry->trtype;
+ spdk_nvme_transport_id_populate_trstring(&trid, spdk_nvme_transport_id_trtype_str(entry->trtype));
+ if (!spdk_nvme_transport_available_by_name(trid.trstring)) {
+ SPDK_WARNLOG("NVMe transport type %u not available; skipping probe\n",
+ trid.trtype);
+ return;
+ }
+
+ snprintf(trid.trstring, sizeof(trid.trstring), "%s", probe_ctx->trid.trstring);
+ trid.adrfam = entry->adrfam;
+
+ /* Ensure that subnqn is null terminated. */
+ end = memchr(entry->subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1);
+ if (!end) {
+ SPDK_ERRLOG("Discovery entry SUBNQN is not null terminated\n");
+ return;
+ }
+ len = end - entry->subnqn;
+ memcpy(trid.subnqn, entry->subnqn, len);
+ trid.subnqn[len] = '\0';
+
+ /* Convert traddr to a null terminated string. */
+ len = spdk_strlen_pad(entry->traddr, sizeof(entry->traddr), ' ');
+ memcpy(trid.traddr, entry->traddr, len);
+ if (spdk_str_chomp(trid.traddr) != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRADDR\n");
+ }
+
+ /* Convert trsvcid to a null terminated string. */
+ len = spdk_strlen_pad(entry->trsvcid, sizeof(entry->trsvcid), ' ');
+ memcpy(trid.trsvcid, entry->trsvcid, len);
+ if (spdk_str_chomp(trid.trsvcid) != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRSVCID\n");
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "subnqn=%s, trtype=%u, traddr=%s, trsvcid=%s\n",
+ trid.subnqn, trid.trtype,
+ trid.traddr, trid.trsvcid);
+
+ /* Copy the priority from the discovery ctrlr */
+ trid.priority = discover_priority;
+
+ nvme_ctrlr_probe(&trid, probe_ctx, NULL);
+}
+
+static int
+nvme_fabric_get_discovery_log_page(struct spdk_nvme_ctrlr *ctrlr,
+ void *log_page, uint32_t size, uint64_t offset)
+{
+ struct nvme_completion_poll_status *status;
+ int rc;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_DISCOVERY, 0, log_page, size, offset,
+ nvme_completion_poll_cb, status);
+ if (rc < 0) {
+ free(status);
+ return -1;
+ }
+
+ if (nvme_wait_for_completion(ctrlr->adminq, status)) {
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -1;
+ }
+ free(status);
+
+ return 0;
+}
+
+int
+nvme_fabric_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
+ bool direct_connect)
+{
+ struct spdk_nvme_ctrlr_opts discovery_opts;
+ struct spdk_nvme_ctrlr *discovery_ctrlr;
+ union spdk_nvme_cc_register cc;
+ int rc;
+ struct nvme_completion_poll_status *status;
+
+ if (strcmp(probe_ctx->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN) != 0) {
+ /* Not a discovery controller - try to connect to it directly */
+ rc = nvme_ctrlr_probe(&probe_ctx->trid, probe_ctx, NULL);
+ return rc;
+ }
+
+ spdk_nvme_ctrlr_get_default_ctrlr_opts(&discovery_opts, sizeof(discovery_opts));
+ /* For discovery_ctrlr set the timeout to 0 */
+ discovery_opts.keep_alive_timeout_ms = 0;
+
+ discovery_ctrlr = nvme_transport_ctrlr_construct(&probe_ctx->trid, &discovery_opts, NULL);
+ if (discovery_ctrlr == NULL) {
+ return -1;
+ }
+ nvme_qpair_set_state(discovery_ctrlr->adminq, NVME_QPAIR_ENABLED);
+
+ /* TODO: this should be using the normal NVMe controller initialization process +1 */
+ cc.raw = 0;
+ cc.bits.en = 1;
+ cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
+ cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */
+ rc = nvme_transport_ctrlr_set_reg_4(discovery_ctrlr, offsetof(struct spdk_nvme_registers, cc.raw),
+ cc.raw);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to set cc\n");
+ nvme_ctrlr_destruct(discovery_ctrlr);
+ return -1;
+ }
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ nvme_ctrlr_destruct(discovery_ctrlr);
+ return -ENOMEM;
+ }
+
+ /* get the cdata info */
+ rc = nvme_ctrlr_cmd_identify(discovery_ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0,
+ &discovery_ctrlr->cdata, sizeof(discovery_ctrlr->cdata),
+ nvme_completion_poll_cb, status);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to identify cdata\n");
+ nvme_ctrlr_destruct(discovery_ctrlr);
+ free(status);
+ return rc;
+ }
+
+ if (nvme_wait_for_completion(discovery_ctrlr->adminq, status)) {
+ SPDK_ERRLOG("nvme_identify_controller failed!\n");
+ nvme_ctrlr_destruct(discovery_ctrlr);
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -ENXIO;
+ }
+
+ free(status);
+
+ /* Direct attach through spdk_nvme_connect() API */
+ if (direct_connect == true) {
+ /* Set the ready state to skip the normal init process */
+ discovery_ctrlr->state = NVME_CTRLR_STATE_READY;
+ nvme_ctrlr_connected(probe_ctx, discovery_ctrlr);
+ nvme_ctrlr_add_process(discovery_ctrlr, 0);
+ return 0;
+ }
+
+ rc = nvme_fabric_ctrlr_discover(discovery_ctrlr, probe_ctx);
+ nvme_ctrlr_destruct(discovery_ctrlr);
+ return rc;
+}
+
+int
+nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_probe_ctx *probe_ctx)
+{
+ struct spdk_nvmf_discovery_log_page *log_page;
+ struct spdk_nvmf_discovery_log_page_entry *log_page_entry;
+ char buffer[4096];
+ int rc;
+ uint64_t i, numrec, buffer_max_entries_first, buffer_max_entries, log_page_offset = 0;
+ uint64_t remaining_num_rec = 0;
+ uint16_t recfmt;
+
+ memset(buffer, 0x0, 4096);
+ buffer_max_entries_first = (sizeof(buffer) - offsetof(struct spdk_nvmf_discovery_log_page,
+ entries[0])) /
+ sizeof(struct spdk_nvmf_discovery_log_page_entry);
+ buffer_max_entries = sizeof(buffer) / sizeof(struct spdk_nvmf_discovery_log_page_entry);
+ do {
+ rc = nvme_fabric_get_discovery_log_page(ctrlr, buffer, sizeof(buffer), log_page_offset);
+ if (rc < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Get Log Page - Discovery error\n");
+ return rc;
+ }
+
+ if (!remaining_num_rec) {
+ log_page = (struct spdk_nvmf_discovery_log_page *)buffer;
+ recfmt = from_le16(&log_page->recfmt);
+ if (recfmt != 0) {
+ SPDK_ERRLOG("Unrecognized discovery log record format %" PRIu16 "\n", recfmt);
+ return -EPROTO;
+ }
+ remaining_num_rec = log_page->numrec;
+ log_page_offset = offsetof(struct spdk_nvmf_discovery_log_page, entries[0]);
+ log_page_entry = &log_page->entries[0];
+ numrec = spdk_min(remaining_num_rec, buffer_max_entries_first);
+ } else {
+ numrec = spdk_min(remaining_num_rec, buffer_max_entries);
+ log_page_entry = (struct spdk_nvmf_discovery_log_page_entry *)buffer;
+ }
+
+ for (i = 0; i < numrec; i++) {
+ nvme_fabric_discover_probe(log_page_entry++, probe_ctx, ctrlr->trid.priority);
+ }
+ remaining_num_rec -= numrec;
+ log_page_offset += numrec * sizeof(struct spdk_nvmf_discovery_log_page_entry);
+ } while (remaining_num_rec != 0);
+
+ return 0;
+}
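+
+/*
+ * Editor's note (worked numbers, assuming the NVMe-oF discovery log layout in
+ * which the header occupies the first 1024 bytes and each entry is 1024 bytes):
+ * with the 4096-byte buffer above, buffer_max_entries_first = (4096 - 1024) / 1024 = 3
+ * and buffer_max_entries = 4096 / 1024 = 4. The first Get Log Page read therefore
+ * returns the header plus up to 3 entries, each later read returns up to 4 entries,
+ * and log_page_offset advances by numrec * 1024 until remaining_num_rec reaches 0.
+ */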
+
+int
+nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries)
+{
+ struct nvme_completion_poll_status *status;
+ struct spdk_nvmf_fabric_connect_rsp *rsp;
+ struct spdk_nvmf_fabric_connect_cmd cmd;
+ struct spdk_nvmf_fabric_connect_data *nvmf_data;
+ struct spdk_nvme_ctrlr *ctrlr;
+ int rc;
+
+ if (num_entries == 0 || num_entries > SPDK_NVME_IO_QUEUE_MAX_ENTRIES) {
+ return -EINVAL;
+ }
+
+ ctrlr = qpair->ctrlr;
+ if (!ctrlr) {
+ return -EINVAL;
+ }
+
+ nvmf_data = spdk_zmalloc(sizeof(*nvmf_data), 0, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!nvmf_data) {
+ SPDK_ERRLOG("nvmf_data allocation error\n");
+ return -ENOMEM;
+ }
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ spdk_free(nvmf_data);
+ return -ENOMEM;
+ }
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = SPDK_NVME_OPC_FABRIC;
+ cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
+ cmd.qid = qpair->id;
+ cmd.sqsize = num_entries - 1;
+ cmd.kato = ctrlr->opts.keep_alive_timeout_ms;
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvmf_data->cntlid = 0xFFFF;
+ } else {
+ nvmf_data->cntlid = ctrlr->cntlid;
+ }
+
+ SPDK_STATIC_ASSERT(sizeof(nvmf_data->hostid) == sizeof(ctrlr->opts.extended_host_id),
+ "host ID size mismatch");
+ memcpy(nvmf_data->hostid, ctrlr->opts.extended_host_id, sizeof(nvmf_data->hostid));
+ snprintf(nvmf_data->hostnqn, sizeof(nvmf_data->hostnqn), "%s", ctrlr->opts.hostnqn);
+ snprintf(nvmf_data->subnqn, sizeof(nvmf_data->subnqn), "%s", ctrlr->trid.subnqn);
+
+ rc = spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair,
+ (struct spdk_nvme_cmd *)&cmd,
+ nvmf_data, sizeof(*nvmf_data),
+ nvme_completion_poll_cb, status);
+ if (rc < 0) {
+ SPDK_ERRLOG("Connect command failed\n");
+ spdk_free(nvmf_data);
+ free(status);
+ return rc;
+ }
+
+ if (nvme_wait_for_completion(qpair, status)) {
+ SPDK_ERRLOG("Connect command failed\n");
+ spdk_free(nvmf_data);
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -EIO;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ rsp = (struct spdk_nvmf_fabric_connect_rsp *)&status->cpl;
+ ctrlr->cntlid = rsp->status_code_specific.success.cntlid;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cntlid);
+ }
+
+ spdk_free(nvmf_data);
+ free(status);
+ return 0;
+}
diff --git a/src/spdk/lib/nvme/nvme_internal.h b/src/spdk/lib/nvme/nvme_internal.h
new file mode 100644
index 000000000..98fec279d
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_internal.h
@@ -0,0 +1,1233 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NVME_INTERNAL_H__
+#define __NVME_INTERNAL_H__
+
+#include "spdk/config.h"
+#include "spdk/likely.h"
+#include "spdk/stdinc.h"
+
+#include "spdk/nvme.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+#include "spdk/queue.h"
+#include "spdk/barrier.h"
+#include "spdk/bit_array.h"
+#include "spdk/mmio.h"
+#include "spdk/pci_ids.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+#include "spdk/nvme_intel.h"
+#include "spdk/nvmf_spec.h"
+#include "spdk/uuid.h"
+
+#include "spdk_internal/assert.h"
+#include "spdk_internal/log.h"
+
+extern pid_t g_spdk_nvme_pid;
+
+/*
+ * Some Intel devices support vendor-unique read latency log page even
+ * though the log page directory says otherwise.
+ */
+#define NVME_INTEL_QUIRK_READ_LATENCY 0x1
+
+/*
+ * Some Intel devices support vendor-unique write latency log page even
+ * though the log page directory says otherwise.
+ */
+#define NVME_INTEL_QUIRK_WRITE_LATENCY 0x2
+
+/*
+ * The controller needs a delay before it starts checking the device
+ * readiness, which is done by reading the NVME_CSTS_RDY bit.
+ */
+#define NVME_QUIRK_DELAY_BEFORE_CHK_RDY 0x4
+
+/*
+ * The controller performs best when I/O is split on particular
+ * LBA boundaries.
+ */
+#define NVME_INTEL_QUIRK_STRIPING 0x8
+
+/*
+ * The controller needs a delay after allocating an I/O queue pair
+ * before it is ready to accept I/O commands.
+ */
+#define NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC 0x10
+
+/*
+ * Earlier NVMe devices do not indicate whether unmapped blocks
+ * will read all zeroes or not. This define indicates that the
+ * device does in fact read all zeroes after an unmap event
+ */
+#define NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE 0x20
+
+/*
+ * The controller doesn't handle Identify CNS values other than 0 or 1 correctly.
+ */
+#define NVME_QUIRK_IDENTIFY_CNS 0x40
+
+/*
+ * The controller supports the Open Channel command set if an additional
+ * condition is met, e.g. the first byte (value 0x1) in the vendor specific
+ * bits of the namespace identify structure is set.
+ */
+#define NVME_QUIRK_OCSSD 0x80
+
+/*
+ * The controller has an Intel vendor ID but does not support Intel vendor-specific
+ * log pages. This is primarily for QEMU emulated SSDs which report an Intel vendor
+ * ID but do not support these log pages.
+ */
+#define NVME_INTEL_QUIRK_NO_LOG_PAGES 0x100
+
+/*
+ * The controller does not set SHST_COMPLETE in a reasonable amount of time. This
+ * is primarily seen on virtual VMware NVMe SSDs. This quirk merely adds an additional
+ * error message noting that, on VMware NVMe SSDs, the shutdown timeout may be expected.
+ */
+#define NVME_QUIRK_SHST_COMPLETE 0x200
+
+/*
+ * The controller requires an extra delay before starting the initialization process
+ * during attach.
+ */
+#define NVME_QUIRK_DELAY_BEFORE_INIT 0x400
+
+/*
+ * Some SSDs exhibit poor performance with the default SPDK NVMe IO queue size.
+ * This quirk will increase the default to 1024 which matches other operating
+ * systems, at the cost of some extra memory usage. Users can still override
+ * the increased default by changing the spdk_nvme_io_qpair_opts when allocating
+ * a new queue pair.
+ */
+#define NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE 0x800
+
+/**
+ * The maximum access width to PCI memory space is 8 bytes; don't use AVX2 or
+ * SSE instructions to optimize memory accesses (memcpy or memset) larger than
+ * 8 bytes.
+ */
+#define NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH 0x1000
+
+/**
+ * The SSD does not support OPAL even though it sets the security bit in OACS.
+ */
+#define NVME_QUIRK_OACS_SECURITY 0x2000
+
+#define NVME_MAX_ASYNC_EVENTS (8)
+
+#define NVME_MAX_ADMIN_TIMEOUT_IN_SECS (30)
+
+/* Maximum log page size to fetch for AERs. */
+#define NVME_MAX_AER_LOG_SIZE (4096)
+
+/*
+ * NVME_MAX_IO_QUEUES in nvme_spec.h defines the 64K spec-limit, but this
+ * define specifies the maximum number of queues this driver will actually
+ * try to configure, if available.
+ */
+#define DEFAULT_MAX_IO_QUEUES (1024)
+#define DEFAULT_ADMIN_QUEUE_SIZE (32)
+#define DEFAULT_IO_QUEUE_SIZE (256)
+#define DEFAULT_IO_QUEUE_SIZE_FOR_QUIRK (1024) /* Matches Linux kernel driver */
+
+#define DEFAULT_IO_QUEUE_REQUESTS (512)
+
+#define SPDK_NVME_DEFAULT_RETRY_COUNT (4)
+
+#define SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED (0)
+#define SPDK_NVME_DEFAULT_TRANSPORT_ACK_TIMEOUT SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED
+
+#define MIN_KEEP_ALIVE_TIMEOUT_IN_MS (10000)
+
+/* We want to fit submission and completion rings each in a single 2MB
+ * hugepage to ensure physical address contiguity.
+ */
+#define MAX_IO_QUEUE_ENTRIES (VALUE_2MB / spdk_max( \
+ sizeof(struct spdk_nvme_cmd), \
+ sizeof(struct spdk_nvme_cpl)))
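+
+/*
+ * Editor's note (worked example, assuming the standard 64-byte submission and
+ * 16-byte completion queue entry sizes that iosqes=6/iocqes=4 encode elsewhere
+ * in this patch): spdk_max(64, 16) = 64, so MAX_IO_QUEUE_ENTRIES evaluates to
+ * 2 MiB / 64 B = 32768 entries per ring.
+ */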
+
+enum nvme_payload_type {
+ NVME_PAYLOAD_TYPE_INVALID = 0,
+
+ /** nvme_request::u.payload.contig_buffer is valid for this request */
+ NVME_PAYLOAD_TYPE_CONTIG,
+
+ /** nvme_request::u.sgl is valid for this request */
+ NVME_PAYLOAD_TYPE_SGL,
+};
+
+/**
+ * Descriptor for a request data payload.
+ */
+struct nvme_payload {
+ /**
+ * Functions for retrieving physical addresses for scattered payloads.
+ */
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn;
+ spdk_nvme_req_next_sge_cb next_sge_fn;
+
+ /**
+ * If reset_sgl_fn == NULL, this is a contig payload, and contig_or_cb_arg contains the
+ * virtual memory address of a single virtually contiguous buffer.
+ *
+ * If reset_sgl_fn != NULL, this is a SGL payload, and contig_or_cb_arg contains the
+ * cb_arg that will be passed to the SGL callback functions.
+ */
+ void *contig_or_cb_arg;
+
+ /** Virtual memory address of a single virtually contiguous metadata buffer */
+ void *md;
+};
+
+#define NVME_PAYLOAD_CONTIG(contig_, md_) \
+ (struct nvme_payload) { \
+ .reset_sgl_fn = NULL, \
+ .next_sge_fn = NULL, \
+ .contig_or_cb_arg = (contig_), \
+ .md = (md_), \
+ }
+
+#define NVME_PAYLOAD_SGL(reset_sgl_fn_, next_sge_fn_, cb_arg_, md_) \
+ (struct nvme_payload) { \
+ .reset_sgl_fn = (reset_sgl_fn_), \
+ .next_sge_fn = (next_sge_fn_), \
+ .contig_or_cb_arg = (cb_arg_), \
+ .md = (md_), \
+ }
+
+static inline enum nvme_payload_type
+nvme_payload_type(const struct nvme_payload *payload) {
+ return payload->reset_sgl_fn ? NVME_PAYLOAD_TYPE_SGL : NVME_PAYLOAD_TYPE_CONTIG;
+}
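+
+/*
+ * Editor's note: illustrative sketch, not part of the original patch. It only
+ * demonstrates that the payload descriptor above distinguishes contiguous
+ * buffers from SGL payloads by whether reset_sgl_fn is NULL. All parameters
+ * are hypothetical and supplied by the caller; the SGL callbacks are assumed
+ * to be non-NULL.
+ */
+static inline void
+nvme_payload_example(void *data_buf, void *md_buf,
+  spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+  spdk_nvme_req_next_sge_cb next_sge_fn, void *sgl_cb_arg)
+{
+ struct nvme_payload contig = NVME_PAYLOAD_CONTIG(data_buf, md_buf);
+ struct nvme_payload sgl = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, sgl_cb_arg, md_buf);
+
+ assert(nvme_payload_type(&contig) == NVME_PAYLOAD_TYPE_CONTIG);
+ assert(nvme_payload_type(&sgl) == NVME_PAYLOAD_TYPE_SGL);
+}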
+
+struct nvme_error_cmd {
+ bool do_not_submit;
+ uint64_t timeout_tsc;
+ uint32_t err_count;
+ uint8_t opc;
+ struct spdk_nvme_status status;
+ TAILQ_ENTRY(nvme_error_cmd) link;
+};
+
+struct nvme_request {
+ struct spdk_nvme_cmd cmd;
+
+ uint8_t retries;
+
+ uint8_t timed_out : 1;
+
+ /**
+ * True if the request is in the queued_req list.
+ */
+ uint8_t queued : 1;
+ uint8_t reserved : 6;
+
+ /**
+ * Number of children requests still outstanding for this
+ * request which was split into multiple child requests.
+ */
+ uint16_t num_children;
+
+ /**
+ * Offset in bytes from the beginning of payload for this request.
+ * This is used for I/O commands that are split into multiple requests.
+ */
+ uint32_t payload_offset;
+ uint32_t md_offset;
+
+ uint32_t payload_size;
+
+ /**
+ * Timeout ticks for error injection requests, can be extended in future
+ * to support per-request timeout feature.
+ */
+ uint64_t timeout_tsc;
+
+ /**
+ * Data payload for this request's command.
+ */
+ struct nvme_payload payload;
+
+ spdk_nvme_cmd_cb cb_fn;
+ void *cb_arg;
+ STAILQ_ENTRY(nvme_request) stailq;
+
+ struct spdk_nvme_qpair *qpair;
+
+ /*
+ * The value of spdk_get_ticks() when the request was submitted to the hardware.
+ * Only set if ctrlr->timeout_enabled is true.
+ */
+ uint64_t submit_tick;
+
+ /**
+ * The active admin request can be moved to a per process pending
+ * list based on the saved pid to tell which process it belongs
+ * to. The cpl saves the original completion information which
+ * is used in the completion callback.
+ * NOTE: the two fields below are only used for admin requests.
+ */
+ pid_t pid;
+ struct spdk_nvme_cpl cpl;
+
+ uint32_t md_size;
+
+ /**
+ * The following members should not be reordered with members
+ * above. These members are only needed when splitting
+ * requests which is done rarely, and the driver is careful
+ * to not touch the following fields until a split operation is
+ * needed, to avoid touching an extra cacheline.
+ */
+
+ /**
+ * Points to the outstanding child requests for a parent request.
+ * Only valid if a request was split into multiple children
+ * requests, and is not initialized for non-split requests.
+ */
+ TAILQ_HEAD(, nvme_request) children;
+
+ /**
+ * Linked-list pointers for a child request in its parent's list.
+ */
+ TAILQ_ENTRY(nvme_request) child_tailq;
+
+ /**
+ * Points to a parent request if part of a split request,
+ * NULL otherwise.
+ */
+ struct nvme_request *parent;
+
+ /**
+ * Completion status for a parent request. Initialized to all 0's
+ * (SUCCESS) before child requests are submitted. If a child
+ * request completes with error, the error status is copied here,
+ * to ensure that the parent request is also completed with error
+ * status once all child requests are completed.
+ */
+ struct spdk_nvme_cpl parent_status;
+
+ /**
+ * The user_cb_fn and user_cb_arg fields are used for holding the original
+ * callback data when using nvme_allocate_request_user_copy.
+ */
+ spdk_nvme_cmd_cb user_cb_fn;
+ void *user_cb_arg;
+ void *user_buffer;
+};
+
+struct nvme_completion_poll_status {
+ struct spdk_nvme_cpl cpl;
+ bool done;
+ /* This flag indicates that the request has timed out and the memory
+ must be freed in a completion callback */
+ bool timed_out;
+};
+
+struct nvme_async_event_request {
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct nvme_request *req;
+ struct spdk_nvme_cpl cpl;
+};
+
+enum nvme_qpair_state {
+ NVME_QPAIR_DISCONNECTED,
+ NVME_QPAIR_DISCONNECTING,
+ NVME_QPAIR_CONNECTING,
+ NVME_QPAIR_CONNECTED,
+ NVME_QPAIR_ENABLING,
+ NVME_QPAIR_ENABLED,
+ NVME_QPAIR_DESTROYING,
+};
+
+struct spdk_nvme_qpair {
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ uint16_t id;
+
+ uint8_t qprio;
+
+ uint8_t state : 3;
+
+ /*
+ * Members for handling IO qpair deletion inside of a completion context.
+ * These are specifically defined as single bits, so that they do not
+ * push this data structure out to another cacheline.
+ */
+ uint8_t in_completion_context : 1;
+ uint8_t delete_after_completion_context: 1;
+
+ /*
+ * Set when no deletion notification is needed. For example, the process
+ * which allocated this qpair exited unexpectedly.
+ */
+ uint8_t no_deletion_notification_needed: 1;
+
+ uint8_t first_fused_submitted: 1;
+
+ enum spdk_nvme_transport_type trtype;
+
+ STAILQ_HEAD(, nvme_request) free_req;
+ STAILQ_HEAD(, nvme_request) queued_req;
+ STAILQ_HEAD(, nvme_request) aborting_queued_req;
+
+ /* List entry for spdk_nvme_transport_poll_group::qpairs */
+ STAILQ_ENTRY(spdk_nvme_qpair) poll_group_stailq;
+
+ /** Commands whose opcode is in this list will return an error */
+ TAILQ_HEAD(, nvme_error_cmd) err_cmd_head;
+ /** Requests in this list will return an error */
+ STAILQ_HEAD(, nvme_request) err_req_head;
+
+ /* List entry for spdk_nvme_ctrlr::active_io_qpairs */
+ TAILQ_ENTRY(spdk_nvme_qpair) tailq;
+
+ /* List entry for spdk_nvme_ctrlr_process::allocated_io_qpairs */
+ TAILQ_ENTRY(spdk_nvme_qpair) per_process_tailq;
+
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ struct spdk_nvme_transport_poll_group *poll_group;
+
+ void *poll_group_tailq_head;
+
+ void *req_buf;
+
+ const struct spdk_nvme_transport *transport;
+
+ uint8_t transport_failure_reason: 2;
+};
+
+struct spdk_nvme_poll_group {
+ void *ctx;
+ STAILQ_HEAD(, spdk_nvme_transport_poll_group) tgroups;
+};
+
+struct spdk_nvme_transport_poll_group {
+ struct spdk_nvme_poll_group *group;
+ const struct spdk_nvme_transport *transport;
+ STAILQ_HEAD(, spdk_nvme_qpair) connected_qpairs;
+ STAILQ_HEAD(, spdk_nvme_qpair) disconnected_qpairs;
+ STAILQ_ENTRY(spdk_nvme_transport_poll_group) link;
+ bool in_completion_context;
+ uint64_t num_qpairs_to_delete;
+};
+
+struct spdk_nvme_ns {
+ struct spdk_nvme_ctrlr *ctrlr;
+ uint32_t sector_size;
+
+ /*
+ * Size of data transferred as part of each block,
+ * including metadata if FLBAS indicates the metadata is transferred
+ * as part of the data buffer at the end of each LBA.
+ */
+ uint32_t extended_lba_size;
+
+ uint32_t md_size;
+ uint32_t pi_type;
+ uint32_t sectors_per_max_io;
+ uint32_t sectors_per_stripe;
+ uint32_t id;
+ uint16_t flags;
+
+ /* Namespace Identification Descriptor List (CNS = 03h) */
+ uint8_t id_desc_list[4096];
+};
+
+/**
+ * State of struct spdk_nvme_ctrlr (in particular, during initialization).
+ */
+enum nvme_ctrlr_state {
+ /**
+ * Wait before initializing the controller.
+ */
+ NVME_CTRLR_STATE_INIT_DELAY,
+
+ /**
+ * Controller has not been initialized yet.
+ */
+ NVME_CTRLR_STATE_INIT,
+
+ /**
+ * Waiting for CSTS.RDY to transition from 0 to 1 so that CC.EN may be set to 0.
+ */
+ NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1,
+
+ /**
+ * Waiting for CSTS.RDY to transition from 1 to 0 so that CC.EN may be set to 1.
+ */
+ NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
+
+ /**
+ * Enable the controller by writing CC.EN to 1
+ */
+ NVME_CTRLR_STATE_ENABLE,
+
+ /**
+ * Waiting for CSTS.RDY to transition from 0 to 1 after enabling the controller.
+ */
+ NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,
+
+ /**
+ * Reset the Admin queue of the controller.
+ */
+ NVME_CTRLR_STATE_RESET_ADMIN_QUEUE,
+
+ /**
+ * Identify Controller command will be sent to the controller.
+ */
+ NVME_CTRLR_STATE_IDENTIFY,
+
+ /**
+ * Waiting for the Identify Controller command to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY,
+
+ /**
+ * Set Number of Queues of the controller.
+ */
+ NVME_CTRLR_STATE_SET_NUM_QUEUES,
+
+ /**
+ * Waiting for Set Num of Queues command to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES,
+
+ /**
+ * Construct Namespace data structures of the controller.
+ */
+ NVME_CTRLR_STATE_CONSTRUCT_NS,
+
+ /**
+ * Get active Namespace list of the controller.
+ */
+ NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS,
+
+ /**
+ * Waiting for the Identify Active Namespace commands to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS,
+
+ /**
+ * Get Identify Namespace Data structure for each NS.
+ */
+ NVME_CTRLR_STATE_IDENTIFY_NS,
+
+ /**
+ * Waiting for the Identify Namespace commands to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS,
+
+ /**
+ * Get Identify Namespace Identification Descriptors.
+ */
+ NVME_CTRLR_STATE_IDENTIFY_ID_DESCS,
+
+ /**
+ * Waiting for the Identify Namespace Identification
+ * Descriptors to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS,
+
+ /**
+ * Configure AER of the controller.
+ */
+ NVME_CTRLR_STATE_CONFIGURE_AER,
+
+ /**
+ * Waiting for the Configure AER to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER,
+
+ /**
+ * Set supported log pages of the controller.
+ */
+ NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES,
+
+ /**
+ * Set supported features of the controller.
+ */
+ NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES,
+
+ /**
+ * Set Doorbell Buffer Config of the controller.
+ */
+ NVME_CTRLR_STATE_SET_DB_BUF_CFG,
+
+ /**
+ * Waiting for Doorbell Buffer Config to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG,
+
+ /**
+ * Set Keep Alive Timeout of the controller.
+ */
+ NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT,
+
+ /**
+ * Waiting for Set Keep Alive Timeout to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT,
+
+ /**
+ * Set Host ID of the controller.
+ */
+ NVME_CTRLR_STATE_SET_HOST_ID,
+
+ /**
+ * Waiting for Set Host ID to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_HOST_ID,
+
+ /**
+ * Controller initialization has completed and the controller is ready.
+ */
+ NVME_CTRLR_STATE_READY,
+
+ /**
+ * Controller initialization has encountered an error.
+ */
+ NVME_CTRLR_STATE_ERROR
+};
+
+#define NVME_TIMEOUT_INFINITE 0
+
+/*
+ * Used to track properties for all processes accessing the controller.
+ */
+struct spdk_nvme_ctrlr_process {
+ /** Whether it is the primary process */
+ bool is_primary;
+
+ /** Process ID */
+ pid_t pid;
+
+ /** Active admin requests to be completed */
+ STAILQ_HEAD(, nvme_request) active_reqs;
+
+ TAILQ_ENTRY(spdk_nvme_ctrlr_process) tailq;
+
+ /** Per process PCI device handle */
+ struct spdk_pci_device *devhandle;
+
+ /** Reference count to track the number of attachments to this controller. */
+ int ref;
+
+ /** Allocated IO qpairs */
+ TAILQ_HEAD(, spdk_nvme_qpair) allocated_io_qpairs;
+
+ spdk_nvme_aer_cb aer_cb_fn;
+ void *aer_cb_arg;
+
+ /**
+ * A function pointer to timeout callback function
+ */
+ spdk_nvme_timeout_cb timeout_cb_fn;
+ void *timeout_cb_arg;
+ uint64_t timeout_ticks;
+};
+
+/*
+ * One of these per allocated PCI device.
+ */
+struct spdk_nvme_ctrlr {
+ /* Hot data (accessed in I/O path) starts here. */
+
+ /** Array of namespaces indexed by nsid - 1 */
+ struct spdk_nvme_ns *ns;
+
+ uint32_t num_ns;
+
+ bool is_removed;
+
+ bool is_resetting;
+
+ bool is_failed;
+
+ bool is_destructed;
+
+ bool timeout_enabled;
+
+ uint16_t max_sges;
+
+ uint16_t cntlid;
+
+ /** Controller support flags */
+ uint64_t flags;
+
+ /** NVMEoF in-capsule data size in bytes */
+ uint32_t ioccsz_bytes;
+
+ /** NVMEoF in-capsule data offset in 16 byte units */
+ uint16_t icdoff;
+
+ /* Cold data (not accessed in normal I/O path) is after this point. */
+
+ struct spdk_nvme_transport_id trid;
+
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_vs_register vs;
+
+ enum nvme_ctrlr_state state;
+ uint64_t state_timeout_tsc;
+
+ uint64_t next_keep_alive_tick;
+ uint64_t keep_alive_interval_ticks;
+
+ TAILQ_ENTRY(spdk_nvme_ctrlr) tailq;
+
+ /** All the log pages supported */
+ bool log_page_supported[256];
+
+ /** All the features supported */
+ bool feature_supported[256];
+
+ /** maximum i/o size in bytes */
+ uint32_t max_xfer_size;
+
+ /** minimum page size supported by this controller in bytes */
+ uint32_t min_page_size;
+
+ /** selected memory page size for this controller in bytes */
+ uint32_t page_size;
+
+ uint32_t num_aers;
+ struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS];
+
+ /** guards access to the controller itself, including admin queues */
+ pthread_mutex_t ctrlr_lock;
+
+ struct spdk_nvme_qpair *adminq;
+
+ /** shadow doorbell buffer */
+ uint32_t *shadow_doorbell;
+ /** eventidx buffer */
+ uint32_t *eventidx;
+
+ /**
+ * Identify Controller data.
+ */
+ struct spdk_nvme_ctrlr_data cdata;
+
+ /**
+ * Keep track of active namespaces
+ */
+ uint32_t *active_ns_list;
+
+ /**
+ * Array of Identify Namespace data.
+ *
+ * Stored separately from ns since nsdata should not normally be accessed during I/O.
+ */
+ struct spdk_nvme_ns_data *nsdata;
+
+ struct spdk_bit_array *free_io_qids;
+ TAILQ_HEAD(, spdk_nvme_qpair) active_io_qpairs;
+
+ struct spdk_nvme_ctrlr_opts opts;
+
+ uint64_t quirks;
+
+ /* Extra sleep time during controller initialization */
+ uint64_t sleep_timeout_tsc;
+
+ /** Track all the processes that manage this controller */
+ TAILQ_HEAD(, spdk_nvme_ctrlr_process) active_procs;
+
+
+ STAILQ_HEAD(, nvme_request) queued_aborts;
+ uint32_t outstanding_aborts;
+
+ /* CB to notify the user when the ctrlr is removed/failed. */
+ spdk_nvme_remove_cb remove_cb;
+ void *cb_ctx;
+
+ struct spdk_nvme_qpair *external_io_msgs_qpair;
+ pthread_mutex_t external_io_msgs_lock;
+ struct spdk_ring *external_io_msgs;
+
+ STAILQ_HEAD(, nvme_io_msg_producer) io_producers;
+};
+
+struct spdk_nvme_probe_ctx {
+ struct spdk_nvme_transport_id trid;
+ void *cb_ctx;
+ spdk_nvme_probe_cb probe_cb;
+ spdk_nvme_attach_cb attach_cb;
+ spdk_nvme_remove_cb remove_cb;
+ TAILQ_HEAD(, spdk_nvme_ctrlr) init_ctrlrs;
+};
+
+struct nvme_driver {
+ pthread_mutex_t lock;
+
+ /** Multi-process shared attached controller list */
+ TAILQ_HEAD(, spdk_nvme_ctrlr) shared_attached_ctrlrs;
+
+ bool initialized;
+ struct spdk_uuid default_extended_host_id;
+
+ /** netlink socket fd for hotplug messages */
+ int hotplug_fd;
+};
+
+extern struct nvme_driver *g_spdk_nvme_driver;
+
+int nvme_driver_init(void);
+
+#define nvme_delay usleep
+
+static inline bool
+nvme_qpair_is_admin_queue(struct spdk_nvme_qpair *qpair)
+{
+ return qpair->id == 0;
+}
+
+static inline bool
+nvme_qpair_is_io_queue(struct spdk_nvme_qpair *qpair)
+{
+ return qpair->id != 0;
+}
+
+static inline int
+nvme_robust_mutex_lock(pthread_mutex_t *mtx)
+{
+ int rc = pthread_mutex_lock(mtx);
+
+#ifndef __FreeBSD__
+ if (rc == EOWNERDEAD) {
+ rc = pthread_mutex_consistent(mtx);
+ }
+#endif
+
+ return rc;
+}
+
+static inline int
+nvme_robust_mutex_unlock(pthread_mutex_t *mtx)
+{
+ return pthread_mutex_unlock(mtx);
+}
+
+/* Poll group management functions. */
+int nvme_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair);
+int nvme_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair);
+
+/* Admin functions */
+int nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr,
+ uint8_t cns, uint16_t cntid, uint32_t nsid,
+ void *payload, size_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t num_queues, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg);
+int nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr,
+ union spdk_nvme_feat_async_event_configuration config,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr,
+ uint64_t prp1, uint64_t prp2,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg);
+int nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_format *format, spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr,
+ const struct spdk_nvme_fw_commit *fw_commit,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t size, uint32_t offset, void *payload,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_sanitize(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_sanitize *sanitize, uint32_t cdw11,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+void nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl);
+int nvme_wait_for_completion(struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status);
+int nvme_wait_for_completion_robust_lock(struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status,
+ pthread_mutex_t *robust_mutex);
+int nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status,
+ uint64_t timeout_in_secs);
+
+struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr,
+ pid_t pid);
+struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr);
+int nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle);
+void nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr);
+struct spdk_pci_device *nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr);
+
+int nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_probe_ctx *probe_ctx, void *devhandle);
+
+int nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove);
+int nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr);
+int nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_connected(struct spdk_nvme_probe_ctx *probe_ctx,
+ struct spdk_nvme_ctrlr *ctrlr);
+
+int nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_request *req);
+int nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap);
+int nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs);
+int nvme_ctrlr_get_cmbsz(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cmbsz_register *cmbsz);
+void nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap,
+ const union spdk_nvme_vs_register *vs);
+void nvme_ctrlr_disconnect_qpair(struct spdk_nvme_qpair *qpair);
+int nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id,
+ struct spdk_nvme_ctrlr *ctrlr,
+ enum spdk_nvme_qprio qprio,
+ uint32_t num_requests);
+void nvme_qpair_deinit(struct spdk_nvme_qpair *qpair);
+void nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair);
+int nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req);
+void nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
+uint32_t nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg);
+void nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests);
+
+int nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ns_set_identify_data(struct spdk_nvme_ns *ns);
+int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id,
+ struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ns_destruct(struct spdk_nvme_ns *ns);
+int nvme_ns_update(struct spdk_nvme_ns *ns);
+
+int nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value);
+int nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value);
+int nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value);
+int nvme_fabric_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, bool direct_connect);
+int nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value);
+int nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_probe_ctx *probe_ctx);
+int nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries);
+
+static inline struct nvme_request *
+nvme_allocate_request(struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload, uint32_t payload_size, uint32_t md_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+
+ req = STAILQ_FIRST(&qpair->free_req);
+ if (req == NULL) {
+ return req;
+ }
+
+ STAILQ_REMOVE_HEAD(&qpair->free_req, stailq);
+
+ /*
+ * Only memset/zero fields that need it. All other fields
+ * will be initialized appropriately either later in this
+ * function, or before they are needed later in the
+ * submission path. For example, the children
+ * TAILQ_ENTRY and following members are
+ * only used as part of I/O splitting so we avoid
+ * memsetting them until it is actually needed.
+ * They will be initialized in nvme_request_add_child()
+ * if the request is split.
+ */
+ memset(req, 0, offsetof(struct nvme_request, payload_size));
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->payload = *payload;
+ req->payload_size = payload_size;
+ req->md_size = md_size;
+ req->pid = g_spdk_nvme_pid;
+ req->submit_tick = 0;
+
+ return req;
+}
+
+static inline struct nvme_request *
+nvme_allocate_request_contig(struct spdk_nvme_qpair *qpair,
+ void *buffer, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, NULL);
+
+ return nvme_allocate_request(qpair, &payload, payload_size, 0, cb_fn, cb_arg);
+}
+
+static inline struct nvme_request *
+nvme_allocate_request_null(struct spdk_nvme_qpair *qpair, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ return nvme_allocate_request_contig(qpair, NULL, 0, cb_fn, cb_arg);
+}
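+
+/*
+ * Editor's note: illustrative sketch, not part of the original patch. It shows
+ * the allocate -> fill command -> submit pattern implied by the helpers above;
+ * the Flush opcode and the nsid value are arbitrary examples, and the qpair is
+ * assumed to be connected.
+ */
+static inline int
+nvme_request_example_submit_flush(struct spdk_nvme_qpair *qpair, uint32_t nsid,
+  spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+  /* The per-qpair free_req pool is exhausted. */
+  return -EAGAIN;
+ }
+
+ req->cmd.opc = SPDK_NVME_OPC_FLUSH;
+ req->cmd.nsid = nsid;
+
+ return nvme_qpair_submit_request(qpair, req);
+}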
+
+struct nvme_request *nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair,
+ void *buffer, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, bool host_to_controller);
+
+static inline void
+nvme_complete_request(spdk_nvme_cmd_cb cb_fn, void *cb_arg, struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req, struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_cpl err_cpl;
+ struct nvme_error_cmd *cmd;
+
+ /* Error injection at the completion path:
+ * only inject errors into successfully completed commands.
+ */
+ if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head) &&
+ !spdk_nvme_cpl_is_error(cpl))) {
+ TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) {
+
+ if (cmd->do_not_submit) {
+ continue;
+ }
+
+ if ((cmd->opc == req->cmd.opc) && cmd->err_count) {
+
+ err_cpl = *cpl;
+ err_cpl.status.sct = cmd->status.sct;
+ err_cpl.status.sc = cmd->status.sc;
+
+ cpl = &err_cpl;
+ cmd->err_count--;
+ break;
+ }
+ }
+ }
+
+ if (cb_fn) {
+ cb_fn(cb_arg, cpl);
+ }
+}
+
+static inline void
+nvme_free_request(struct nvme_request *req)
+{
+ assert(req != NULL);
+ assert(req->num_children == 0);
+ assert(req->qpair != NULL);
+
+ STAILQ_INSERT_HEAD(&req->qpair->free_req, req, stailq);
+}
+
+static inline void
+nvme_qpair_set_state(struct spdk_nvme_qpair *qpair, enum nvme_qpair_state state)
+{
+ qpair->state = state;
+}
+
+static inline enum nvme_qpair_state
+nvme_qpair_get_state(struct spdk_nvme_qpair *qpair) {
+ return qpair->state;
+}
+
+static inline void
+nvme_qpair_free_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ assert(req != NULL);
+ assert(req->num_children == 0);
+
+ STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq);
+}
+
+static inline void
+nvme_request_remove_child(struct nvme_request *parent, struct nvme_request *child)
+{
+ assert(parent != NULL);
+ assert(child != NULL);
+ assert(child->parent == parent);
+ assert(parent->num_children != 0);
+
+ parent->num_children--;
+ child->parent = NULL;
+ TAILQ_REMOVE(&parent->children, child, child_tailq);
+}
+
+static inline void
+nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_request *child = child_arg;
+ struct nvme_request *parent = child->parent;
+
+ nvme_request_remove_child(parent, child);
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ memcpy(&parent->parent_status, cpl, sizeof(*cpl));
+ }
+
+ if (parent->num_children == 0) {
+ nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair,
+ parent, &parent->parent_status);
+ nvme_free_request(parent);
+ }
+}
+
+static inline void
+nvme_request_add_child(struct nvme_request *parent, struct nvme_request *child)
+{
+ assert(parent->num_children != UINT16_MAX);
+
+ if (parent->num_children == 0) {
+ /*
+ * Defer initialization of the children TAILQ since it falls
+ * on a separate cacheline. This ensures we do not touch this
+ * cacheline except on request splitting cases, which are
+ * relatively rare.
+ */
+ TAILQ_INIT(&parent->children);
+ parent->parent = NULL;
+ memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl));
+ }
+
+ parent->num_children++;
+ TAILQ_INSERT_TAIL(&parent->children, child, child_tailq);
+ child->parent = parent;
+ child->cb_fn = nvme_cb_complete_child;
+ child->cb_arg = child;
+}
+
+static inline void
+nvme_request_free_children(struct nvme_request *req)
+{
+ struct nvme_request *child, *tmp;
+
+ if (req->num_children == 0) {
+ return;
+ }
+
+ /* free all child nvme_request */
+ TAILQ_FOREACH_SAFE(child, &req->children, child_tailq, tmp) {
+ nvme_request_remove_child(req, child);
+ nvme_request_free_children(child);
+ nvme_free_request(child);
+ }
+}
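+
+/*
+ * Illustrative sketch, not part of the original header: 'num_children' is the
+ * only state needed to tell whether a request was split, and it also drives
+ * completion. With two children, each child completes through
+ * nvme_cb_complete_child(); an error from either child is recorded in
+ * parent->parent_status, and once the counter drops to zero the parent's
+ * original cb_fn fires with that aggregated status and the parent is freed.
+ * The helper below is hypothetical and exists only to make that invariant
+ * explicit.
+ */
+static inline bool
+nvme_request_is_split(const struct nvme_request *req)
+{
+ return req->num_children != 0;
+}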
+
+int nvme_request_check_timeout(struct nvme_request *req, uint16_t cid,
+ struct spdk_nvme_ctrlr_process *active_proc, uint64_t now_tick);
+uint64_t nvme_get_quirks(const struct spdk_pci_id *id);
+
+int nvme_robust_mutex_init_shared(pthread_mutex_t *mtx);
+int nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx);
+
+bool nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl);
+
+struct spdk_nvme_ctrlr *nvme_get_ctrlr_by_trid_unsafe(
+ const struct spdk_nvme_transport_id *trid);
+
+const struct spdk_nvme_transport *nvme_get_transport(const char *transport_name);
+const struct spdk_nvme_transport *nvme_get_first_transport(void);
+const struct spdk_nvme_transport *nvme_get_next_transport(const struct spdk_nvme_transport
+ *transport);
+
+/* Transport specific functions */
+struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ void *devhandle);
+int nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr);
+int nvme_transport_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, bool direct_connect);
+int nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr);
+int nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value);
+int nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value);
+int nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value);
+int nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value);
+uint32_t nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr);
+uint16_t nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr);
+struct spdk_nvme_qpair *nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ uint16_t qid, const struct spdk_nvme_io_qpair_opts *opts);
+int nvme_transport_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr);
+void *nvme_transport_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size);
+int nvme_transport_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr);
+int nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair);
+int nvme_transport_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair);
+void nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair);
+void nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
+int nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair);
+int nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req);
+int32_t nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair,
+ uint32_t max_completions);
+void nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair);
+int nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair,
+ int (*iter_fn)(struct nvme_request *req, void *arg),
+ void *arg);
+
+struct spdk_nvme_transport_poll_group *nvme_transport_poll_group_create(
+ const struct spdk_nvme_transport *transport);
+int nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair);
+int nvme_transport_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair);
+int nvme_transport_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair);
+int nvme_transport_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair);
+int64_t nvme_transport_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
+ uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb);
+int nvme_transport_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup);
+/*
+ * Below ref related functions must be called with the global
+ * driver lock held for the multi-process condition.
+ * Within these functions, the per ctrlr ctrlr_lock is also
+ * acquired for the multi-thread condition.
+ */
+void nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr);
+int nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr);
+
+static inline bool
+_is_page_aligned(uint64_t address, uint64_t page_size)
+{
+ return (address & (page_size - 1)) == 0;
+}
+
+#endif /* __NVME_INTERNAL_H__ */
diff --git a/src/spdk/lib/nvme/nvme_io_msg.c b/src/spdk/lib/nvme/nvme_io_msg.c
new file mode 100644
index 000000000..fb5aec3d4
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_io_msg.c
@@ -0,0 +1,216 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+#include "nvme_io_msg.h"
+
+#define SPDK_NVME_MSG_IO_PROCESS_SIZE 8
+
+/**
+ * Enqueue a message on the controller's external I/O message ring.
+ */
+int
+nvme_io_msg_send(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_io_msg_fn fn,
+ void *arg)
+{
+ int rc;
+ struct spdk_nvme_io_msg *io;
+
+ /* Protect the request ring against concurrent producers */
+ pthread_mutex_lock(&ctrlr->external_io_msgs_lock);
+
+ io = (struct spdk_nvme_io_msg *)calloc(1, sizeof(struct spdk_nvme_io_msg));
+ if (!io) {
+ SPDK_ERRLOG("IO msg allocation failed.");
+ pthread_mutex_unlock(&ctrlr->external_io_msgs_lock);
+ return -ENOMEM;
+ }
+
+ io->ctrlr = ctrlr;
+ io->nsid = nsid;
+ io->fn = fn;
+ io->arg = arg;
+
+ rc = spdk_ring_enqueue(ctrlr->external_io_msgs, (void **)&io, 1, NULL);
+ if (rc != 1) {
+ assert(false);
+ free(io);
+ pthread_mutex_unlock(&ctrlr->external_io_msgs_lock);
+ return -ENOMEM;
+ }
+
+ pthread_mutex_unlock(&ctrlr->external_io_msgs_lock);
+
+ return 0;
+}
+
+int
+nvme_io_msg_process(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int i;
+ int count;
+ struct spdk_nvme_io_msg *io;
+ void *requests[SPDK_NVME_MSG_IO_PROCESS_SIZE];
+
+ if (!ctrlr->external_io_msgs || !ctrlr->external_io_msgs_qpair) {
+ /* Not ready or pending reset */
+ return 0;
+ }
+
+ spdk_nvme_qpair_process_completions(ctrlr->external_io_msgs_qpair, 0);
+
+ count = spdk_ring_dequeue(ctrlr->external_io_msgs, requests,
+ SPDK_NVME_MSG_IO_PROCESS_SIZE);
+ if (count == 0) {
+ return 0;
+ }
+
+ for (i = 0; i < count; i++) {
+ io = requests[i];
+
+ assert(io != NULL);
+
+ io->fn(io->ctrlr, io->nsid, io->arg);
+ free(io);
+ }
+
+ return count;
+}
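+
+/*
+ * Illustrative sketch, not part of the original patch: a producer-side send
+ * and the consumer-side poll, kept behind a hypothetical guard macro. It
+ * assumes a producer has already been registered on this controller, so the
+ * ring and qpair exist. The handler runs on whichever thread calls
+ * nvme_io_msg_process(), so it may safely use the controller's
+ * external_io_msgs_qpair or issue admin commands.
+ */
+#ifdef NVME_IO_MSG_EXAMPLE
+static void
+example_io_msg_fn(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
+{
+ SPDK_NOTICELOG("io msg for nsid %u, arg %p\n", nsid, arg);
+}
+
+static int
+example_send_and_poll(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ int rc;
+
+ /* Any thread may enqueue a message... */
+ rc = nvme_io_msg_send(ctrlr, nsid, example_io_msg_fn, NULL);
+ if (rc != 0) {
+ return rc;
+ }
+
+ /* ...but only one thread per controller should drain the ring. In a real
+ * application this would be an spdk_poller rather than a busy loop. */
+ while (nvme_io_msg_process(ctrlr) == 0) {
+ }
+
+ return 0;
+}
+#endif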
+
+static bool
+nvme_io_msg_is_producer_registered(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_io_msg_producer *io_msg_producer)
+{
+ struct nvme_io_msg_producer *tmp;
+
+ STAILQ_FOREACH(tmp, &ctrlr->io_producers, link) {
+ if (tmp == io_msg_producer) {
+ return true;
+ }
+ }
+ return false;
+}
+
+int
+nvme_io_msg_ctrlr_register(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_io_msg_producer *io_msg_producer)
+{
+ if (io_msg_producer == NULL) {
+ SPDK_ERRLOG("io_msg_producer cannot be NULL\n");
+ return -EINVAL;
+ }
+
+ if (nvme_io_msg_is_producer_registered(ctrlr, io_msg_producer)) {
+ return -EEXIST;
+ }
+
+ if (!STAILQ_EMPTY(&ctrlr->io_producers) || ctrlr->is_resetting) {
+ /* There are already registered producers (IO messaging already started), or a reset is in progress. */
+ STAILQ_INSERT_TAIL(&ctrlr->io_producers, io_msg_producer, link);
+ return 0;
+ }
+
+ pthread_mutex_init(&ctrlr->external_io_msgs_lock, NULL);
+
+ /*
+ * Initialize the message ring and qpair for this controller.
+ */
+ ctrlr->external_io_msgs = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY);
+ if (!ctrlr->external_io_msgs) {
+ SPDK_ERRLOG("Unable to allocate memory for message ring\n");
+ return -ENOMEM;
+ }
+
+ ctrlr->external_io_msgs_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0);
+ if (ctrlr->external_io_msgs_qpair == NULL) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_alloc_io_qpair() failed\n");
+ spdk_ring_free(ctrlr->external_io_msgs);
+ ctrlr->external_io_msgs = NULL;
+ return -ENOMEM;
+ }
+
+ STAILQ_INSERT_TAIL(&ctrlr->io_producers, io_msg_producer, link);
+
+ return 0;
+}
+
+void
+nvme_io_msg_ctrlr_update(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_io_msg_producer *io_msg_producer;
+
+ /* Update all producers */
+ STAILQ_FOREACH(io_msg_producer, &ctrlr->io_producers, link) {
+ io_msg_producer->update(ctrlr);
+ }
+}
+
+void
+nvme_io_msg_ctrlr_detach(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_io_msg_producer *io_msg_producer, *tmp;
+
+ /* Stop all producers */
+ STAILQ_FOREACH_SAFE(io_msg_producer, &ctrlr->io_producers, link, tmp) {
+ io_msg_producer->stop(ctrlr);
+ STAILQ_REMOVE(&ctrlr->io_producers, io_msg_producer, nvme_io_msg_producer, link);
+ }
+
+ if (ctrlr->external_io_msgs) {
+ spdk_ring_free(ctrlr->external_io_msgs);
+ ctrlr->external_io_msgs = NULL;
+ }
+
+ if (ctrlr->external_io_msgs_qpair) {
+ spdk_nvme_ctrlr_free_io_qpair(ctrlr->external_io_msgs_qpair);
+ ctrlr->external_io_msgs_qpair = NULL;
+ }
+
+ pthread_mutex_destroy(&ctrlr->external_io_msgs_lock);
+}
+
+void
+nvme_io_msg_ctrlr_unregister(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_io_msg_producer *io_msg_producer)
+{
+ assert(io_msg_producer != NULL);
+
+ if (!nvme_io_msg_is_producer_registered(ctrlr, io_msg_producer)) {
+ return;
+ }
+
+ STAILQ_REMOVE(&ctrlr->io_producers, io_msg_producer, nvme_io_msg_producer, link);
+ if (STAILQ_EMPTY(&ctrlr->io_producers)) {
+ nvme_io_msg_ctrlr_detach(ctrlr);
+ }
+}
diff --git a/src/spdk/lib/nvme/nvme_io_msg.h b/src/spdk/lib/nvme/nvme_io_msg.h
new file mode 100644
index 000000000..9c18261d5
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_io_msg.h
@@ -0,0 +1,90 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * NVMe I/O message passing between external threads and the SPDK thread
+ * that owns the controller (used, for example, by the NVMe cuse layer).
+ */
+
+
+#ifndef SPDK_NVME_IO_MSG_H_
+#define SPDK_NVME_IO_MSG_H_
+
+typedef void (*spdk_nvme_io_msg_fn)(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ void *arg);
+
+struct spdk_nvme_io_msg {
+ struct spdk_nvme_ctrlr *ctrlr;
+ uint32_t nsid;
+
+ spdk_nvme_io_msg_fn fn;
+ void *arg;
+};
+
+struct nvme_io_msg_producer {
+ const char *name;
+ void (*update)(struct spdk_nvme_ctrlr *ctrlr);
+ void (*stop)(struct spdk_nvme_ctrlr *ctrlr);
+ STAILQ_ENTRY(nvme_io_msg_producer) link;
+};
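+
+/*
+ * Illustrative sketch, not part of the original header: how an external
+ * module (the NVMe cuse layer is one real user) might declare a producer.
+ * The names and the guard macro below are hypothetical.
+ */
+#ifdef NVME_IO_MSG_PRODUCER_EXAMPLE
+static void example_update(struct spdk_nvme_ctrlr *ctrlr) { /* refresh per-ctrlr state */ }
+static void example_stop(struct spdk_nvme_ctrlr *ctrlr) { /* tear down per-ctrlr state */ }
+
+static struct nvme_io_msg_producer g_example_producer = {
+ .name = "example",
+ .update = example_update,
+ .stop = example_stop,
+};
+
+/* Registered with nvme_io_msg_ctrlr_register(ctrlr, &g_example_producer) and
+ * released with nvme_io_msg_ctrlr_unregister(ctrlr, &g_example_producer). */
+#endif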
+
+int nvme_io_msg_send(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_io_msg_fn fn,
+ void *arg);
+
+/**
+ * Process IO messages sent to the controller by external modules.
+ *
+ * This call processes requests from the ring, sending I/O to the allocated
+ * qpair or issuing admin commands in its context. It is non-blocking and is
+ * intended to be polled by an SPDK thread, providing a safe environment for
+ * completing NVMe requests submitted to the controller by external modules.
+ *
+ * The caller must ensure that each controller is polled by only one thread at
+ * a time.
+ *
+ * This function may be called at any point while the controller is attached to
+ * the SPDK NVMe driver.
+ *
+ * \param ctrlr Opaque handle to NVMe controller.
+ *
+ * \return number of processed external IO messages.
+ */
+int nvme_io_msg_process(struct spdk_nvme_ctrlr *ctrlr);
+
+int nvme_io_msg_ctrlr_register(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_io_msg_producer *io_msg_producer);
+void nvme_io_msg_ctrlr_unregister(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_io_msg_producer *io_msg_producer);
+void nvme_io_msg_ctrlr_detach(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_io_msg_ctrlr_update(struct spdk_nvme_ctrlr *ctrlr);
+
+#endif /* SPDK_NVME_IO_MSG_H_ */
diff --git a/src/spdk/lib/nvme/nvme_ns.c b/src/spdk/lib/nvme/nvme_ns.c
new file mode 100644
index 000000000..5d424e5c7
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ns.c
@@ -0,0 +1,401 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+static inline struct spdk_nvme_ns_data *
+_nvme_ns_get_data(struct spdk_nvme_ns *ns)
+{
+ return &ns->ctrlr->nsdata[ns->id - 1];
+}
+
+/**
+ * Update namespace flags based on Identify Controller
+ * and Identify Namespace data. This can also be used for
+ * Namespace Attribute Notice events and namespace
+ * operations such as Attach/Detach.
+ */
+void
+nvme_ns_set_identify_data(struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ns_data *nsdata;
+
+ nsdata = _nvme_ns_get_data(ns);
+
+ ns->flags = 0x0000;
+
+ ns->sector_size = 1 << nsdata->lbaf[nsdata->flbas.format].lbads;
+ ns->extended_lba_size = ns->sector_size;
+
+ ns->md_size = nsdata->lbaf[nsdata->flbas.format].ms;
+ if (nsdata->flbas.extended) {
+ ns->flags |= SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED;
+ ns->extended_lba_size += ns->md_size;
+ }
+
+ ns->sectors_per_max_io = spdk_nvme_ns_get_max_io_xfer_size(ns) / ns->extended_lba_size;
+
+ if (nsdata->noiob) {
+ ns->sectors_per_stripe = nsdata->noiob;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u optimal IO boundary %" PRIu32 " blocks\n",
+ ns->id, ns->sectors_per_stripe);
+ } else if (ns->ctrlr->quirks & NVME_INTEL_QUIRK_STRIPING &&
+ ns->ctrlr->cdata.vs[3] != 0) {
+ ns->sectors_per_stripe = (1ULL << ns->ctrlr->cdata.vs[3]) * ns->ctrlr->min_page_size /
+ ns->sector_size;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u stripe size quirk %" PRIu32 " blocks\n",
+ ns->id, ns->sectors_per_stripe);
+ } else {
+ ns->sectors_per_stripe = 0;
+ }
+
+ if (ns->ctrlr->cdata.oncs.dsm) {
+ ns->flags |= SPDK_NVME_NS_DEALLOCATE_SUPPORTED;
+ }
+
+ if (ns->ctrlr->cdata.oncs.compare) {
+ ns->flags |= SPDK_NVME_NS_COMPARE_SUPPORTED;
+ }
+
+ if (ns->ctrlr->cdata.vwc.present) {
+ ns->flags |= SPDK_NVME_NS_FLUSH_SUPPORTED;
+ }
+
+ if (ns->ctrlr->cdata.oncs.write_zeroes) {
+ ns->flags |= SPDK_NVME_NS_WRITE_ZEROES_SUPPORTED;
+ }
+
+ if (ns->ctrlr->cdata.oncs.write_unc) {
+ ns->flags |= SPDK_NVME_NS_WRITE_UNCORRECTABLE_SUPPORTED;
+ }
+
+ if (nsdata->nsrescap.raw) {
+ ns->flags |= SPDK_NVME_NS_RESERVATION_SUPPORTED;
+ }
+
+ ns->pi_type = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE;
+ if (nsdata->lbaf[nsdata->flbas.format].ms && nsdata->dps.pit) {
+ ns->flags |= SPDK_NVME_NS_DPS_PI_SUPPORTED;
+ ns->pi_type = nsdata->dps.pit;
+ }
+}
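+
+/*
+ * Worked example (illustrative, not part of the original patch): with
+ * LBADS = 9 and MS = 8 in the selected LBA format, sector_size is
+ * 1 << 9 = 512 bytes. If FLBAS.extended is set, the metadata is carried
+ * inline and extended_lba_size becomes 512 + 8 = 520 bytes; with a
+ * max_xfer_size of 128 KiB that gives sectors_per_max_io =
+ * 131072 / 520 = 252 sectors (integer division).
+ */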
+
+static int
+nvme_ctrlr_identify_ns(struct spdk_nvme_ns *ns)
+{
+ struct nvme_completion_poll_status *status;
+ struct spdk_nvme_ns_data *nsdata;
+ int rc;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ nsdata = _nvme_ns_get_data(ns);
+ rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id,
+ nsdata, sizeof(*nsdata),
+ nvme_completion_poll_cb, status);
+ if (rc != 0) {
+ free(status);
+ return rc;
+ }
+
+ if (nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, status,
+ &ns->ctrlr->ctrlr_lock)) {
+ if (!status->timed_out) {
+ free(status);
+ }
+ /* This can occur if the namespace is not active. Simply zero the
+ * namespace data and continue. */
+ nvme_ns_destruct(ns);
+ return 0;
+ }
+ free(status);
+
+ nvme_ns_set_identify_data(ns);
+
+ return 0;
+}
+
+static int
+nvme_ctrlr_identify_id_desc(struct spdk_nvme_ns *ns)
+{
+ struct nvme_completion_poll_status *status;
+ int rc;
+
+ memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list));
+
+ if (ns->ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) ||
+ (ns->ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n");
+ return 0;
+ }
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Attempting to retrieve NS ID Descriptor List\n");
+ rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, 0, ns->id,
+ ns->id_desc_list, sizeof(ns->id_desc_list),
+ nvme_completion_poll_cb, status);
+ if (rc < 0) {
+ free(status);
+ return rc;
+ }
+
+ rc = nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, status, &ns->ctrlr->ctrlr_lock);
+ if (rc != 0) {
+ SPDK_WARNLOG("Failed to retrieve NS ID Descriptor List\n");
+ memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list));
+ }
+
+ if (!status->timed_out) {
+ free(status);
+ }
+
+ return rc;
+}
+
+uint32_t
+spdk_nvme_ns_get_id(struct spdk_nvme_ns *ns)
+{
+ return ns->id;
+}
+
+bool
+spdk_nvme_ns_is_active(struct spdk_nvme_ns *ns)
+{
+ const struct spdk_nvme_ns_data *nsdata = NULL;
+
+ /*
+ * According to the spec, a valid NS has a non-zero ID.
+ */
+ if (ns->id == 0) {
+ return false;
+ }
+
+ nsdata = _nvme_ns_get_data(ns);
+
+ /*
+ * According to the spec, Identify Namespace will return a zero-filled structure for
+ * inactive namespace IDs.
+ * Check NCAP since it must be nonzero for an active namespace.
+ */
+ return nsdata->ncap != 0;
+}
+
+struct spdk_nvme_ctrlr *
+spdk_nvme_ns_get_ctrlr(struct spdk_nvme_ns *ns)
+{
+ return ns->ctrlr;
+}
+
+uint32_t
+spdk_nvme_ns_get_max_io_xfer_size(struct spdk_nvme_ns *ns)
+{
+ return ns->ctrlr->max_xfer_size;
+}
+
+uint32_t
+spdk_nvme_ns_get_sector_size(struct spdk_nvme_ns *ns)
+{
+ return ns->sector_size;
+}
+
+uint32_t
+spdk_nvme_ns_get_extended_sector_size(struct spdk_nvme_ns *ns)
+{
+ return ns->extended_lba_size;
+}
+
+uint64_t
+spdk_nvme_ns_get_num_sectors(struct spdk_nvme_ns *ns)
+{
+ return _nvme_ns_get_data(ns)->nsze;
+}
+
+uint64_t
+spdk_nvme_ns_get_size(struct spdk_nvme_ns *ns)
+{
+ return spdk_nvme_ns_get_num_sectors(ns) * spdk_nvme_ns_get_sector_size(ns);
+}
+
+uint32_t
+spdk_nvme_ns_get_flags(struct spdk_nvme_ns *ns)
+{
+ return ns->flags;
+}
+
+enum spdk_nvme_pi_type
+spdk_nvme_ns_get_pi_type(struct spdk_nvme_ns *ns) {
+ return ns->pi_type;
+}
+
+bool
+spdk_nvme_ns_supports_extended_lba(struct spdk_nvme_ns *ns)
+{
+ return (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) ? true : false;
+}
+
+bool
+spdk_nvme_ns_supports_compare(struct spdk_nvme_ns *ns)
+{
+ return (ns->flags & SPDK_NVME_NS_COMPARE_SUPPORTED) ? true : false;
+}
+
+uint32_t
+spdk_nvme_ns_get_md_size(struct spdk_nvme_ns *ns)
+{
+ return ns->md_size;
+}
+
+const struct spdk_nvme_ns_data *
+spdk_nvme_ns_get_data(struct spdk_nvme_ns *ns)
+{
+ return _nvme_ns_get_data(ns);
+}
+
+enum spdk_nvme_dealloc_logical_block_read_value spdk_nvme_ns_get_dealloc_logical_block_read_value(
+ struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+ const struct spdk_nvme_ns_data *data = spdk_nvme_ns_get_data(ns);
+
+ if (ctrlr->quirks & NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE) {
+ return SPDK_NVME_DEALLOC_READ_00;
+ } else {
+ return data->dlfeat.bits.read_value;
+ }
+}
+
+uint32_t
+spdk_nvme_ns_get_optimal_io_boundary(struct spdk_nvme_ns *ns)
+{
+ return ns->sectors_per_stripe;
+}
+
+static const void *
+nvme_ns_find_id_desc(const struct spdk_nvme_ns *ns, enum spdk_nvme_nidt type, size_t *length)
+{
+ const struct spdk_nvme_ns_id_desc *desc;
+ size_t offset;
+
+ offset = 0;
+ while (offset + 4 < sizeof(ns->id_desc_list)) {
+ desc = (const struct spdk_nvme_ns_id_desc *)&ns->id_desc_list[offset];
+
+ if (desc->nidl == 0) {
+ /* End of list */
+ return NULL;
+ }
+
+ /*
+ * Check if this descriptor fits within the list.
+ * 4 is the fixed-size descriptor header (not counted in NIDL).
+ */
+ if (offset + desc->nidl + 4 > sizeof(ns->id_desc_list)) {
+ /* Descriptor longer than remaining space in list (invalid) */
+ return NULL;
+ }
+
+ if (desc->nidt == type) {
+ *length = desc->nidl;
+ return &desc->nid[0];
+ }
+
+ offset += 4 + desc->nidl;
+ }
+
+ return NULL;
+}
+
+const struct spdk_uuid *
+spdk_nvme_ns_get_uuid(const struct spdk_nvme_ns *ns)
+{
+ const struct spdk_uuid *uuid;
+ size_t uuid_size;
+
+ uuid = nvme_ns_find_id_desc(ns, SPDK_NVME_NIDT_UUID, &uuid_size);
+ if (uuid == NULL || uuid_size != sizeof(*uuid)) {
+ return NULL;
+ }
+
+ return uuid;
+}
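+
+/*
+ * Illustrative sketch, not part of the original patch: formatting the UUID
+ * descriptor of a namespace. It assumes spdk/uuid.h is available via
+ * nvme_internal.h; the function name and the guard macro are hypothetical,
+ * and a NULL return simply means no UUID descriptor was reported.
+ */
+#ifdef NVME_NS_UUID_EXAMPLE
+static void
+example_log_ns_uuid(struct spdk_nvme_ns *ns)
+{
+ const struct spdk_uuid *uuid = spdk_nvme_ns_get_uuid(ns);
+ char str[SPDK_UUID_STRING_LEN];
+
+ if (uuid == NULL) {
+ SPDK_NOTICELOG("ns %u reports no UUID descriptor\n", spdk_nvme_ns_get_id(ns));
+ return;
+ }
+
+ spdk_uuid_fmt_lower(str, sizeof(str), uuid);
+ SPDK_NOTICELOG("ns %u uuid: %s\n", spdk_nvme_ns_get_id(ns), str);
+}
+#endif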
+
+int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id,
+ struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ assert(id > 0);
+
+ ns->ctrlr = ctrlr;
+ ns->id = id;
+
+ rc = nvme_ctrlr_identify_ns(ns);
+ if (rc != 0) {
+ return rc;
+ }
+
+ return nvme_ctrlr_identify_id_desc(ns);
+}
+
+void nvme_ns_destruct(struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ns_data *nsdata;
+
+ if (!ns->id) {
+ return;
+ }
+
+ nsdata = _nvme_ns_get_data(ns);
+ memset(nsdata, 0, sizeof(*nsdata));
+ ns->sector_size = 0;
+ ns->extended_lba_size = 0;
+ ns->md_size = 0;
+ ns->pi_type = 0;
+ ns->sectors_per_max_io = 0;
+ ns->sectors_per_stripe = 0;
+ ns->flags = 0;
+}
+
+int nvme_ns_update(struct spdk_nvme_ns *ns)
+{
+ return nvme_ctrlr_identify_ns(ns);
+}
diff --git a/src/spdk/lib/nvme/nvme_ns_cmd.c b/src/spdk/lib/nvme/nvme_ns_cmd.c
new file mode 100644
index 000000000..eaa825fa8
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ns_cmd.c
@@ -0,0 +1,1074 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+static inline struct nvme_request *_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg, uint32_t opc, uint32_t io_flags,
+ uint16_t apptag_mask, uint16_t apptag, bool check_sgl);
+
+
+static bool
+nvme_ns_check_request_length(uint32_t lba_count, uint32_t sectors_per_max_io,
+ uint32_t sectors_per_stripe, uint32_t qdepth)
+{
+ uint32_t child_per_io = UINT32_MAX;
+
+ /* After a namespace is destroyed (e.g. by hotplug), all of the fields associated with
+ * the namespace are cleared to zero. In that case this function returns true,
+ * and -EINVAL is returned to the caller.
+ */
+ if (sectors_per_stripe > 0) {
+ child_per_io = (lba_count + sectors_per_stripe - 1) / sectors_per_stripe;
+ } else if (sectors_per_max_io > 0) {
+ child_per_io = (lba_count + sectors_per_max_io - 1) / sectors_per_max_io;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "checking maximum i/o length %u\n", child_per_io);
+
+ return child_per_io >= qdepth;
+}
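+
+/*
+ * Worked example (illustrative, not part of the original patch): with
+ * lba_count = 2048, sectors_per_max_io = 256 and no striping, the I/O would
+ * split into (2048 + 255) / 256 = 8 children. If the qpair was created with
+ * io_queue_requests (qdepth) of 8 or less, the check above fires and the
+ * caller reports -EINVAL (the I/O can never be satisfied) rather than the
+ * transient -ENOMEM.
+ */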
+
+static struct nvme_request *
+_nvme_add_child_request(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload,
+ uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag,
+ struct nvme_request *parent, bool check_sgl)
+{
+ struct nvme_request *child;
+
+ child = _nvme_ns_cmd_rw(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, cb_fn,
+ cb_arg, opc, io_flags, apptag_mask, apptag, check_sgl);
+ if (child == NULL) {
+ nvme_request_free_children(parent);
+ nvme_free_request(parent);
+ return NULL;
+ }
+
+ nvme_request_add_child(parent, child);
+ return child;
+}
+
+static struct nvme_request *
+_nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload,
+ uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, struct nvme_request *req,
+ uint32_t sectors_per_max_io, uint32_t sector_mask,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ uint32_t sector_size;
+ uint32_t md_size = ns->md_size;
+ uint32_t remaining_lba_count = lba_count;
+ struct nvme_request *child;
+
+ sector_size = ns->extended_lba_size;
+
+ if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
+ (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
+ (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
+ (md_size == 8)) {
+ sector_size -= 8;
+ }
+
+ while (remaining_lba_count > 0) {
+ lba_count = sectors_per_max_io - (lba & sector_mask);
+ lba_count = spdk_min(remaining_lba_count, lba_count);
+
+ child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset,
+ lba, lba_count, cb_fn, cb_arg, opc,
+ io_flags, apptag_mask, apptag, req, true);
+ if (child == NULL) {
+ return NULL;
+ }
+
+ remaining_lba_count -= lba_count;
+ lba += lba_count;
+ payload_offset += lba_count * sector_size;
+ md_offset += lba_count * md_size;
+ }
+
+ return req;
+}
+
+static inline bool
+_is_io_flags_valid(uint32_t io_flags)
+{
+ if (io_flags & ~SPDK_NVME_IO_FLAGS_VALID_MASK) {
+ /* Invalid io_flags */
+ SPDK_ERRLOG("Invalid io_flags 0x%x\n", io_flags);
+ return false;
+ }
+
+ return true;
+}
+
+static void
+_nvme_ns_cmd_setup_request(struct spdk_nvme_ns *ns, struct nvme_request *req,
+ uint32_t opc, uint64_t lba, uint32_t lba_count,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
+{
+ struct spdk_nvme_cmd *cmd;
+
+ assert(_is_io_flags_valid(io_flags));
+
+ cmd = &req->cmd;
+ cmd->opc = opc;
+ cmd->nsid = ns->id;
+
+ *(uint64_t *)&cmd->cdw10 = lba;
+
+ if (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
+ switch (ns->pi_type) {
+ case SPDK_NVME_FMT_NVM_PROTECTION_TYPE1:
+ case SPDK_NVME_FMT_NVM_PROTECTION_TYPE2:
+ cmd->cdw14 = (uint32_t)lba;
+ break;
+ }
+ }
+
+ cmd->fuse = (io_flags & SPDK_NVME_IO_FLAGS_FUSE_MASK);
+
+ cmd->cdw12 = lba_count - 1;
+ cmd->cdw12 |= (io_flags & SPDK_NVME_IO_FLAGS_CDW12_MASK);
+
+ cmd->cdw15 = apptag_mask;
+ cmd->cdw15 = (cmd->cdw15 << 16 | apptag);
+}
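+
+/*
+ * Worked example (illustrative, not part of the original patch): for
+ * lba = 0x100000010 and lba_count = 8, the 64-bit starting LBA lands in
+ * cdw10 = 0x00000010 and cdw11 = 0x00000001, cdw12 carries the 0-based
+ * NLB (7) OR'd with any CDW12 io_flags, and for PI types 1 and 2 cdw14
+ * holds the low 32 bits of the LBA (0x00000010) as the initial reference
+ * tag. cdw15 packs apptag_mask in its upper 16 bits and apptag in the lower.
+ */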
+
+static struct nvme_request *
+_nvme_ns_cmd_split_request_prp(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload,
+ uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, struct nvme_request *req,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn;
+ spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn;
+ void *sgl_cb_arg = req->payload.contig_or_cb_arg;
+ bool start_valid, end_valid, last_sge, child_equals_parent;
+ uint64_t child_lba = lba;
+ uint32_t req_current_length = 0;
+ uint32_t child_length = 0;
+ uint32_t sge_length;
+ uint32_t page_size = qpair->ctrlr->page_size;
+ uintptr_t address;
+
+ reset_sgl_fn(sgl_cb_arg, payload_offset);
+ next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length);
+ while (req_current_length < req->payload_size) {
+
+ if (sge_length == 0) {
+ continue;
+ } else if (req_current_length + sge_length > req->payload_size) {
+ sge_length = req->payload_size - req_current_length;
+ }
+
+ /*
+ * The start of the SGE is invalid if the start address is not page aligned,
+ * unless it is the first SGE in the child request.
+ */
+ start_valid = child_length == 0 || _is_page_aligned(address, page_size);
+
+ /* Boolean for whether this is the last SGE in the parent request. */
+ last_sge = (req_current_length + sge_length == req->payload_size);
+
+ /*
+ * The end of the SGE is invalid if the end address is not page aligned,
+ * unless it is the last SGE in the parent request.
+ */
+ end_valid = last_sge || _is_page_aligned(address + sge_length, page_size);
+
+ /*
+ * This child request equals the parent request, meaning that no splitting
+ * was required for the parent request (the one passed into this function).
+ * In this case, we do not create a child request at all - we just send
+ * the original request as a single request at the end of this function.
+ */
+ child_equals_parent = (child_length + sge_length == req->payload_size);
+
+ if (start_valid) {
+ /*
+ * The start of the SGE is valid, so advance the length parameters,
+ * to include this SGE with previous SGEs for this child request
+ * (if any). If it is not valid, we do not advance the length
+ * parameters nor get the next SGE, because we must send what has
+ * been collected before this SGE as a child request.
+ */
+ child_length += sge_length;
+ req_current_length += sge_length;
+ if (req_current_length < req->payload_size) {
+ next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length);
+ }
+ /*
+ * If the next SGE is not page aligned, we will need to create a child
+ * request for what we have so far, and then start a new child request for
+ * the next SGE.
+ */
+ start_valid = _is_page_aligned(address, page_size);
+ }
+
+ if (start_valid && end_valid && !last_sge) {
+ continue;
+ }
+
+ /*
+ * We need to create a split here. Send what we have accumulated so far as a child
+ * request. Checking if child_equals_parent allows us to *not* create a child request
+ * when no splitting is required - in that case we will fall-through and just create
+ * a single request with no children for the entire I/O.
+ */
+ if (!child_equals_parent) {
+ struct nvme_request *child;
+ uint32_t child_lba_count;
+
+ if ((child_length % ns->extended_lba_size) != 0) {
+ SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n",
+ child_length, ns->extended_lba_size);
+ return NULL;
+ }
+ child_lba_count = child_length / ns->extended_lba_size;
+ /*
+ * Note the last parameter is set to "false" - this tells the recursive
+ * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting
+ * since we have already verified it here.
+ */
+ child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset,
+ child_lba, child_lba_count,
+ cb_fn, cb_arg, opc, io_flags,
+ apptag_mask, apptag, req, false);
+ if (child == NULL) {
+ return NULL;
+ }
+ payload_offset += child_length;
+ md_offset += child_lba_count * ns->md_size;
+ child_lba += child_lba_count;
+ child_length = 0;
+ }
+ }
+
+ if (child_length == req->payload_size) {
+ /* No splitting was required, so setup the whole payload as one request. */
+ _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
+ }
+
+ return req;
+}
+
+static struct nvme_request *
+_nvme_ns_cmd_split_request_sgl(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload,
+ uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, struct nvme_request *req,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn;
+ spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn;
+ void *sgl_cb_arg = req->payload.contig_or_cb_arg;
+ uint64_t child_lba = lba;
+ uint32_t req_current_length = 0;
+ uint32_t child_length = 0;
+ uint32_t sge_length;
+ uint16_t max_sges, num_sges;
+ uintptr_t address;
+
+ max_sges = ns->ctrlr->max_sges;
+
+ reset_sgl_fn(sgl_cb_arg, payload_offset);
+ num_sges = 0;
+
+ while (req_current_length < req->payload_size) {
+ next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length);
+
+ if (req_current_length + sge_length > req->payload_size) {
+ sge_length = req->payload_size - req_current_length;
+ }
+
+ child_length += sge_length;
+ req_current_length += sge_length;
+ num_sges++;
+
+ if (num_sges < max_sges && req_current_length < req->payload_size) {
+ continue;
+ }
+
+ /*
+ * We need to create a split here. Send what we have accumulated so far as a child
+ * request. Checking if the child equals the full payload allows us to *not*
+ * create a child request when no splitting is required - in that case we will
+ * fall-through and just create a single request with no children for the entire I/O.
+ */
+ if (child_length != req->payload_size) {
+ struct nvme_request *child;
+ uint32_t child_lba_count;
+
+ if ((child_length % ns->extended_lba_size) != 0) {
+ SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n",
+ child_length, ns->extended_lba_size);
+ return NULL;
+ }
+ child_lba_count = child_length / ns->extended_lba_size;
+ /*
+ * Note the last parameter is set to "false" - this tells the recursive
+ * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting
+ * since we have already verified it here.
+ */
+ child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset,
+ child_lba, child_lba_count,
+ cb_fn, cb_arg, opc, io_flags,
+ apptag_mask, apptag, req, false);
+ if (child == NULL) {
+ return NULL;
+ }
+ payload_offset += child_length;
+ md_offset += child_lba_count * ns->md_size;
+ child_lba += child_lba_count;
+ child_length = 0;
+ num_sges = 0;
+ }
+ }
+
+ if (child_length == req->payload_size) {
+ /* No splitting was required, so setup the whole payload as one request. */
+ _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
+ }
+
+ return req;
+}
+
+static inline struct nvme_request *
+_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl)
+{
+ struct nvme_request *req;
+ uint32_t sector_size;
+ uint32_t sectors_per_max_io;
+ uint32_t sectors_per_stripe;
+
+ sector_size = ns->extended_lba_size;
+ sectors_per_max_io = ns->sectors_per_max_io;
+ sectors_per_stripe = ns->sectors_per_stripe;
+
+ if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
+ (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
+ (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
+ (ns->md_size == 8)) {
+ sector_size -= 8;
+ }
+
+ req = nvme_allocate_request(qpair, payload, lba_count * sector_size, lba_count * ns->md_size,
+ cb_fn, cb_arg);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ req->payload_offset = payload_offset;
+ req->md_offset = md_offset;
+
+ /*
+ * Intel DC P3*00 NVMe controllers benefit from driver-assisted striping.
+ * If this controller defines a stripe boundary and this I/O spans a stripe
+ * boundary, split the request into multiple requests and submit each
+ * separately to hardware.
+ */
+ if (sectors_per_stripe > 0 &&
+ (((lba & (sectors_per_stripe - 1)) + lba_count) > sectors_per_stripe)) {
+
+ return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count,
+ cb_fn,
+ cb_arg, opc,
+ io_flags, req, sectors_per_stripe, sectors_per_stripe - 1, apptag_mask, apptag);
+ } else if (lba_count > sectors_per_max_io) {
+ return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count,
+ cb_fn,
+ cb_arg, opc,
+ io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag);
+ } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL && check_sgl) {
+ if (ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
+ return _nvme_ns_cmd_split_request_sgl(ns, qpair, payload, payload_offset, md_offset,
+ lba, lba_count, cb_fn, cb_arg, opc, io_flags,
+ req, apptag_mask, apptag);
+ } else {
+ return _nvme_ns_cmd_split_request_prp(ns, qpair, payload, payload_offset, md_offset,
+ lba, lba_count, cb_fn, cb_arg, opc, io_flags,
+ req, apptag_mask, apptag);
+ }
+ }
+
+ _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
+ return req;
+}
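+
+/*
+ * Worked example (illustrative, not part of the original patch): with
+ * sectors_per_stripe = 128, a request at lba = 120 for lba_count = 16
+ * crosses the stripe boundary at LBA 128 ((120 & 127) + 16 = 136 > 128),
+ * so it is split into two children: LBAs 120-127 and LBAs 128-135. The
+ * same 16-sector request at lba = 0 stays a single request.
+ */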
+
+int
+spdk_nvme_ns_cmd_compare(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer,
+ uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg,
+ SPDK_NVME_OPC_COMPARE,
+ io_flags, 0,
+ 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_compare_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ void *buffer,
+ void *metadata,
+ uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg,
+ SPDK_NVME_OPC_COMPARE,
+ io_flags,
+ apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg,
+ SPDK_NVME_OPC_COMPARE,
+ io_flags, 0, 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_comparev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg,
+ SPDK_NVME_OPC_COMPARE, io_flags, apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer,
+ uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
+ io_flags, 0,
+ 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
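+
+/*
+ * Illustrative sketch, not part of the original patch: issuing a single read
+ * and polling for its completion. The helper, callback and guard macro are
+ * hypothetical, and 'buf' must be DMA-able memory (e.g. from spdk_malloc())
+ * in a real application.
+ */
+#ifdef NVME_NS_CMD_READ_EXAMPLE
+static void
+example_read_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ bool *done = arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("read failed: sct %d sc %d\n", cpl->status.sct, cpl->status.sc);
+ }
+ *done = true;
+}
+
+static int
+example_read_one_block(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buf)
+{
+ bool done = false;
+ int rc;
+
+ rc = spdk_nvme_ns_cmd_read(ns, qpair, buf, 0 /* lba */, 1 /* lba_count */,
+ example_read_done, &done, 0 /* io_flags */);
+ if (rc != 0) {
+ /* -EINVAL: the I/O would need more child requests than the qpair owns;
+ * -ENOMEM: transient request exhaustion, retry later. */
+ return rc;
+ }
+
+ while (!done) {
+ spdk_nvme_qpair_process_completions(qpair, 0);
+ }
+
+ return 0;
+}
+#endif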
+
+int
+spdk_nvme_ns_cmd_read_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer,
+ void *metadata,
+ uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
+ io_flags,
+ apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
+ io_flags, 0, 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_readv_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
+ io_flags, apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_write(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ void *buffer, uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
+ io_flags, 0, 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_write_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ void *buffer, void *metadata, uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
+ io_flags, apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
+ io_flags, 0, 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_writev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
+ io_flags, apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ uint64_t *tmp_lba;
+
+ if (!_is_io_flags_valid(io_flags)) {
+ return -EINVAL;
+ }
+
+ if (lba_count == 0 || lba_count > UINT16_MAX + 1) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_WRITE_ZEROES;
+ cmd->nsid = ns->id;
+
+ tmp_lba = (uint64_t *)&cmd->cdw10;
+ *tmp_lba = lba;
+ cmd->cdw12 = lba_count - 1;
+ cmd->fuse = (io_flags & SPDK_NVME_IO_FLAGS_FUSE_MASK);
+ cmd->cdw12 |= (io_flags & SPDK_NVME_IO_FLAGS_CDW12_MASK);
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_write_uncorrectable(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ uint64_t *tmp_lba;
+
+ if (lba_count == 0 || lba_count > UINT16_MAX + 1) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_WRITE_UNCORRECTABLE;
+ cmd->nsid = ns->id;
+
+ tmp_lba = (uint64_t *)&cmd->cdw10;
+ *tmp_lba = lba;
+ cmd->cdw12 = lba_count - 1;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_dataset_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint32_t type,
+ const struct spdk_nvme_dsm_range *ranges, uint16_t num_ranges,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ if (num_ranges == 0 || num_ranges > SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES) {
+ return -EINVAL;
+ }
+
+ if (ranges == NULL) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_user_copy(qpair, (void *)ranges,
+ num_ranges * sizeof(struct spdk_nvme_dsm_range),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_DATASET_MANAGEMENT;
+ cmd->nsid = ns->id;
+
+ cmd->cdw10_bits.dsm.nr = num_ranges - 1;
+ cmd->cdw11 = type;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
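+
+/*
+ * Illustrative sketch, not part of the original patch: deallocating (TRIM) a
+ * single LBA range. The helper name and guard macro are hypothetical, and the
+ * SPDK_NVME_DSM_ATTR_DEALLOCATE attribute and spdk_nvme_dsm_range fields are
+ * assumed from the public nvme_spec.h; the range array is copied by
+ * nvme_allocate_request_user_copy(), so it can live on the caller's stack.
+ */
+#ifdef NVME_NS_CMD_DSM_EXAMPLE
+static int
+example_deallocate_range(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct spdk_nvme_dsm_range range = {
+ .starting_lba = lba,
+ .length = lba_count,
+ };
+
+ return spdk_nvme_ns_cmd_dataset_management(ns, qpair,
+ SPDK_NVME_DSM_ATTR_DEALLOCATE,
+ &range, 1, cb_fn, cb_arg);
+}
+#endif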
+
+int
+spdk_nvme_ns_cmd_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_FLUSH;
+ cmd->nsid = ns->id;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_reservation_register(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_reservation_register_data *payload,
+ bool ignore_key,
+ enum spdk_nvme_reservation_register_action action,
+ enum spdk_nvme_reservation_register_cptpl cptpl,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_user_copy(qpair,
+ payload, sizeof(struct spdk_nvme_reservation_register_data),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_RESERVATION_REGISTER;
+ cmd->nsid = ns->id;
+
+ cmd->cdw10_bits.resv_register.rrega = action;
+ cmd->cdw10_bits.resv_register.iekey = ignore_key;
+ cmd->cdw10_bits.resv_register.cptpl = cptpl;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_reservation_release(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_reservation_key_data *payload,
+ bool ignore_key,
+ enum spdk_nvme_reservation_release_action action,
+ enum spdk_nvme_reservation_type type,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_user_copy(qpair,
+ payload, sizeof(struct spdk_nvme_reservation_key_data), cb_fn,
+ cb_arg, true);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_RESERVATION_RELEASE;
+ cmd->nsid = ns->id;
+
+ cmd->cdw10_bits.resv_release.rrela = action;
+ cmd->cdw10_bits.resv_release.iekey = ignore_key;
+ cmd->cdw10_bits.resv_release.rtype = type;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_reservation_acquire(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_reservation_acquire_data *payload,
+ bool ignore_key,
+ enum spdk_nvme_reservation_acquire_action action,
+ enum spdk_nvme_reservation_type type,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_user_copy(qpair,
+ payload, sizeof(struct spdk_nvme_reservation_acquire_data),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_RESERVATION_ACQUIRE;
+ cmd->nsid = ns->id;
+
+ cmd->cdw10_bits.resv_acquire.racqa = action;
+ cmd->cdw10_bits.resv_acquire.iekey = ignore_key;
+ cmd->cdw10_bits.resv_acquire.rtype = type;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_reservation_report(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *payload, uint32_t len,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ uint32_t num_dwords;
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ if (len % 4) {
+ return -EINVAL;
+ }
+ num_dwords = len / 4;
+
+ req = nvme_allocate_request_user_copy(qpair, payload, len, cb_fn, cb_arg, false);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_RESERVATION_REPORT;
+ cmd->nsid = ns->id;
+
+ cmd->cdw10 = num_dwords;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
diff --git a/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c
new file mode 100644
index 000000000..f60aa6789
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c
@@ -0,0 +1,233 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/nvme_ocssd.h"
+#include "nvme_internal.h"
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_reset(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ uint64_t *lba_list, uint32_t num_lbas,
+ struct spdk_ocssd_chunk_information_entry *chunk_info,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ if (!lba_list || (num_lbas == 0) ||
+ (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_OCSSD_OPC_VECTOR_RESET;
+ cmd->nsid = ns->id;
+
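+ /* When a chunk information buffer is supplied, its address goes into the metadata pointer so the device can return the updated chunk state. */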
+ if (chunk_info != NULL) {
+ cmd->mptr = spdk_vtophys(chunk_info, NULL);
+ }
+
+ /*
+ * Dword 10 and 11 store a pointer to the list of logical block addresses.
+ * If there is a single entry in the LBA list, the logical block
+ * address should be stored instead.
+ */
+ if (num_lbas == 1) {
+ *(uint64_t *)&cmd->cdw10 = *lba_list;
+ } else {
+ *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list, NULL);
+ }
+
+ cmd->cdw12 = num_lbas - 1;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+static int
+_nvme_ocssd_ns_cmd_vector_rw_with_md(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer, void *metadata,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ enum spdk_ocssd_io_opcode opc,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ struct nvme_payload payload;
+ uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY;
+
+ if (io_flags & ~valid_flags) {
+ return -EINVAL;
+ }
+
+ if (!buffer || !lba_list || (num_lbas == 0) ||
+ (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, metadata);
+
+ req = nvme_allocate_request(qpair, &payload, num_lbas * ns->sector_size, num_lbas * ns->md_size,
+ cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = opc;
+ cmd->nsid = ns->id;
+
+ /*
+ * Dword 10 and 11 store a pointer to the list of logical block addresses.
+ * If there is a single entry in the LBA list, the logical block
+ * address should be stored instead.
+ */
+ if (num_lbas == 1) {
+ *(uint64_t *)&cmd->cdw10 = *lba_list;
+ } else {
+ *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list, NULL);
+ }
+
+ cmd->cdw12 = num_lbas - 1;
+ cmd->cdw12 |= io_flags;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_write_with_md(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer, void *metadata,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list,
+ num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_write(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list,
+ num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_read_with_md(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer, void *metadata,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list,
+ num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_read(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list,
+ num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_copy(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ uint64_t *dst_lba_list,
+ uint64_t *src_lba_list,
+ uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY;
+
+ if (io_flags & ~valid_flags) {
+ return -EINVAL;
+ }
+
+ if (!dst_lba_list || !src_lba_list || (num_lbas == 0) ||
+ (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_OCSSD_OPC_VECTOR_COPY;
+ cmd->nsid = ns->id;
+
+ /*
+ * Dword 10 and 11 store a pointer to the list of source logical
+ * block addresses.
+ * Dword 14 and 15 store a pointer to the list of destination logical
+ * block addresses.
+ * If there is a single entry in the LBA list, the logical block
+ * address should be stored instead.
+ */
+ if (num_lbas == 1) {
+ *(uint64_t *)&cmd->cdw10 = *src_lba_list;
+ *(uint64_t *)&cmd->cdw14 = *dst_lba_list;
+ } else {
+ *(uint64_t *)&cmd->cdw10 = spdk_vtophys(src_lba_list, NULL);
+ *(uint64_t *)&cmd->cdw14 = spdk_vtophys(dst_lba_list, NULL);
+ }
+
+ cmd->cdw12 = num_lbas - 1;
+ cmd->cdw12 |= io_flags;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
diff --git a/src/spdk/lib/nvme/nvme_opal.c b/src/spdk/lib/nvme/nvme_opal.c
new file mode 100644
index 000000000..e0a3aa7fa
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_opal.c
@@ -0,0 +1,2566 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "spdk/opal.h"
+#include "spdk_internal/log.h"
+#include "spdk/util.h"
+
+#include "nvme_opal_internal.h"
+
+static void
+opal_nvme_security_recv_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct opal_session *sess = arg;
+ struct spdk_opal_dev *dev = sess->dev;
+ void *response = sess->resp;
+ struct spdk_opal_compacket *header = response;
+ int ret;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ sess->sess_cb(sess, -EIO, sess->cb_arg);
+ return;
+ }
+
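+ /* Non-zero outstanding_data/min_transfer means the TPer has more response data; issue another Security Receive. */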
+ if (!header->outstanding_data && !header->min_transfer) {
+ sess->sess_cb(sess, 0, sess->cb_arg);
+ return;
+ }
+
+ memset(response, 0, IO_BUFFER_LENGTH);
+ ret = spdk_nvme_ctrlr_cmd_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG,
+ dev->comid, 0, sess->resp, IO_BUFFER_LENGTH,
+ opal_nvme_security_recv_done, sess);
+ if (ret) {
+ sess->sess_cb(sess, ret, sess->cb_arg);
+ }
+}
+
+static void
+opal_nvme_security_send_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct opal_session *sess = arg;
+ struct spdk_opal_dev *dev = sess->dev;
+ int ret;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ sess->sess_cb(sess, -EIO, sess->cb_arg);
+ return;
+ }
+
+ ret = spdk_nvme_ctrlr_cmd_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG,
+ dev->comid, 0, sess->resp, IO_BUFFER_LENGTH,
+ opal_nvme_security_recv_done, sess);
+ if (ret) {
+ sess->sess_cb(sess, ret, sess->cb_arg);
+ }
+}
+
+static int
+opal_nvme_security_send(struct spdk_opal_dev *dev, struct opal_session *sess,
+ opal_sess_cb sess_cb, void *cb_arg)
+{
+ sess->sess_cb = sess_cb;
+ sess->cb_arg = cb_arg;
+
+ return spdk_nvme_ctrlr_cmd_security_send(dev->ctrlr, SPDK_SCSI_SECP_TCG, dev->comid,
+ 0, sess->cmd, IO_BUFFER_LENGTH,
+ opal_nvme_security_send_done, sess);
+}
+
+static void
+opal_send_recv_done(struct opal_session *sess, int status, void *ctx)
+{
+ sess->status = status;
+ sess->done = true;
+}
+
+static int
+opal_send_recv(struct spdk_opal_dev *dev, struct opal_session *sess)
+{
+ int ret;
+
+ sess->done = false;
+ ret = opal_nvme_security_send(dev, sess, opal_send_recv_done, NULL);
+ if (ret) {
+ return ret;
+ }
+
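+ /* Synchronous helper: poll admin completions until the security send/receive callback chain marks the session done. */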
+ while (!sess->done) {
+ spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr);
+ }
+
+ return sess->status;
+}
+
+static struct opal_session *
+opal_alloc_session(struct spdk_opal_dev *dev)
+{
+ struct opal_session *sess;
+
+ sess = calloc(1, sizeof(*sess));
+ if (!sess) {
+ return NULL;
+ }
+ sess->dev = dev;
+
+ return sess;
+}
+
+static void
+opal_add_token_u8(int *err, struct opal_session *sess, uint8_t token)
+{
+ if (*err) {
+ return;
+ }
+ if (sess->cmd_pos >= IO_BUFFER_LENGTH - 1) {
+ SPDK_ERRLOG("Error adding u8: end of buffer.\n");
+ *err = -ERANGE;
+ return;
+ }
+ sess->cmd[sess->cmd_pos++] = token;
+}
+
+static void
+opal_add_short_atom_header(struct opal_session *sess, bool bytestring,
+ bool has_sign, size_t len)
+{
+ uint8_t atom;
+ int err = 0;
+
+ atom = SPDK_SHORT_ATOM_ID;
+ atom |= bytestring ? SPDK_SHORT_ATOM_BYTESTRING_FLAG : 0;
+ atom |= has_sign ? SPDK_SHORT_ATOM_SIGN_FLAG : 0;
+ atom |= len & SPDK_SHORT_ATOM_LEN_MASK;
+
+ opal_add_token_u8(&err, sess, atom);
+}
+
+static void
+opal_add_medium_atom_header(struct opal_session *sess, bool bytestring,
+ bool has_sign, size_t len)
+{
+ uint8_t header;
+
+ header = SPDK_MEDIUM_ATOM_ID;
+ header |= bytestring ? SPDK_MEDIUM_ATOM_BYTESTRING_FLAG : 0;
+ header |= has_sign ? SPDK_MEDIUM_ATOM_SIGN_FLAG : 0;
+ header |= (len >> 8) & SPDK_MEDIUM_ATOM_LEN_MASK;
+ sess->cmd[sess->cmd_pos++] = header;
+ sess->cmd[sess->cmd_pos++] = len;
+}
+
+static void
+opal_add_token_bytestring(int *err, struct opal_session *sess,
+ const uint8_t *bytestring, size_t len)
+{
+ size_t header_len = 1;
+ bool is_short_atom = true;
+
+ if (*err) {
+ return;
+ }
+
+ if (len & ~SPDK_SHORT_ATOM_LEN_MASK) {
+ header_len = 2;
+ is_short_atom = false;
+ }
+
+ if (len >= IO_BUFFER_LENGTH - sess->cmd_pos - header_len) {
+ SPDK_ERRLOG("Error adding bytestring: end of buffer.\n");
+ *err = -ERANGE;
+ return;
+ }
+
+ if (is_short_atom) {
+ opal_add_short_atom_header(sess, true, false, len);
+ } else {
+ opal_add_medium_atom_header(sess, true, false, len);
+ }
+
+ memcpy(&sess->cmd[sess->cmd_pos], bytestring, len);
+ sess->cmd_pos += len;
+}
+
+static void
+opal_add_token_u64(int *err, struct opal_session *sess, uint64_t number)
+{
+ int startat = 0;
+
+ if (*err) {
+ return;
+ }
+
+ /* add header first */
+ if (number <= SPDK_TINY_ATOM_DATA_MASK) {
+ sess->cmd[sess->cmd_pos++] = (uint8_t) number & SPDK_TINY_ATOM_DATA_MASK;
+ } else {
+ if (number < 0x100) {
+ sess->cmd[sess->cmd_pos++] = 0x81; /* short atom, 1 byte length */
+ startat = 0;
+ } else if (number < 0x10000) {
+ sess->cmd[sess->cmd_pos++] = 0x82; /* short atom, 2 byte length */
+ startat = 1;
+ } else if (number < 0x100000000) {
+ sess->cmd[sess->cmd_pos++] = 0x84; /* short atom, 4 byte length */
+ startat = 3;
+ } else {
+ sess->cmd[sess->cmd_pos++] = 0x88; /* short atom, 8 byte length */
+ startat = 7;
+ }
+
+ /* add number value */
+ for (int i = startat; i > -1; i--) {
+ sess->cmd[sess->cmd_pos++] = (uint8_t)((number >> (i * 8)) & 0xff);
+ }
+ }
+}
+
+static void
+opal_add_tokens(int *err, struct opal_session *sess, int num, ...)
+{
+ int i;
+ va_list args_ptr;
+ enum spdk_opal_token tmp;
+
+ va_start(args_ptr, num);
+
+ for (i = 0; i < num; i++) {
+ tmp = va_arg(args_ptr, enum spdk_opal_token);
+ opal_add_token_u8(err, sess, tmp);
+ if (*err != 0) { break; }
+ }
+
+ va_end(args_ptr);
+}
+
+static int
+opal_cmd_finalize(struct opal_session *sess, uint32_t hsn, uint32_t tsn, bool eod)
+{
+ struct spdk_opal_header *hdr;
+ int err = 0;
+
+ if (eod) {
+ opal_add_tokens(&err, sess, 6, SPDK_OPAL_ENDOFDATA,
+ SPDK_OPAL_STARTLIST,
+ 0, 0, 0,
+ SPDK_OPAL_ENDLIST);
+ }
+
+ if (err) {
+ SPDK_ERRLOG("Error finalizing command.\n");
+ return -EFAULT;
+ }
+
+ hdr = (struct spdk_opal_header *)sess->cmd;
+
+ to_be32(&hdr->packet.session_tsn, tsn);
+ to_be32(&hdr->packet.session_hsn, hsn);
+
+ to_be32(&hdr->sub_packet.length, sess->cmd_pos - sizeof(*hdr));
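+ /* Pad the sub-packet payload to a 4-byte boundary. */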
+ while (sess->cmd_pos % 4) {
+ if (sess->cmd_pos >= IO_BUFFER_LENGTH) {
+ SPDK_ERRLOG("Error: Buffer overrun\n");
+ return -ERANGE;
+ }
+ sess->cmd[sess->cmd_pos++] = 0;
+ }
+ to_be32(&hdr->packet.length, sess->cmd_pos - sizeof(hdr->com_packet) -
+ sizeof(hdr->packet));
+ to_be32(&hdr->com_packet.length, sess->cmd_pos - sizeof(hdr->com_packet));
+
+ return 0;
+}
+
+static size_t
+opal_response_parse_tiny(struct spdk_opal_resp_token *token,
+ const uint8_t *pos)
+{
+ token->pos = pos;
+ token->len = 1;
+ token->width = OPAL_WIDTH_TINY;
+
+ if (pos[0] & SPDK_TINY_ATOM_SIGN_FLAG) {
+ token->type = OPAL_DTA_TOKENID_SINT;
+ } else {
+ token->type = OPAL_DTA_TOKENID_UINT;
+ token->stored.unsigned_num = pos[0] & SPDK_TINY_ATOM_DATA_MASK;
+ }
+
+ return token->len;
+}
+
+static int
+opal_response_parse_short(struct spdk_opal_resp_token *token,
+ const uint8_t *pos)
+{
+ token->pos = pos;
+ token->len = (pos[0] & SPDK_SHORT_ATOM_LEN_MASK) + 1; /* plus 1-byte header */
+ token->width = OPAL_WIDTH_SHORT;
+
+ if (pos[0] & SPDK_SHORT_ATOM_BYTESTRING_FLAG) {
+ token->type = OPAL_DTA_TOKENID_BYTESTRING;
+ } else if (pos[0] & SPDK_SHORT_ATOM_SIGN_FLAG) {
+ token->type = OPAL_DTA_TOKENID_SINT;
+ } else {
+ uint64_t u_integer = 0;
+ size_t i, b = 0;
+
+ token->type = OPAL_DTA_TOKENID_UINT;
+ if (token->len > 9) {
+ SPDK_ERRLOG("uint64 with more than 8 bytes\n");
+ return -EINVAL;
+ }
+ for (i = token->len - 1; i > 0; i--) {
+ u_integer |= ((uint64_t)pos[i] << (8 * b));
+ b++;
+ }
+ token->stored.unsigned_num = u_integer;
+ }
+
+ return token->len;
+}
+
+static size_t
+opal_response_parse_medium(struct spdk_opal_resp_token *token,
+ const uint8_t *pos)
+{
+ token->pos = pos;
+ token->len = (((pos[0] & SPDK_MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; /* plus 2-byte header */
+ token->width = OPAL_WIDTH_MEDIUM;
+
+ if (pos[0] & SPDK_MEDIUM_ATOM_BYTESTRING_FLAG) {
+ token->type = OPAL_DTA_TOKENID_BYTESTRING;
+ } else if (pos[0] & SPDK_MEDIUM_ATOM_SIGN_FLAG) {
+ token->type = OPAL_DTA_TOKENID_SINT;
+ } else {
+ token->type = OPAL_DTA_TOKENID_UINT;
+ }
+
+ return token->len;
+}
+
+static size_t
+opal_response_parse_long(struct spdk_opal_resp_token *token,
+ const uint8_t *pos)
+{
+ token->pos = pos;
+ token->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; /* plus 4-byte header */
+ token->width = OPAL_WIDTH_LONG;
+
+ if (pos[0] & SPDK_LONG_ATOM_BYTESTRING_FLAG) {
+ token->type = OPAL_DTA_TOKENID_BYTESTRING;
+ } else if (pos[0] & SPDK_LONG_ATOM_SIGN_FLAG) {
+ token->type = OPAL_DTA_TOKENID_SINT;
+ } else {
+ token->type = OPAL_DTA_TOKENID_UINT;
+ }
+
+ return token->len;
+}
+
+static size_t
+opal_response_parse_token(struct spdk_opal_resp_token *token,
+ const uint8_t *pos)
+{
+ token->pos = pos;
+ token->len = 1;
+ token->type = OPAL_DTA_TOKENID_TOKEN;
+ token->width = OPAL_WIDTH_TOKEN;
+
+ return token->len;
+}
+
+static int
+opal_response_parse(const uint8_t *buf, size_t length,
+ struct spdk_opal_resp_parsed *resp)
+{
+ const struct spdk_opal_header *hdr;
+ struct spdk_opal_resp_token *token_iter;
+ int num_entries = 0;
+ int total;
+ ssize_t token_length;
+ const uint8_t *pos;
+ uint32_t clen, plen, slen;
+
+ if (!buf || !resp) {
+ return -EINVAL;
+ }
+
+ hdr = (struct spdk_opal_header *)buf;
+ pos = buf + sizeof(*hdr);
+
+ clen = from_be32(&hdr->com_packet.length);
+ plen = from_be32(&hdr->packet.length);
+ slen = from_be32(&hdr->sub_packet.length);
+ SPDK_DEBUGLOG(SPDK_LOG_OPAL, "Response size: cp: %u, pkt: %u, subpkt: %u\n",
+ clen, plen, slen);
+
+ if (clen == 0 || plen == 0 || slen == 0 ||
+ slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
+ SPDK_ERRLOG("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+ clen, plen, slen);
+ return -EINVAL;
+ }
+
+ if (pos > buf + length) {
+ SPDK_ERRLOG("Pointer out of range\n");
+ return -EFAULT;
+ }
+
+ token_iter = resp->resp_tokens;
+ total = slen;
+
+ while (total > 0) {
+ if (pos[0] <= SPDK_TINY_ATOM_TYPE_MAX) { /* tiny atom */
+ token_length = opal_response_parse_tiny(token_iter, pos);
+ } else if (pos[0] <= SPDK_SHORT_ATOM_TYPE_MAX) { /* short atom */
+ token_length = opal_response_parse_short(token_iter, pos);
+ } else if (pos[0] <= SPDK_MEDIUM_ATOM_TYPE_MAX) { /* medium atom */
+ token_length = opal_response_parse_medium(token_iter, pos);
+ } else if (pos[0] <= SPDK_LONG_ATOM_TYPE_MAX) { /* long atom */
+ token_length = opal_response_parse_long(token_iter, pos);
+ } else { /* TOKEN */
+ token_length = opal_response_parse_token(token_iter, pos);
+ }
+
+ if (token_length <= 0) {
+ SPDK_ERRLOG("Parse response failure.\n");
+ return -EINVAL;
+ }
+
+ pos += token_length;
+ total -= token_length;
+ token_iter++;
+ num_entries++;
+
+ if (total < 0) {
+ SPDK_ERRLOG("Length not matching.\n");
+ return -EINVAL;
+ }
+ }
+
+ if (num_entries == 0) {
+ SPDK_ERRLOG("Couldn't parse response.\n");
+ return -EINVAL;
+ }
+ resp->num = num_entries;
+
+ return 0;
+}
+
+static inline bool
+opal_response_token_matches(const struct spdk_opal_resp_token *token,
+ uint8_t match)
+{
+ if (!token ||
+ token->type != OPAL_DTA_TOKENID_TOKEN ||
+ token->pos[0] != match) {
+ return false;
+ }
+ return true;
+}
+
+static const struct spdk_opal_resp_token *
+opal_response_get_token(const struct spdk_opal_resp_parsed *resp, int index)
+{
+ const struct spdk_opal_resp_token *token;
+
+ if (index >= resp->num) {
+ SPDK_ERRLOG("Token number doesn't exist: %d, resp: %d\n",
+ index, resp->num);
+ return NULL;
+ }
+
+ token = &resp->resp_tokens[index];
+ if (token->len == 0) {
+ SPDK_ERRLOG("Token length must be non-zero\n");
+ return NULL;
+ }
+
+ return token;
+}
+
+static uint64_t
+opal_response_get_u64(const struct spdk_opal_resp_parsed *resp, int index)
+{
+ if (!resp) {
+ SPDK_ERRLOG("Response is NULL\n");
+ return 0;
+ }
+
+ if (resp->resp_tokens[index].type != OPAL_DTA_TOKENID_UINT) {
+ SPDK_ERRLOG("Token is not unsigned int: %d\n",
+ resp->resp_tokens[index].type);
+ return 0;
+ }
+
+ if (!(resp->resp_tokens[index].width == OPAL_WIDTH_TINY ||
+ resp->resp_tokens[index].width == OPAL_WIDTH_SHORT)) {
+ SPDK_ERRLOG("Atom is not short or tiny: %d\n",
+ resp->resp_tokens[index].width);
+ return 0;
+ }
+
+ return resp->resp_tokens[index].stored.unsigned_num;
+}
+
+static uint16_t
+opal_response_get_u16(const struct spdk_opal_resp_parsed *resp, int index)
+{
+ uint64_t i = opal_response_get_u64(resp, index);
+ if (i > 0xffffull) {
+ SPDK_ERRLOG("parse reponse u16 failed. Overflow\n");
+ return 0;
+ }
+ return (uint16_t) i;
+}
+
+static uint8_t
+opal_response_get_u8(const struct spdk_opal_resp_parsed *resp, int index)
+{
+ uint64_t i = opal_response_get_u64(resp, index);
+ if (i > 0xffull) {
+ SPDK_ERRLOG("parse reponse u8 failed. Overflow\n");
+ return 0;
+ }
+ return (uint8_t) i;
+}
+
+static size_t
+opal_response_get_string(const struct spdk_opal_resp_parsed *resp, int n,
+ const char **store)
+{
+ uint8_t header_len;
+ struct spdk_opal_resp_token token;
+ *store = NULL;
+ if (!resp) {
+ SPDK_ERRLOG("Response is NULL\n");
+ return 0;
+ }
+
+ if (n >= resp->num) {
+ SPDK_ERRLOG("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
+ return 0;
+ }
+
+ token = resp->resp_tokens[n];
+ if (token.type != OPAL_DTA_TOKENID_BYTESTRING) {
+ SPDK_ERRLOG("Token is not a byte string!\n");
+ return 0;
+ }
+
+ switch (token.width) {
+ case OPAL_WIDTH_SHORT:
+ header_len = 1;
+ break;
+ case OPAL_WIDTH_MEDIUM:
+ header_len = 2;
+ break;
+ case OPAL_WIDTH_LONG:
+ header_len = 4;
+ break;
+ default:
+ SPDK_ERRLOG("Can't get string from this Token\n");
+ return 0;
+ }
+
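+ /* The returned string points into the response buffer and is not NUL-terminated; its length is the return value. */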
+ *store = token.pos + header_len;
+ return token.len - header_len;
+}
+
+static int
+opal_response_status(const struct spdk_opal_resp_parsed *resp)
+{
+ const struct spdk_opal_resp_token *tok;
+
+ /* if we get an EOS token, just return 0 */
+ tok = opal_response_get_token(resp, 0);
+ if (opal_response_token_matches(tok, SPDK_OPAL_ENDOFSESSION)) {
+ return 0;
+ }
+
+ if (resp->num < 5) {
+ return SPDK_DTAERROR_NO_METHOD_STATUS;
+ }
+
+ tok = opal_response_get_token(resp, resp->num - 5); /* the first token should be STARTLIST */
+ if (!opal_response_token_matches(tok, SPDK_OPAL_STARTLIST)) {
+ return SPDK_DTAERROR_NO_METHOD_STATUS;
+ }
+
+ tok = opal_response_get_token(resp, resp->num - 1); /* the last token should be ENDLIST */
+ if (!opal_response_token_matches(tok, SPDK_OPAL_ENDLIST)) {
+ return SPDK_DTAERROR_NO_METHOD_STATUS;
+ }
+
+ /* The second and third values in the status list are reserved, and are
+  * defined in the core spec to be 0x00 and 0x00 and SHOULD be ignored by the host.
+  */
+ return (int)opal_response_get_u64(resp,
+ resp->num - 4); /* We only need the first value in the status list. */
+}
+
+static int
+opal_parse_and_check_status(struct opal_session *sess)
+{
+ int error;
+
+ error = opal_response_parse(sess->resp, IO_BUFFER_LENGTH, &sess->parsed_resp);
+ if (error) {
+ SPDK_ERRLOG("Couldn't parse response.\n");
+ return error;
+ }
+ return opal_response_status(&sess->parsed_resp);
+}
+
+static inline void
+opal_clear_cmd(struct opal_session *sess)
+{
+ sess->cmd_pos = sizeof(struct spdk_opal_header);
+ memset(sess->cmd, 0, IO_BUFFER_LENGTH);
+}
+
+static inline void
+opal_set_comid(struct opal_session *sess, uint16_t comid)
+{
+ struct spdk_opal_header *hdr = (struct spdk_opal_header *)sess->cmd;
+
+ hdr->com_packet.comid[0] = comid >> 8;
+ hdr->com_packet.comid[1] = comid;
+ hdr->com_packet.extended_comid[0] = 0;
+ hdr->com_packet.extended_comid[1] = 0;
+}
+
+static inline int
+opal_init_key(struct spdk_opal_key *opal_key, const char *passwd)
+{
+ int len;
+
+ if (passwd == NULL || passwd[0] == '\0') {
+ SPDK_ERRLOG("Password is empty. Create key failed\n");
+ return -EINVAL;
+ }
+
+ len = strlen(passwd);
+
+ if (len >= OPAL_KEY_MAX) {
+ SPDK_ERRLOG("Password too long. Create key failed\n");
+ return -EINVAL;
+ }
+
+ opal_key->key_len = len;
+ memcpy(opal_key->key, passwd, opal_key->key_len);
+
+ return 0;
+}
+
+static void
+opal_build_locking_range(uint8_t *buffer, uint8_t locking_range)
+{
+ memcpy(buffer, spdk_opal_uid[UID_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH);
+
+ /* global */
+ if (locking_range == 0) {
+ return;
+ }
+
+ /* non-global */
+ buffer[5] = LOCKING_RANGE_NON_GLOBAL;
+ buffer[7] = locking_range;
+}
+
+static void
+opal_check_tper(struct spdk_opal_dev *dev, const void *data)
+{
+ const struct spdk_opal_d0_tper_feat *tper = data;
+
+ dev->feat_info.tper = *tper;
+}
+
+/*
+ * check single user mode
+ */
+static bool
+opal_check_sum(struct spdk_opal_dev *dev, const void *data)
+{
+ const struct spdk_opal_d0_single_user_mode_feat *sum = data;
+ uint32_t num_locking_objects = from_be32(&sum->num_locking_objects);
+
+ if (num_locking_objects == 0) {
+ SPDK_NOTICELOG("Need at least one locking object.\n");
+ return false;
+ }
+
+ dev->feat_info.single_user = *sum;
+
+ return true;
+}
+
+static void
+opal_check_lock(struct spdk_opal_dev *dev, const void *data)
+{
+ const struct spdk_opal_d0_locking_feat *lock = data;
+
+ dev->feat_info.locking = *lock;
+}
+
+static void
+opal_check_geometry(struct spdk_opal_dev *dev, const void *data)
+{
+ const struct spdk_opal_d0_geo_feat *geo = data;
+
+ dev->feat_info.geo = *geo;
+}
+
+static void
+opal_check_datastore(struct spdk_opal_dev *dev, const void *data)
+{
+ const struct spdk_opal_d0_datastore_feat *datastore = data;
+
+ dev->feat_info.datastore = *datastore;
+}
+
+static uint16_t
+opal_get_comid_v100(struct spdk_opal_dev *dev, const void *data)
+{
+ const struct spdk_opal_d0_v100_feat *v100 = data;
+ uint16_t base_comid = from_be16(&v100->base_comid);
+
+ dev->feat_info.v100 = *v100;
+
+ return base_comid;
+}
+
+static uint16_t
+opal_get_comid_v200(struct spdk_opal_dev *dev, const void *data)
+{
+ const struct spdk_opal_d0_v200_feat *v200 = data;
+ uint16_t base_comid = from_be16(&v200->base_comid);
+
+ dev->feat_info.v200 = *v200;
+
+ return base_comid;
+}
+
+static int
+opal_discovery0_end(struct spdk_opal_dev *dev, void *payload, uint32_t payload_size)
+{
+ bool supported = false, single_user = false;
+ const struct spdk_opal_d0_hdr *hdr = (struct spdk_opal_d0_hdr *)payload;
+ struct spdk_opal_d0_feat_hdr *feat_hdr;
+ const uint8_t *epos = payload, *cpos = payload;
+ uint16_t comid = 0;
+ uint32_t hlen = from_be32(&(hdr->length));
+
+ if (hlen > payload_size - sizeof(*hdr)) {
+ SPDK_ERRLOG("Discovery length overflows buffer (%zu+%u)/%u\n",
+ sizeof(*hdr), hlen, payload_size);
+ return -EFAULT;
+ }
+
+ epos += hlen; /* end of buffer */
+ cpos += sizeof(*hdr); /* current position on buffer */
+
+ while (cpos < epos) {
+ feat_hdr = (struct spdk_opal_d0_feat_hdr *)cpos;
+ uint16_t feat_code = from_be16(&feat_hdr->code);
+
+ switch (feat_code) {
+ case FEATURECODE_TPER:
+ opal_check_tper(dev, cpos);
+ break;
+ case FEATURECODE_SINGLEUSER:
+ single_user = opal_check_sum(dev, cpos);
+ break;
+ case FEATURECODE_GEOMETRY:
+ opal_check_geometry(dev, cpos);
+ break;
+ case FEATURECODE_LOCKING:
+ opal_check_lock(dev, cpos);
+ break;
+ case FEATURECODE_DATASTORE:
+ opal_check_datastore(dev, cpos);
+ break;
+ case FEATURECODE_OPALV100:
+ comid = opal_get_comid_v100(dev, cpos);
+ supported = true;
+ break;
+ case FEATURECODE_OPALV200:
+ comid = opal_get_comid_v200(dev, cpos);
+ supported = true;
+ break;
+ default:
+ SPDK_INFOLOG(SPDK_LOG_OPAL, "Unknow feature code: %d\n", feat_code);
+ }
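+ /* Advance to the next feature descriptor. */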
+ cpos += feat_hdr->length + sizeof(*feat_hdr);
+ }
+
+ if (supported == false) {
+ SPDK_ERRLOG("Opal Not Supported.\n");
+ return -ENOTSUP;
+ }
+
+ if (single_user == false) {
+ SPDK_INFOLOG(SPDK_LOG_OPAL, "Single User Mode Not Supported\n");
+ }
+
+ dev->comid = comid;
+ return 0;
+}
+
+static int
+opal_discovery0(struct spdk_opal_dev *dev, void *payload, uint32_t payload_size)
+{
+ int ret;
+
+ ret = spdk_nvme_ctrlr_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, LV0_DISCOVERY_COMID,
+ 0, payload, payload_size);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_discovery0_end(dev, payload, payload_size);
+}
+
+static int
+opal_end_session(struct spdk_opal_dev *dev, struct opal_session *sess, uint16_t comid)
+{
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, comid);
+ opal_add_token_u8(&err, sess, SPDK_OPAL_ENDOFSESSION);
+
+ if (err < 0) {
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, false);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ sess->hsn = 0;
+ sess->tsn = 0;
+
+ return opal_parse_and_check_status(sess);
+}
+
+void
+spdk_opal_dev_destruct(struct spdk_opal_dev *dev)
+{
+ free(dev);
+}
+
+static int
+opal_start_session_done(struct opal_session *sess)
+{
+ uint32_t hsn, tsn;
+ int error = 0;
+
+ error = opal_parse_and_check_status(sess);
+ if (error) {
+ return error;
+ }
+
+ hsn = opal_response_get_u64(&sess->parsed_resp, 4);
+ tsn = opal_response_get_u64(&sess->parsed_resp, 5);
+
+ if (hsn == 0 && tsn == 0) {
+ SPDK_ERRLOG("Couldn't authenticate session\n");
+ return -EPERM;
+ }
+
+ sess->hsn = hsn;
+ sess->tsn = tsn;
+
+ return 0;
+}
+
+static int
+opal_start_generic_session(struct spdk_opal_dev *dev,
+ struct opal_session *sess,
+ enum opal_uid_enum auth,
+ enum opal_uid_enum sp_type,
+ const char *key,
+ uint8_t key_len)
+{
+ uint32_t hsn;
+ int err = 0;
+ int ret;
+
+ if (key == NULL && auth != UID_ANYBODY) {
+ return OPAL_INVAL_PARAM;
+ }
+
+ opal_clear_cmd(sess);
+
+ opal_set_comid(sess, dev->comid);
+ hsn = GENERIC_HOST_SESSION_NUM;
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_SMUID],
+ OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[STARTSESSION_METHOD],
+ OPAL_UID_LENGTH);
+ opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST);
+ opal_add_token_u64(&err, sess, hsn);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[sp_type], OPAL_UID_LENGTH);
+ opal_add_token_u8(&err, sess, SPDK_OPAL_TRUE); /* Write */
+
+ switch (auth) {
+ case UID_ANYBODY:
+ opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST);
+ break;
+ case UID_ADMIN1:
+ case UID_SID:
+ opal_add_token_u8(&err, sess, SPDK_OPAL_STARTNAME);
+ opal_add_token_u8(&err, sess, 0); /* HostChallenge */
+ opal_add_token_bytestring(&err, sess, key, key_len);
+ opal_add_tokens(&err, sess, 3, /* number of tokens */
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ 3); /* HostSignAuth */
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[auth],
+ OPAL_UID_LENGTH);
+ opal_add_token_u8(&err, sess, SPDK_OPAL_ENDNAME);
+ opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST);
+ break;
+ default:
+ SPDK_ERRLOG("Cannot start Admin SP session with auth %d\n", auth);
+ return -EINVAL;
+ }
+
+ if (err) {
+ SPDK_ERRLOG("Error building start adminsp session command.\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_start_session_done(sess);
+}
+
+static int
+opal_get_msid_cpin_pin_done(struct opal_session *sess,
+ struct spdk_opal_key *opal_key)
+{
+ const char *msid_pin;
+ size_t str_len;
+ int error = 0;
+
+ error = opal_parse_and_check_status(sess);
+ if (error) {
+ return error;
+ }
+
+ str_len = opal_response_get_string(&sess->parsed_resp, 4, &msid_pin);
+ if (!msid_pin) {
+ SPDK_ERRLOG("Couldn't extract PIN from response\n");
+ return -EINVAL;
+ }
+
+ opal_key->key_len = str_len;
+ memcpy(opal_key->key, msid_pin, opal_key->key_len);
+
+ SPDK_DEBUGLOG(SPDK_LOG_OPAL, "MSID = %p\n", opal_key->key);
+ return 0;
+}
+
+static int
+opal_get_msid_cpin_pin(struct spdk_opal_dev *dev, struct opal_session *sess,
+ struct spdk_opal_key *opal_key)
+{
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_C_PIN_MSID],
+ OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_STARTCOLUMN,
+ SPDK_OPAL_PIN,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_ENDCOLUMN,
+ SPDK_OPAL_PIN,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error building Get MSID CPIN PIN command.\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_get_msid_cpin_pin_done(sess, opal_key);
+}
+
+static int
+opal_build_generic_pw_cmd(struct opal_session *sess, uint8_t *key, size_t key_len,
+ uint8_t *cpin_uid, struct spdk_opal_dev *dev)
+{
+ int err = 0;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, cpin_uid, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD],
+ OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 6,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_VALUES,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_PIN);
+ opal_add_token_bytestring(&err, sess, key, key_len);
+ opal_add_tokens(&err, sess, 4,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST);
+ if (err) {
+ return err;
+ }
+
+ return opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+}
+
+static int
+opal_get_locking_sp_lifecycle_done(struct opal_session *sess)
+{
+ uint8_t lifecycle;
+ int error = 0;
+
+ error = opal_parse_and_check_status(sess);
+ if (error) {
+ return error;
+ }
+
+ lifecycle = opal_response_get_u64(&sess->parsed_resp, 4);
+ if (lifecycle != OPAL_MANUFACTURED_INACTIVE) { /* status before activate */
+ SPDK_ERRLOG("Couldn't determine the status of the Lifecycle state\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+opal_get_locking_sp_lifecycle(struct spdk_opal_dev *dev, struct opal_session *sess)
+{
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP],
+ OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_STARTCOLUMN,
+ SPDK_OPAL_LIFECYCLE,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_ENDCOLUMN,
+ SPDK_OPAL_LIFECYCLE,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error Building GET Lifecycle Status command\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_get_locking_sp_lifecycle_done(sess);
+}
+
+static int
+opal_activate(struct spdk_opal_dev *dev, struct opal_session *sess)
+{
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP],
+ OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[ACTIVATE_METHOD],
+ OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error building Activate LockingSP command.\n");
+ return err;
+ }
+
+ /* TODO: Single User Mode for activation */
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+static int
+opal_start_auth_session(struct spdk_opal_dev *dev,
+ struct opal_session *sess,
+ enum spdk_opal_user user,
+ struct spdk_opal_key *opal_key)
+{
+ uint8_t uid_user[OPAL_UID_LENGTH];
+ int err = 0;
+ int ret;
+ uint32_t hsn = GENERIC_HOST_SESSION_NUM;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ if (user != OPAL_ADMIN1) {
+ memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH);
+ uid_user[7] = user;
+ } else {
+ memcpy(uid_user, spdk_opal_uid[UID_ADMIN1], OPAL_UID_LENGTH);
+ }
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_SMUID],
+ OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[STARTSESSION_METHOD],
+ OPAL_UID_LENGTH);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST);
+ opal_add_token_u64(&err, sess, hsn);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP],
+ OPAL_UID_LENGTH);
+ opal_add_tokens(&err, sess, 3, SPDK_OPAL_TRUE, SPDK_OPAL_STARTNAME,
+ 0); /* True for a Read-Write session */
+ opal_add_token_bytestring(&err, sess, opal_key->key, opal_key->key_len);
+ opal_add_tokens(&err, sess, 3, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME, 3); /* HostSignAuth */
+ opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH);
+ opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error building STARTSESSION command.\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_start_session_done(sess);
+}
+
+static int
+opal_lock_unlock_range(struct spdk_opal_dev *dev, struct opal_session *sess,
+ enum spdk_opal_locking_range locking_range,
+ enum spdk_opal_lock_state l_state)
+{
+ uint8_t uid_locking_range[OPAL_UID_LENGTH];
+ uint8_t read_locked, write_locked;
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_build_locking_range(uid_locking_range, locking_range);
+
+ switch (l_state) {
+ case OPAL_READONLY:
+ read_locked = 0;
+ write_locked = 1;
+ break;
+ case OPAL_READWRITE:
+ read_locked = 0;
+ write_locked = 0;
+ break;
+ case OPAL_RWLOCK:
+ read_locked = 1;
+ write_locked = 1;
+ break;
+ default:
+ SPDK_ERRLOG("Tried to set an invalid locking state.\n");
+ return -EINVAL;
+ }
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 15, SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_VALUES,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_READLOCKED,
+ read_locked,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_WRITELOCKED,
+ write_locked,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error building SET command.\n");
+ return err;
+ }
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+static int opal_generic_locking_range_enable_disable(struct spdk_opal_dev *dev,
+ struct opal_session *sess,
+ uint8_t *uid, bool read_lock_enabled, bool write_lock_enabled)
+{
+ int err = 0;
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 23, SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_VALUES,
+ SPDK_OPAL_STARTLIST,
+
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_READLOCKENABLED,
+ read_lock_enabled,
+ SPDK_OPAL_ENDNAME,
+
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_WRITELOCKENABLED,
+ write_lock_enabled,
+ SPDK_OPAL_ENDNAME,
+
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_READLOCKED,
+ 0,
+ SPDK_OPAL_ENDNAME,
+
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_WRITELOCKED,
+ 0,
+ SPDK_OPAL_ENDNAME,
+
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST);
+ if (err) {
+ SPDK_ERRLOG("Error building locking range enable/disable command.\n");
+ }
+ return err;
+}
+
+static int
+opal_setup_locking_range(struct spdk_opal_dev *dev, struct opal_session *sess,
+ enum spdk_opal_locking_range locking_range,
+ uint64_t range_start, uint64_t range_length,
+ bool read_lock_enabled, bool write_lock_enabled)
+{
+ uint8_t uid_locking_range[OPAL_UID_LENGTH];
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_build_locking_range(uid_locking_range, locking_range);
+
+ if (locking_range == 0) {
+ err = opal_generic_locking_range_enable_disable(dev, sess, uid_locking_range,
+ read_lock_enabled, write_lock_enabled);
+ } else {
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD],
+ OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 6,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_VALUES,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_RANGESTART);
+ opal_add_token_u64(&err, sess, range_start);
+ opal_add_tokens(&err, sess, 3,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_RANGELENGTH);
+ opal_add_token_u64(&err, sess, range_length);
+ opal_add_tokens(&err, sess, 3,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_READLOCKENABLED);
+ opal_add_token_u64(&err, sess, read_lock_enabled);
+ opal_add_tokens(&err, sess, 3,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_WRITELOCKENABLED);
+ opal_add_token_u64(&err, sess, write_lock_enabled);
+ opal_add_tokens(&err, sess, 4,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST);
+ }
+ if (err) {
+ SPDK_ERRLOG("Error building Setup Locking range command.\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+static int
+opal_get_max_ranges_done(struct opal_session *sess)
+{
+ int error = 0;
+
+ error = opal_parse_and_check_status(sess);
+ if (error) {
+ return error;
+ }
+
+ /* "MaxRanges" is token 4 of response */
+ return opal_response_get_u16(&sess->parsed_resp, 4);
+}
+
+static int
+opal_get_max_ranges(struct spdk_opal_dev *dev, struct opal_session *sess)
+{
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKING_INFO_TABLE],
+ OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_STARTCOLUMN,
+ SPDK_OPAL_MAXRANGES,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_ENDCOLUMN,
+ SPDK_OPAL_MAXRANGES,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error Building GET Lifecycle Status command\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_get_max_ranges_done(sess);
+}
+
+static int
+opal_get_locking_range_info_done(struct opal_session *sess,
+ struct spdk_opal_locking_range_info *info)
+{
+ int error = 0;
+
+ error = opal_parse_and_check_status(sess);
+ if (error) {
+ return error;
+ }
+
+ info->range_start = opal_response_get_u64(&sess->parsed_resp, 4);
+ info->range_length = opal_response_get_u64(&sess->parsed_resp, 8);
+ info->read_lock_enabled = opal_response_get_u8(&sess->parsed_resp, 12);
+ info->write_lock_enabled = opal_response_get_u8(&sess->parsed_resp, 16);
+ info->read_locked = opal_response_get_u8(&sess->parsed_resp, 20);
+ info->write_locked = opal_response_get_u8(&sess->parsed_resp, 24);
+
+ return 0;
+}
+
+static int
+opal_get_locking_range_info(struct spdk_opal_dev *dev,
+ struct opal_session *sess,
+ enum spdk_opal_locking_range locking_range_id)
+{
+ int err = 0;
+ int ret;
+ uint8_t uid_locking_range[OPAL_UID_LENGTH];
+ struct spdk_opal_locking_range_info *info;
+
+ opal_build_locking_range(uid_locking_range, locking_range_id);
+
+ assert(locking_range_id < SPDK_OPAL_MAX_LOCKING_RANGE);
+ info = &dev->locking_ranges[locking_range_id];
+ memset(info, 0, sizeof(*info));
+ info->locking_range_id = locking_range_id;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_STARTCOLUMN,
+ SPDK_OPAL_RANGESTART,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_ENDCOLUMN,
+ SPDK_OPAL_WRITELOCKED,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error Building get locking range info command\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_get_locking_range_info_done(sess, info);
+}
+
+static int
+opal_enable_user(struct spdk_opal_dev *dev, struct opal_session *sess,
+ enum spdk_opal_user user)
+{
+ int err = 0;
+ int ret;
+ uint8_t uid_user[OPAL_UID_LENGTH];
+
+ memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH);
+ uid_user[7] = user;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 11,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_VALUES,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_AUTH_ENABLE,
+ SPDK_OPAL_TRUE,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error Building enable user command\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+static int
+opal_add_user_to_locking_range(struct spdk_opal_dev *dev,
+ struct opal_session *sess,
+ enum spdk_opal_user user,
+ enum spdk_opal_locking_range locking_range,
+ enum spdk_opal_lock_state l_state)
+{
+ int err = 0;
+ int ret;
+ uint8_t uid_user[OPAL_UID_LENGTH];
+ uint8_t uid_locking_range[OPAL_UID_LENGTH];
+
+ memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH);
+ uid_user[7] = user;
+
+ switch (l_state) {
+ case OPAL_READONLY:
+ memcpy(uid_locking_range, spdk_opal_uid[UID_LOCKINGRANGE_ACE_RDLOCKED], OPAL_UID_LENGTH);
+ break;
+ case OPAL_READWRITE:
+ memcpy(uid_locking_range, spdk_opal_uid[UID_LOCKINGRANGE_ACE_WRLOCKED], OPAL_UID_LENGTH);
+ break;
+ default:
+ SPDK_ERRLOG("locking state should only be OPAL_READONLY or OPAL_READWRITE\n");
+ return -EINVAL;
+ }
+
+ uid_locking_range[7] = locking_range;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 8,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_VALUES,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_BOOLEAN_EXPR,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_AUTHORITY_OBJ_REF],
+ OPAL_UID_LENGTH / 2);
+ opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_AUTHORITY_OBJ_REF],
+ OPAL_UID_LENGTH / 2);
+ opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_BOOLEAN_ACE], OPAL_UID_LENGTH / 2);
+ opal_add_tokens(&err, sess, 7,
+ SPDK_OPAL_TRUE,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST);
+ if (err) {
+ SPDK_ERRLOG("Error building add user to locking range command\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+static int
+opal_new_user_passwd(struct spdk_opal_dev *dev, struct opal_session *sess,
+ enum spdk_opal_user user,
+ struct spdk_opal_key *opal_key)
+{
+ uint8_t uid_cpin[OPAL_UID_LENGTH];
+ int ret;
+
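+ /* Pick the C_PIN row to update; user rows reuse the C_PIN_USER1 UID with the user number in its last byte. */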
+ if (user == OPAL_ADMIN1) {
+ memcpy(uid_cpin, spdk_opal_uid[UID_C_PIN_ADMIN1], OPAL_UID_LENGTH);
+ } else {
+ memcpy(uid_cpin, spdk_opal_uid[UID_C_PIN_USER1], OPAL_UID_LENGTH);
+ uid_cpin[7] = user;
+ }
+
+ ret = opal_build_generic_pw_cmd(sess, opal_key->key, opal_key->key_len, uid_cpin, dev);
+ if (ret != 0) {
+ SPDK_ERRLOG("Error building set password command\n");
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+static int
+opal_set_sid_cpin_pin(struct spdk_opal_dev *dev, struct opal_session *sess, char *new_passwd)
+{
+ uint8_t cpin_uid[OPAL_UID_LENGTH];
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ ret = opal_init_key(&opal_key, new_passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ memcpy(cpin_uid, spdk_opal_uid[UID_C_PIN_SID], OPAL_UID_LENGTH);
+
+ if (opal_build_generic_pw_cmd(sess, opal_key.key, opal_key.key_len, cpin_uid, dev)) {
+ SPDK_ERRLOG("Error building Set SID cpin\n");
+ return -ERANGE;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+int
+spdk_opal_cmd_take_ownership(struct spdk_opal_dev *dev, char *new_passwd)
+{
+ int ret;
+ struct spdk_opal_key opal_key = {};
+ struct opal_session *sess;
+
+ assert(dev != NULL);
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
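+ /* Ownership flow: read the MSID PIN in an Anybody session, then authenticate as SID with it and set the new SID password. */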
+ ret = opal_start_generic_session(dev, sess, UID_ANYBODY, UID_ADMINSP, NULL, 0);
+ if (ret) {
+ SPDK_ERRLOG("start admin SP session error %d\n", ret);
+ goto end;
+ }
+
+ ret = opal_get_msid_cpin_pin(dev, sess, &opal_key);
+ if (ret) {
+ SPDK_ERRLOG("get msid error %d\n", ret);
+ opal_end_session(dev, sess, dev->comid);
+ goto end;
+ }
+
+ ret = opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ goto end;
+ }
+
+ /* reuse the session structure */
+ memset(sess, 0, sizeof(*sess));
+ sess->dev = dev;
+ ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP,
+ opal_key.key, opal_key.key_len);
+ if (ret) {
+ SPDK_ERRLOG("start admin SP session error %d\n", ret);
+ goto end;
+ }
+ memset(&opal_key, 0, sizeof(struct spdk_opal_key));
+
+ ret = opal_set_sid_cpin_pin(dev, sess, new_passwd);
+ if (ret) {
+ SPDK_ERRLOG("set cpin error %d\n", ret);
+ opal_end_session(dev, sess, dev->comid);
+ goto end;
+ }
+
+ ret = opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+end:
+ free(sess);
+ return ret;
+}
+
+struct spdk_opal_dev *
+spdk_opal_dev_construct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_opal_dev *dev;
+ void *payload;
+
+ dev = calloc(1, sizeof(*dev));
+ if (!dev) {
+ SPDK_ERRLOG("Memory allocation failed\n");
+ return NULL;
+ }
+
+ dev->ctrlr = ctrlr;
+
+ payload = calloc(1, IO_BUFFER_LENGTH);
+ if (!payload) {
+ free(dev);
+ return NULL;
+ }
+
+ if (opal_discovery0(dev, payload, IO_BUFFER_LENGTH)) {
+ SPDK_INFOLOG(SPDK_LOG_OPAL, "Opal is not supported on this device\n");
+ free(dev);
+ free(payload);
+ return NULL;
+ }
+
+ free(payload);
+ return dev;
+}
+
+static int
+opal_build_revert_tper_cmd(struct spdk_opal_dev *dev, struct opal_session *sess)
+{
+ int err = 0;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_ADMINSP],
+ OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[REVERT_METHOD],
+ OPAL_UID_LENGTH);
+ opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST);
+ opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST);
+ if (err) {
+ SPDK_ERRLOG("Error building REVERT TPER command.\n");
+ return -ERANGE;
+ }
+
+ return opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+}
+
+static int
+opal_gen_new_active_key(struct spdk_opal_dev *dev, struct opal_session *sess,
+ struct spdk_opal_key *active_key)
+{
+ uint8_t uid_data[OPAL_UID_LENGTH] = {0};
+ int err = 0;
+ int length;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ if (active_key->key_len == 0) {
+ SPDK_ERRLOG("Error finding previous data to generate new active key\n");
+ return -EINVAL;
+ }
+
+ length = spdk_min(active_key->key_len, OPAL_UID_LENGTH);
+ memcpy(uid_data, active_key->key, length);
+
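+ /*
+ * Invoke GENKEY on the UID of the current active key so that the device
+ * generates a new media encryption key for the locking range.
+ */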
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid_data, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[GENKEY_METHOD],
+ OPAL_UID_LENGTH);
+
+ opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error building new key generation command.\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+static int
+opal_get_active_key_done(struct opal_session *sess, struct spdk_opal_key *active_key)
+{
+ const char *key;
+ size_t str_len;
+ int error = 0;
+
+ error = opal_parse_and_check_status(sess);
+ if (error) {
+ return error;
+ }
+
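+ /* The active key is returned as a bytestring at token position 4 of the Get response. */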
+ str_len = opal_response_get_string(&sess->parsed_resp, 4, &key);
+ if (!key) {
+ SPDK_ERRLOG("Couldn't extract active key from response\n");
+ return -EINVAL;
+ }
+
+ active_key->key_len = str_len;
+ memcpy(active_key->key, key, active_key->key_len);
+
+ SPDK_DEBUGLOG(SPDK_LOG_OPAL, "active key = %p\n", active_key->key);
+ return 0;
+}
+
+static int
+opal_get_active_key(struct spdk_opal_dev *dev, struct opal_session *sess,
+ enum spdk_opal_locking_range locking_range,
+ struct spdk_opal_key *active_key)
+{
+ uint8_t uid_locking_range[OPAL_UID_LENGTH];
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_build_locking_range(uid_locking_range, locking_range);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD],
+ OPAL_UID_LENGTH);
+ opal_add_tokens(&err, sess, 12,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTLIST,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_STARTCOLUMN,
+ SPDK_OPAL_ACTIVEKEY,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_STARTNAME,
+ SPDK_OPAL_ENDCOLUMN,
+ SPDK_OPAL_ACTIVEKEY,
+ SPDK_OPAL_ENDNAME,
+ SPDK_OPAL_ENDLIST,
+ SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error building get active key command.\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_get_active_key_done(sess, active_key);
+}
+
+static int
+opal_erase_locking_range(struct spdk_opal_dev *dev, struct opal_session *sess,
+ enum spdk_opal_locking_range locking_range)
+{
+ uint8_t uid_locking_range[OPAL_UID_LENGTH];
+ int err = 0;
+ int ret;
+
+ opal_clear_cmd(sess);
+ opal_set_comid(sess, dev->comid);
+
+ opal_build_locking_range(uid_locking_range, locking_range);
+
+ opal_add_token_u8(&err, sess, SPDK_OPAL_CALL);
+ opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH);
+ opal_add_token_bytestring(&err, sess, spdk_opal_method[ERASE_METHOD],
+ OPAL_UID_LENGTH);
+ opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST);
+
+ if (err) {
+ SPDK_ERRLOG("Error building erase locking range.\n");
+ return err;
+ }
+
+ ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true);
+ if (ret) {
+ return ret;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ return ret;
+ }
+
+ return opal_parse_and_check_status(sess);
+}
+
+int
+spdk_opal_cmd_revert_tper(struct spdk_opal_dev *dev, const char *passwd)
+{
+ int ret;
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&opal_key, passwd);
+ if (ret) {
+ SPDK_ERRLOG("Init key failed\n");
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP,
+ opal_key.key, opal_key.key_len);
+ if (ret) {
+ SPDK_ERRLOG("Error on starting admin SP session with error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_build_revert_tper_cmd(dev, sess);
+ if (ret) {
+ opal_end_session(dev, sess, dev->comid);
+ SPDK_ERRLOG("Build revert tper command with error %d\n", ret);
+ goto end;
+ }
+
+ ret = opal_send_recv(dev, sess);
+ if (ret) {
+ opal_end_session(dev, sess, dev->comid);
+ SPDK_ERRLOG("Error on reverting TPer with error %d\n", ret);
+ goto end;
+ }
+
+ ret = opal_parse_and_check_status(sess);
+ if (ret) {
+ opal_end_session(dev, sess, dev->comid);
+ SPDK_ERRLOG("Error on reverting TPer with error %d\n", ret);
+ }
+ /* No opal_end_session() required here for successful case */
+
+end:
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_activate_locking_sp(struct spdk_opal_dev *dev, const char *passwd)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ ret = opal_init_key(&opal_key, passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP,
+ opal_key.key, opal_key.key_len);
+ if (ret) {
+ SPDK_ERRLOG("Error on starting admin SP session with error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_get_locking_sp_lifecycle(dev, sess);
+ if (ret) {
+ SPDK_ERRLOG("Error on getting SP lifecycle with error %d\n", ret);
+ goto end;
+ }
+
+ ret = opal_activate(dev, sess);
+ if (ret) {
+ SPDK_ERRLOG("Error on activation with error %d\n", ret);
+ }
+
+end:
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("Error on ending session with error %d\n", ret);
+ }
+
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_lock_unlock(struct spdk_opal_dev *dev, enum spdk_opal_user user,
+ enum spdk_opal_lock_state flag, enum spdk_opal_locking_range locking_range,
+ const char *passwd)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&opal_key, passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_auth_session(dev, sess, user, &opal_key);
+ if (ret) {
+ SPDK_ERRLOG("start authenticate session error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_lock_unlock_range(dev, sess, locking_range, flag);
+ if (ret) {
+ SPDK_ERRLOG("lock unlock range error %d\n", ret);
+ }
+
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_setup_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user,
+ enum spdk_opal_locking_range locking_range_id, uint64_t range_start,
+ uint64_t range_length, const char *passwd)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&opal_key, passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_auth_session(dev, sess, user, &opal_key);
+ if (ret) {
+ SPDK_ERRLOG("start authenticate session error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_setup_locking_range(dev, sess, locking_range_id, range_start, range_length, true,
+ true);
+ if (ret) {
+ SPDK_ERRLOG("setup locking range error %d\n", ret);
+ }
+
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_get_max_ranges(struct spdk_opal_dev *dev, const char *passwd)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ assert(dev != NULL);
+
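+ /* Return the cached value if the maximum range count was already queried. */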
+ if (dev->max_ranges) {
+ return dev->max_ranges;
+ }
+
+ ret = opal_init_key(&opal_key, passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_auth_session(dev, sess, OPAL_ADMIN1, &opal_key);
+ if (ret) {
+ SPDK_ERRLOG("start authenticate session error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_get_max_ranges(dev, sess);
+ if (ret > 0) {
+ dev->max_ranges = ret;
+ }
+
+ ret = opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+ free(sess);
+
+ return (ret == 0 ? dev->max_ranges : ret);
+}
+
+int
+spdk_opal_cmd_get_locking_range_info(struct spdk_opal_dev *dev, const char *passwd,
+ enum spdk_opal_user user_id,
+ enum spdk_opal_locking_range locking_range_id)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&opal_key, passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_auth_session(dev, sess, user_id, &opal_key);
+ if (ret) {
+ SPDK_ERRLOG("start authenticate session error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_get_locking_range_info(dev, sess, locking_range_id);
+ if (ret) {
+ SPDK_ERRLOG("get locking range info error %d\n", ret);
+ }
+
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_enable_user(struct spdk_opal_dev *dev, enum spdk_opal_user user_id,
+ const char *passwd)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&opal_key, passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_generic_session(dev, sess, UID_ADMIN1, UID_LOCKINGSP,
+ opal_key.key, opal_key.key_len);
+ if (ret) {
+ SPDK_ERRLOG("start locking SP session error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_enable_user(dev, sess, user_id);
+ if (ret) {
+ SPDK_ERRLOG("enable user error %d\n", ret);
+ }
+
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_add_user_to_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id,
+ enum spdk_opal_locking_range locking_range_id,
+ enum spdk_opal_lock_state lock_flag, const char *passwd)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&opal_key, passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_generic_session(dev, sess, UID_ADMIN1, UID_LOCKINGSP,
+ opal_key.key, opal_key.key_len);
+ if (ret) {
+ SPDK_ERRLOG("start locking SP session error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_add_user_to_locking_range(dev, sess, user_id, locking_range_id, lock_flag);
+ if (ret) {
+ SPDK_ERRLOG("add user to locking range error %d\n", ret);
+ }
+
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_set_new_passwd(struct spdk_opal_dev *dev, enum spdk_opal_user user_id,
+ const char *new_passwd, const char *old_passwd, bool new_user)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key old_key = {};
+ struct spdk_opal_key new_key = {};
+ int ret;
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&old_key, old_passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ ret = opal_init_key(&new_key, new_passwd);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
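+ /*
+ * For a newly enabled user, authenticate the session as Admin1 with the admin
+ * password; otherwise authenticate as the target user with its old password.
+ */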
+ ret = opal_start_auth_session(dev, sess, new_user ? OPAL_ADMIN1 : user_id,
+ &old_key);
+ if (ret) {
+ SPDK_ERRLOG("start authenticate session error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_new_user_passwd(dev, sess, user_id, &new_key);
+ if (ret) {
+ SPDK_ERRLOG("set new passwd error %d\n", ret);
+ }
+
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_erase_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id,
+ enum spdk_opal_locking_range locking_range_id, const char *password)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ int ret;
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&opal_key, password);
+ if (ret != 0) {
+ return ret;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ return -ENOMEM;
+ }
+
+ ret = opal_start_auth_session(dev, sess, user_id, &opal_key);
+ if (ret) {
+ SPDK_ERRLOG("start authenticate session error %d\n", ret);
+ free(sess);
+ return ret;
+ }
+
+ ret = opal_erase_locking_range(dev, sess, locking_range_id);
+ if (ret) {
+ SPDK_ERRLOG("get active key error %d\n", ret);
+ }
+
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+
+ free(sess);
+ return ret;
+}
+
+int
+spdk_opal_cmd_secure_erase_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id,
+ enum spdk_opal_locking_range locking_range_id, const char *password)
+{
+ struct opal_session *sess;
+ struct spdk_opal_key opal_key = {};
+ struct spdk_opal_key *active_key;
+ int ret;
+
+ assert(dev != NULL);
+
+ ret = opal_init_key(&opal_key, password);
+ if (ret != 0) {
+ return ret;
+ }
+
+ active_key = calloc(1, sizeof(*active_key));
+ if (!active_key) {
+ return -ENOMEM;
+ }
+
+ sess = opal_alloc_session(dev);
+ if (!sess) {
+ free(active_key);
+ return -ENOMEM;
+ }
+
+ ret = opal_start_auth_session(dev, sess, user_id, &opal_key);
+ if (ret) {
+ SPDK_ERRLOG("start authenticate session error %d\n", ret);
+ free(active_key);
+ free(sess);
+ return ret;
+ }
+
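+ /*
+ * Secure erase: look up the range's current active key, then regenerate it
+ * with GENKEY so that the old media encryption key is discarded.
+ */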
+ ret = opal_get_active_key(dev, sess, locking_range_id, active_key);
+ if (ret) {
+ SPDK_ERRLOG("get active key error %d\n", ret);
+ goto end;
+ }
+
+ ret = opal_gen_new_active_key(dev, sess, active_key);
+ if (ret) {
+ SPDK_ERRLOG("generate new active key error %d\n", ret);
+ goto end;
+ }
+ memset(active_key, 0, sizeof(struct spdk_opal_key));
+
+end:
+ ret += opal_end_session(dev, sess, dev->comid);
+ if (ret) {
+ SPDK_ERRLOG("end session error %d\n", ret);
+ }
+ free(active_key);
+ free(sess);
+ return ret;
+}
+
+struct spdk_opal_d0_features_info *
+spdk_opal_get_d0_features_info(struct spdk_opal_dev *dev)
+{
+ return &dev->feat_info;
+}
+
+bool
+spdk_opal_supported(struct spdk_opal_dev *dev)
+{
+ return false;
+}
+
+struct spdk_opal_locking_range_info *
+spdk_opal_get_locking_range_info(struct spdk_opal_dev *dev, enum spdk_opal_locking_range id)
+{
+ assert(id < SPDK_OPAL_MAX_LOCKING_RANGE);
+ return &dev->locking_ranges[id];
+}
+
+void
+spdk_opal_free_locking_range_info(struct spdk_opal_dev *dev, enum spdk_opal_locking_range id)
+{
+ struct spdk_opal_locking_range_info *info;
+
+ assert(id < SPDK_OPAL_MAX_LOCKING_RANGE);
+ info = &dev->locking_ranges[id];
+ memset(info, 0, sizeof(*info));
+}
+
+/* Log component for opal submodule */
+SPDK_LOG_REGISTER_COMPONENT("opal", SPDK_LOG_OPAL)
diff --git a/src/spdk/lib/nvme/nvme_opal_internal.h b/src/spdk/lib/nvme/nvme_opal_internal.h
new file mode 100644
index 000000000..11815d435
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_opal_internal.h
@@ -0,0 +1,272 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_OPAL_INTERNAL_H
+#define SPDK_OPAL_INTERNAL_H
+
+#include "spdk/opal_spec.h"
+#include "spdk/opal.h"
+#include "spdk/scsi_spec.h"
+
+#define IO_BUFFER_LENGTH 2048
+#define MAX_TOKS 64
+#define OPAL_KEY_MAX 256
+#define OPAL_UID_LENGTH 8
+
+#define GENERIC_HOST_SESSION_NUM 0x69
+
+#define OPAL_INVAL_PARAM 12
+
+#define SPDK_DTAERROR_NO_METHOD_STATUS 0x89
+
+enum opal_token_type {
+ OPAL_DTA_TOKENID_BYTESTRING = 0xE0,
+ OPAL_DTA_TOKENID_SINT = 0xE1,
+ OPAL_DTA_TOKENID_UINT = 0xE2,
+ OPAL_DTA_TOKENID_TOKEN = 0xE3, /* actual token is returned */
+ OPAL_DTA_TOKENID_INVALID = 0x0,
+};
+
+enum opal_atom_width {
+ OPAL_WIDTH_TINY, /* 1 byte in length */
+ OPAL_WIDTH_SHORT, /* a 1-byte header, containing up to 15 bytes of data */
+ OPAL_WIDTH_MEDIUM, /* a 2-byte header, containing up to 2047 bytes of data */
+ OPAL_WIDTH_LONG, /* a 4-byte header, containing up to 16,777,215 bytes of data */
+ OPAL_WIDTH_TOKEN
+};
+
+enum opal_uid_enum {
+ /* users */
+ UID_SMUID,
+ UID_THISSP,
+ UID_ADMINSP,
+ UID_LOCKINGSP,
+ UID_ANYBODY,
+ UID_SID,
+ UID_ADMIN1,
+ UID_USER1,
+ UID_USER2,
+
+ /* tables */
+ UID_LOCKINGRANGE_GLOBAL,
+ UID_LOCKINGRANGE_ACE_RDLOCKED,
+ UID_LOCKINGRANGE_ACE_WRLOCKED,
+ UID_MBRCONTROL,
+ UID_MBR,
+ UID_AUTHORITY_TABLE,
+ UID_C_PIN_TABLE,
+ UID_LOCKING_INFO_TABLE,
+ UID_PSID,
+
+ /* C_PIN_TABLE object ID's */
+ UID_C_PIN_MSID,
+ UID_C_PIN_SID,
+ UID_C_PIN_ADMIN1,
+ UID_C_PIN_USER1,
+
+ /* half UID's (only first 4 bytes used) */
+ UID_HALF_AUTHORITY_OBJ_REF,
+ UID_HALF_BOOLEAN_ACE,
+};
+
+/* enum for indexing the spdk_opal_method array */
+enum opal_method_enum {
+ PROPERTIES_METHOD,
+ STARTSESSION_METHOD,
+ REVERT_METHOD,
+ ACTIVATE_METHOD,
+ NEXT_METHOD,
+ GETACL_METHOD,
+ GENKEY_METHOD,
+ REVERTSP_METHOD,
+ GET_METHOD,
+ SET_METHOD,
+ AUTHENTICATE_METHOD,
+ RANDOM_METHOD,
+ ERASE_METHOD,
+};
+
+struct spdk_opal_key {
+ uint8_t key_len;
+ uint8_t key[OPAL_KEY_MAX];
+};
+
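+/*
+ * TCG Storage UIDs, indexed by enum opal_uid_enum.
+ */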
+const uint8_t spdk_opal_uid[][OPAL_UID_LENGTH] = {
+ /* users */
+ [UID_SMUID] = /* Session Manager UID */
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff },
+ [UID_THISSP] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+ [UID_ADMINSP] =
+ { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 },
+ [UID_LOCKINGSP] =
+ { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 },
+ [UID_ANYBODY] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 },
+ [UID_SID] = /* Security Identifier UID */
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 },
+ [UID_ADMIN1] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 },
+ [UID_USER1] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 },
+ [UID_USER2] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 },
+
+ /* tables */
+ [UID_LOCKINGRANGE_GLOBAL] =
+ { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 },
+ [UID_LOCKINGRANGE_ACE_RDLOCKED] =
+ { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 },
+ [UID_LOCKINGRANGE_ACE_WRLOCKED] =
+ { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 },
+ [UID_MBRCONTROL] =
+ { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 },
+ [UID_MBR] =
+ { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 },
+ [UID_AUTHORITY_TABLE] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00},
+ [UID_C_PIN_TABLE] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00},
+ [UID_LOCKING_INFO_TABLE] =
+ { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 },
+ [UID_PSID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 },
+
+ /* C_PIN_TABLE object ID's */
+ [UID_C_PIN_MSID] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02},
+ [UID_C_PIN_SID] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01},
+ [UID_C_PIN_ADMIN1] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01},
+ [UID_C_PIN_USER1] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x03, 0x00, 0x01},
+
+ /* half UID's (only first 4 bytes used) */
+ [UID_HALF_AUTHORITY_OBJ_REF] =
+ { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff },
+ [UID_HALF_BOOLEAN_ACE] =
+ { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff },
+};
+
+/*
+ * TCG Storage SSC Methods.
+ */
+const uint8_t spdk_opal_method[][OPAL_UID_LENGTH] = {
+ [PROPERTIES_METHOD] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 },
+ [STARTSESSION_METHOD] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 },
+ [REVERT_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 },
+ [ACTIVATE_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 },
+ [NEXT_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 },
+ [GETACL_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d },
+ [GENKEY_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 },
+ [REVERTSP_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 },
+ [GET_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 },
+ [SET_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 },
+ [AUTHENTICATE_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c },
+ [RANDOM_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 },
+ [ERASE_METHOD] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 },
+};
+
+/*
+ * Response token
+ */
+struct spdk_opal_resp_token {
+ const uint8_t *pos;
+ uint8_t _padding[7];
+ union {
+ uint64_t unsigned_num;
+ int64_t signed_num;
+ } stored;
+ size_t len; /* header + data */
+ enum opal_token_type type;
+ enum opal_atom_width width;
+};
+
+struct spdk_opal_resp_parsed {
+ int num;
+ struct spdk_opal_resp_token resp_tokens[MAX_TOKS];
+};
+
+/* header of a response */
+struct spdk_opal_header {
+ struct spdk_opal_compacket com_packet;
+ struct spdk_opal_packet packet;
+ struct spdk_opal_data_subpacket sub_packet;
+};
+
+struct opal_session;
+struct spdk_opal_dev;
+
+typedef void (*opal_sess_cb)(struct opal_session *sess, int status, void *ctx);
+
+struct opal_session {
+ uint32_t hsn;
+ uint32_t tsn;
+ size_t cmd_pos;
+ uint8_t cmd[IO_BUFFER_LENGTH];
+ uint8_t resp[IO_BUFFER_LENGTH];
+ struct spdk_opal_resp_parsed parsed_resp;
+
+ opal_sess_cb sess_cb;
+ void *cb_arg;
+ bool done;
+ int status;
+ struct spdk_opal_dev *dev;
+};
+
+struct spdk_opal_dev {
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ uint16_t comid;
+
+ struct spdk_opal_d0_features_info feat_info;
+
+ uint8_t max_ranges; /* max locking range number */
+ struct spdk_opal_locking_range_info locking_ranges[SPDK_OPAL_MAX_LOCKING_RANGE];
+};
+
+#endif
diff --git a/src/spdk/lib/nvme/nvme_pcie.c b/src/spdk/lib/nvme/nvme_pcie.c
new file mode 100644
index 000000000..132e34cdc
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_pcie.c
@@ -0,0 +1,2604 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2017, IBM Corporation. All rights reserved.
+ * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe over PCIe transport
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "nvme_internal.h"
+#include "nvme_uevent.h"
+
+/*
+ * Number of completion queue entries to process before ringing the
+ * completion queue doorbell.
+ */
+#define NVME_MIN_COMPLETIONS (1)
+#define NVME_MAX_COMPLETIONS (128)
+
+/*
+ * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
+ * segment.
+ */
+#define NVME_MAX_SGL_DESCRIPTORS (250)
+
+#define NVME_MAX_PRP_LIST_ENTRIES (503)
+
+struct nvme_pcie_enum_ctx {
+ struct spdk_nvme_probe_ctx *probe_ctx;
+ struct spdk_pci_addr pci_addr;
+ bool has_pci_addr;
+};
+
+/* PCIe transport extensions for spdk_nvme_ctrlr */
+struct nvme_pcie_ctrlr {
+ struct spdk_nvme_ctrlr ctrlr;
+
+ /** NVMe MMIO register space */
+ volatile struct spdk_nvme_registers *regs;
+
+ /** NVMe MMIO register size */
+ uint64_t regs_size;
+
+ struct {
+ /* BAR mapping address which contains controller memory buffer */
+ void *bar_va;
+
+ /* BAR physical address which contains controller memory buffer */
+ uint64_t bar_pa;
+
+ /* Controller memory buffer size in Bytes */
+ uint64_t size;
+
+ /* Current offset of controller memory buffer, relative to start of BAR virt addr */
+ uint64_t current_offset;
+
+ void *mem_register_addr;
+ size_t mem_register_size;
+ } cmb;
+
+ /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
+ uint32_t doorbell_stride_u32;
+
+ /* Opaque handle to associated PCI device. */
+ struct spdk_pci_device *devhandle;
+
+ /* Flag to indicate the MMIO register has been remapped */
+ bool is_remapped;
+};
+
+struct nvme_tracker {
+ TAILQ_ENTRY(nvme_tracker) tq_list;
+
+ struct nvme_request *req;
+ uint16_t cid;
+
+ uint16_t rsvd0;
+ uint32_t rsvd1;
+
+ spdk_nvme_cmd_cb cb_fn;
+ void *cb_arg;
+
+ uint64_t prp_sgl_bus_addr;
+
+ /* Don't move, metadata SGL is always contiguous with Data Block SGL */
+ struct spdk_nvme_sgl_descriptor meta_sgl;
+ union {
+ uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
+ struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
+ } u;
+};
+/*
+ * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
+ * and so that there is no padding required to meet alignment requirements.
+ */
+SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
+SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
+SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, meta_sgl) & 7) == 0, "SGL must be Qword aligned");
+
+struct nvme_pcie_poll_group {
+ struct spdk_nvme_transport_poll_group group;
+};
+
+/* PCIe transport extensions for spdk_nvme_qpair */
+struct nvme_pcie_qpair {
+ /* Submission queue tail doorbell */
+ volatile uint32_t *sq_tdbl;
+
+ /* Completion queue head doorbell */
+ volatile uint32_t *cq_hdbl;
+
+ /* Submission queue */
+ struct spdk_nvme_cmd *cmd;
+
+ /* Completion queue */
+ struct spdk_nvme_cpl *cpl;
+
+ TAILQ_HEAD(, nvme_tracker) free_tr;
+ TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;
+
+ /* Array of trackers indexed by command ID. */
+ struct nvme_tracker *tr;
+
+ uint16_t num_entries;
+
+ uint8_t retry_count;
+
+ uint16_t max_completions_cap;
+
+ uint16_t last_sq_tail;
+ uint16_t sq_tail;
+ uint16_t cq_head;
+ uint16_t sq_head;
+
+ struct {
+ uint8_t phase : 1;
+ uint8_t delay_cmd_submit : 1;
+ uint8_t has_shadow_doorbell : 1;
+ } flags;
+
+ /*
+ * Base qpair structure.
+ * This is located after the hot data in this structure so that the important parts of
+ * nvme_pcie_qpair are in the same cache line.
+ */
+ struct spdk_nvme_qpair qpair;
+
+ struct {
+ /* Submission queue shadow tail doorbell */
+ volatile uint32_t *sq_tdbl;
+
+ /* Completion queue shadow head doorbell */
+ volatile uint32_t *cq_hdbl;
+
+ /* Submission queue event index */
+ volatile uint32_t *sq_eventidx;
+
+ /* Completion queue event index */
+ volatile uint32_t *cq_eventidx;
+ } shadow_doorbell;
+
+ /*
+ * Fields below this point should not be touched on the normal I/O path.
+ */
+
+ bool sq_in_cmb;
+
+ uint64_t cmd_bus_addr;
+ uint64_t cpl_bus_addr;
+
+ struct spdk_nvme_cmd *sq_vaddr;
+ struct spdk_nvme_cpl *cq_vaddr;
+};
+
+static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx,
+ struct spdk_pci_addr *pci_addr);
+static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
+ const struct spdk_nvme_io_qpair_opts *opts);
+static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);
+
+__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
+static uint16_t g_signal_lock;
+static bool g_sigset = false;
+
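+/*
+ * SIGBUS handler for surprise hot-remove: if an MMIO access to the controller
+ * registers faults, remap the register region to anonymous memory filled with
+ * 0xFF so that the faulting access completes and subsequent reads return all 1's.
+ */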
+static void
+nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
+{
+ void *map_address;
+ uint16_t flag = 0;
+
+ if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE,
+ __ATOMIC_RELAXED)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "request g_signal_lock failed\n");
+ return;
+ }
+
+ assert(g_thread_mmio_ctrlr != NULL);
+
+ if (!g_thread_mmio_ctrlr->is_remapped) {
+ map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ if (map_address == MAP_FAILED) {
+ SPDK_ERRLOG("mmap failed\n");
+ __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
+ return;
+ }
+ memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
+ g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
+ g_thread_mmio_ctrlr->is_remapped = true;
+ }
+ __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE);
+}
+
+static void
+nvme_pcie_ctrlr_setup_signal(void)
+{
+ struct sigaction sa;
+
+ sa.sa_sigaction = nvme_sigbus_fault_sighandler;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ sigaction(SIGBUS, &sa, NULL);
+}
+
+static inline struct nvme_pcie_ctrlr *
+nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
+{
+ assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
+ return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
+}
+
+static int
+_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx)
+{
+ struct spdk_nvme_ctrlr *ctrlr, *tmp;
+ struct spdk_uevent event;
+ struct spdk_pci_addr pci_addr;
+
+ if (g_spdk_nvme_driver->hotplug_fd < 0) {
+ return 0;
+ }
+
+ while (nvme_get_uevent(g_spdk_nvme_driver->hotplug_fd, &event) > 0) {
+ if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO ||
+ event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) {
+ if (event.action == SPDK_NVME_UEVENT_ADD) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n",
+ event.traddr);
+ if (spdk_process_is_primary()) {
+ if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
+ nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr);
+ }
+ }
+ } else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
+ struct spdk_nvme_transport_id trid;
+
+ memset(&trid, 0, sizeof(trid));
+ spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
+ snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr);
+
+ ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid);
+ if (ctrlr == NULL) {
+ return 0;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n",
+ event.traddr);
+
+ nvme_ctrlr_fail(ctrlr, true);
+
+ /* get the user app to clean up and stop I/O */
+ if (ctrlr->remove_cb) {
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ }
+ }
+ }
+ }
+
+ /* Initiate removal of physically hot-removed PCI controllers. Even after
+ * they're hot-removed from the system, SPDK might still report them via RPC.
+ */
+ TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
+ bool do_remove = false;
+ struct nvme_pcie_ctrlr *pctrlr;
+
+ if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ continue;
+ }
+
+ pctrlr = nvme_pcie_ctrlr(ctrlr);
+ if (spdk_pci_device_is_removed(pctrlr->devhandle)) {
+ do_remove = true;
+ }
+
+ if (do_remove) {
+ nvme_ctrlr_fail(ctrlr, true);
+ if (ctrlr->remove_cb) {
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ }
+ }
+ }
+ return 0;
+}
+
+static inline struct nvme_pcie_qpair *
+nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
+{
+ assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
+ return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
+}
+
+static volatile void *
+nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ return (volatile void *)((uintptr_t)pctrlr->regs + offset);
+}
+
+static int
+nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
+ g_thread_mmio_ctrlr = pctrlr;
+ spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
+ g_thread_mmio_ctrlr = NULL;
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
+ g_thread_mmio_ctrlr = pctrlr;
+ spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
+ g_thread_mmio_ctrlr = NULL;
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
+ assert(value != NULL);
+ g_thread_mmio_ctrlr = pctrlr;
+ *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
+ g_thread_mmio_ctrlr = NULL;
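+ /* A value of all 1's indicates the MMIO read failed (e.g. the device was hot-removed). */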
+ if (~(*value) == 0) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
+ assert(value != NULL);
+ g_thread_mmio_ctrlr = pctrlr;
+ *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
+ g_thread_mmio_ctrlr = NULL;
+ if (~(*value) == 0) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
+{
+ return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
+ value);
+}
+
+static int
+nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
+{
+ return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
+ value);
+}
+
+static int
+nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
+{
+ return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
+ aqa->raw);
+}
+
+static int
+nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
+{
+ return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
+ &cmbloc->raw);
+}
+
+static int
+nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
+{
+ return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
+ &cmbsz->raw);
+}
+
+static uint32_t
+nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
+{
+ /*
+ * For commands requiring more than 2 PRP entries, one PRP will be
+ * embedded in the command (prp1), and the rest of the PRP entries
+ * will be in a list pointed to by the command (prp2). This means
+ * that the real max number of PRP entries we support is
+ * NVME_MAX_PRP_LIST_ENTRIES (503) + 1, which results in a max xfer
+ * size of NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size.
+ */
+ return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
+}
+
+static uint16_t
+nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return NVME_MAX_SGL_DESCRIPTORS;
+}
+
+static void
+nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
+{
+ int rc;
+ void *addr = NULL;
+ uint32_t bir;
+ union spdk_nvme_cmbsz_register cmbsz;
+ union spdk_nvme_cmbloc_register cmbloc;
+ uint64_t size, unit_size, offset, bar_size = 0, bar_phys_addr = 0;
+
+ if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
+ nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
+ SPDK_ERRLOG("get registers failed\n");
+ goto exit;
+ }
+
+ if (!cmbsz.bits.sz) {
+ goto exit;
+ }
+
+ bir = cmbloc.bits.bir;
+ /* Values 0, 2, 3, 4 and 5 are valid for BAR */
+ if (bir > 5 || bir == 1) {
+ goto exit;
+ }
+
+ /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
+ unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
+ /* controller memory buffer size in Bytes */
+ size = unit_size * cmbsz.bits.sz;
+ /* controller memory buffer offset from BAR in Bytes */
+ offset = unit_size * cmbloc.bits.ofst;
+
+ rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
+ &bar_phys_addr, &bar_size);
+ if ((rc != 0) || addr == NULL) {
+ goto exit;
+ }
+
+ if (offset > bar_size) {
+ goto exit;
+ }
+
+ if (size > bar_size - offset) {
+ goto exit;
+ }
+
+ pctrlr->cmb.bar_va = addr;
+ pctrlr->cmb.bar_pa = bar_phys_addr;
+ pctrlr->cmb.size = size;
+ pctrlr->cmb.current_offset = offset;
+
+ if (!cmbsz.bits.sqs) {
+ pctrlr->ctrlr.opts.use_cmb_sqs = false;
+ }
+
+ return;
+exit:
+ pctrlr->ctrlr.opts.use_cmb_sqs = false;
+ return;
+}
+
+static int
+nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
+{
+ int rc = 0;
+ union spdk_nvme_cmbloc_register cmbloc;
+ void *addr = pctrlr->cmb.bar_va;
+
+ if (addr) {
+ if (pctrlr->cmb.mem_register_addr) {
+ spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size);
+ }
+
+ if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
+ SPDK_ERRLOG("get_cmbloc() failed\n");
+ return -EIO;
+ }
+ rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
+ }
+ return rc;
+}
+
+static int
+nvme_pcie_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ if (pctrlr->cmb.bar_va == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
+ return -ENOTSUP;
+ }
+
+ if (ctrlr->opts.use_cmb_sqs) {
+ SPDK_ERRLOG("CMB is already in use for submission queues.\n");
+ return -ENOTSUP;
+ }
+
+ return 0;
+}
+
+static void *
+nvme_pcie_ctrlr_map_io_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ union spdk_nvme_cmbsz_register cmbsz;
+ union spdk_nvme_cmbloc_register cmbloc;
+ uint64_t mem_register_start, mem_register_end;
+ int rc;
+
+ if (pctrlr->cmb.mem_register_addr != NULL) {
+ *size = pctrlr->cmb.mem_register_size;
+ return pctrlr->cmb.mem_register_addr;
+ }
+
+ *size = 0;
+
+ if (pctrlr->cmb.bar_va == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
+ return NULL;
+ }
+
+ if (ctrlr->opts.use_cmb_sqs) {
+ SPDK_ERRLOG("CMB is already in use for submission queues.\n");
+ return NULL;
+ }
+
+ if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
+ nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
+ SPDK_ERRLOG("get registers failed\n");
+ return NULL;
+ }
+
+ /* If the CMB supports only submission queues (neither WDS nor RDS), it cannot be mapped for I/O data */
+ if (!(cmbsz.bits.wds || cmbsz.bits.rds)) {
+ return NULL;
+ }
+
+ /* If CMB is less than 4MiB in size then abort CMB mapping */
+ if (pctrlr->cmb.size < (1ULL << 22)) {
+ return NULL;
+ }
+
+ mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset +
+ VALUE_2MB - 1);
+ mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset +
+ pctrlr->cmb.size);
+
+ rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start);
+ if (rc) {
+ SPDK_ERRLOG("spdk_mem_register() failed\n");
+ return NULL;
+ }
+
+ pctrlr->cmb.mem_register_addr = (void *)mem_register_start;
+ pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start;
+
+ *size = pctrlr->cmb.mem_register_size;
+ return pctrlr->cmb.mem_register_addr;
+}
+
+static int
+nvme_pcie_ctrlr_unmap_io_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ int rc;
+
+ if (pctrlr->cmb.mem_register_addr == NULL) {
+ return 0;
+ }
+
+ rc = spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size);
+
+ if (rc == 0) {
+ pctrlr->cmb.mem_register_addr = NULL;
+ pctrlr->cmb.mem_register_size = 0;
+ }
+
+ return rc;
+}
+
+static int
+nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
+{
+ int rc;
+ void *addr = NULL;
+ uint64_t phys_addr = 0, size = 0;
+
+ rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
+ &phys_addr, &size);
+
+ if ((addr == NULL) || (rc != 0)) {
+ SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
+ rc, addr);
+ return -1;
+ }
+
+ pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
+ pctrlr->regs_size = size;
+ nvme_pcie_ctrlr_map_cmb(pctrlr);
+
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
+{
+ int rc = 0;
+ void *addr = (void *)pctrlr->regs;
+
+ if (pctrlr->ctrlr.is_removed) {
+ return rc;
+ }
+
+ rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
+ if (rc != 0) {
+ SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
+ return -1;
+ }
+
+ if (addr) {
+ /* NOTE: addr may have been remapped here. We're relying on DPDK to call
+ * munmap internally.
+ */
+ rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
+ }
+ return rc;
+}
+
+static int
+nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries)
+{
+ struct nvme_pcie_qpair *pqpair;
+ int rc;
+
+ pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (pqpair == NULL) {
+ return -ENOMEM;
+ }
+
+ pqpair->num_entries = num_entries;
+ pqpair->flags.delay_cmd_submit = 0;
+
+ ctrlr->adminq = &pqpair->qpair;
+
+ rc = nvme_qpair_init(ctrlr->adminq,
+ 0, /* qpair ID */
+ ctrlr,
+ SPDK_NVME_QPRIO_URGENT,
+ num_entries);
+ if (rc != 0) {
+ return rc;
+ }
+
+ return nvme_pcie_qpair_construct(ctrlr->adminq, NULL);
+}
+
+/* This function must only be called while holding g_spdk_nvme_driver->lock */
+static int
+pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
+{
+ struct spdk_nvme_transport_id trid = {};
+ struct nvme_pcie_enum_ctx *enum_ctx = ctx;
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct spdk_pci_addr pci_addr;
+
+ pci_addr = spdk_pci_device_get_addr(pci_dev);
+
+ spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE);
+ spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
+
+ ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid);
+ if (!spdk_process_is_primary()) {
+ if (!ctrlr) {
+ SPDK_ERRLOG("Controller must be constructed in the primary process first.\n");
+ return -1;
+ }
+
+ return nvme_ctrlr_add_process(ctrlr, pci_dev);
+ }
+
+ /* check whether user passes the pci_addr */
+ if (enum_ctx->has_pci_addr &&
+ (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
+ return 1;
+ }
+
+ return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev);
+}
+
+static int
+nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
+ bool direct_connect)
+{
+ struct nvme_pcie_enum_ctx enum_ctx = {};
+
+ enum_ctx.probe_ctx = probe_ctx;
+
+ if (strlen(probe_ctx->trid.traddr) != 0) {
+ if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) {
+ return -1;
+ }
+ enum_ctx.has_pci_addr = true;
+ }
+
+ /* Only the primary process can monitor hotplug. */
+ if (spdk_process_is_primary()) {
+ _nvme_pcie_hotplug_monitor(probe_ctx);
+ }
+
+ if (enum_ctx.has_pci_addr == false) {
+ return spdk_pci_enumerate(spdk_pci_nvme_get_driver(),
+ pcie_nvme_enum_cb, &enum_ctx);
+ } else {
+ return spdk_pci_device_attach(spdk_pci_nvme_get_driver(),
+ pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
+ }
+}
+
+static int
+nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr)
+{
+ struct nvme_pcie_enum_ctx enum_ctx;
+
+ enum_ctx.probe_ctx = probe_ctx;
+ enum_ctx.has_pci_addr = true;
+ enum_ctx.pci_addr = *pci_addr;
+
+ return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx);
+}
+
+static struct spdk_nvme_ctrlr *
+nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ void *devhandle)
+{
+ struct spdk_pci_device *pci_dev = devhandle;
+ struct nvme_pcie_ctrlr *pctrlr;
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_vs_register vs;
+ uint16_t cmd_reg;
+ int rc;
+ struct spdk_pci_id pci_id;
+
+ rc = spdk_pci_device_claim(pci_dev);
+ if (rc < 0) {
+ SPDK_ERRLOG("could not claim device %s (%s)\n",
+ trid->traddr, spdk_strerror(-rc));
+ return NULL;
+ }
+
+ pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (pctrlr == NULL) {
+ spdk_pci_device_unclaim(pci_dev);
+ SPDK_ERRLOG("could not allocate ctrlr\n");
+ return NULL;
+ }
+
+ pctrlr->is_remapped = false;
+ pctrlr->ctrlr.is_removed = false;
+ pctrlr->devhandle = devhandle;
+ pctrlr->ctrlr.opts = *opts;
+ pctrlr->ctrlr.trid = *trid;
+
+ rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
+ if (rc != 0) {
+ spdk_pci_device_unclaim(pci_dev);
+ spdk_free(pctrlr);
+ return NULL;
+ }
+
+ rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
+ if (rc != 0) {
+ spdk_pci_device_unclaim(pci_dev);
+ spdk_free(pctrlr);
+ return NULL;
+ }
+
+ /* Enable PCI busmaster and disable INTx */
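+ /* The PCI command register is at config offset 4; 0x404 sets Bus Master Enable (bit 2) and Interrupt Disable (bit 10). */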
+ spdk_pci_device_cfg_read16(pci_dev, &cmd_reg, 4);
+ cmd_reg |= 0x404;
+ spdk_pci_device_cfg_write16(pci_dev, cmd_reg, 4);
+
+ if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
+ SPDK_ERRLOG("get_cap() failed\n");
+ spdk_pci_device_unclaim(pci_dev);
+ spdk_free(pctrlr);
+ return NULL;
+ }
+
+ if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
+ SPDK_ERRLOG("get_vs() failed\n");
+ spdk_pci_device_unclaim(pci_dev);
+ spdk_free(pctrlr);
+ return NULL;
+ }
+
+ nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);
+
+ /* The doorbell stride is 2 ^ (2 + DSTRD) bytes. Doorbells are indexed
+ * here in uint32_t (4-byte) units, so the stride in those units is 2 ^ DSTRD. */
+ pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
+
+ pci_id = spdk_pci_device_get_id(pci_dev);
+ pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);
+
+ rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr, pctrlr->ctrlr.opts.admin_queue_size);
+ if (rc != 0) {
+ nvme_ctrlr_destruct(&pctrlr->ctrlr);
+ return NULL;
+ }
+
+ /* Construct the primary process properties */
+ rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
+ if (rc != 0) {
+ nvme_ctrlr_destruct(&pctrlr->ctrlr);
+ return NULL;
+ }
+
+ if (g_sigset != true) {
+ nvme_pcie_ctrlr_setup_signal();
+ g_sigset = true;
+ }
+
+ return &pctrlr->ctrlr;
+}
+
+static int
+nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
+ union spdk_nvme_aqa_register aqa;
+
+ if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
+ SPDK_ERRLOG("set_asq() failed\n");
+ return -EIO;
+ }
+
+ if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
+ SPDK_ERRLOG("set_acq() failed\n");
+ return -EIO;
+ }
+
+ aqa.raw = 0;
+ /* acqs and asqs are 0-based. */
+ aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
+ aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
+
+ if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
+ SPDK_ERRLOG("set_aqa() failed\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);
+
+ if (ctrlr->adminq) {
+ nvme_pcie_qpair_destroy(ctrlr->adminq);
+ }
+
+ nvme_ctrlr_destruct_finish(ctrlr);
+
+ nvme_ctrlr_free_processes(ctrlr);
+
+ nvme_pcie_ctrlr_free_bars(pctrlr);
+
+ if (devhandle) {
+ spdk_pci_device_unclaim(devhandle);
+ spdk_pci_device_detach(devhandle);
+ }
+
+ spdk_free(pctrlr);
+
+ return 0;
+}
+
+static void
+nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
+{
+ tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
+ tr->cid = cid;
+ tr->req = NULL;
+}
+
+static int
+nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ uint32_t i;
+
+ /* all head/tail vals are set to 0 */
+ pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0;
+
+ /*
+ * First time through the completion queue, HW will set phase
+ * bit on completions to 1. So set this to 1 here, indicating
+ * we're looking for a 1 to know which entries have completed.
+ * we'll toggle the bit each time when the completion queue
+ * rolls over.
+ */
+ pqpair->flags.phase = 1;
+ for (i = 0; i < pqpair->num_entries; i++) {
+ pqpair->cpl[i].status.p = 0;
+ }
+
+ return 0;
+}
+
+static void *
+nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment,
+ uint64_t *phys_addr)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ uintptr_t addr;
+
+ if (pctrlr->cmb.mem_register_addr != NULL) {
+ /* BAR is mapped for data */
+ return NULL;
+ }
+
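+ /* Round the current CMB offset up to the requested alignment (a power of 2). */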
+ addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset;
+ addr = (addr + (alignment - 1)) & ~(alignment - 1);
+
+ /* CMB may only consume part of the BAR, calculate accordingly */
+ if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) {
+ SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
+ return NULL;
+ }
+ *phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va;
+
+ pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va;
+
+ return (void *)addr;
+}
+
+static int
+nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_tracker *tr;
+ uint16_t i;
+ volatile uint32_t *doorbell_base;
+ uint16_t num_trackers;
+ size_t page_align = sysconf(_SC_PAGESIZE);
+ size_t queue_align, queue_len;
+ uint32_t flags = SPDK_MALLOC_DMA;
+ uint64_t sq_paddr = 0;
+ uint64_t cq_paddr = 0;
+
+ if (opts) {
+ pqpair->sq_vaddr = opts->sq.vaddr;
+ pqpair->cq_vaddr = opts->cq.vaddr;
+ sq_paddr = opts->sq.paddr;
+ cq_paddr = opts->cq.paddr;
+ }
+
+ pqpair->retry_count = ctrlr->opts.transport_retry_count;
+
+ /*
+ * Limit the maximum number of completions to return per call to prevent wraparound,
+ * and calculate how many trackers can be submitted at once without overflowing the
+ * completion queue.
+ */
+ pqpair->max_completions_cap = pqpair->num_entries / 4;
+ pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
+ pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
+ num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
+
+ SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
+ pqpair->max_completions_cap, num_trackers);
+
+ assert(num_trackers != 0);
+
+ pqpair->sq_in_cmb = false;
+
+ if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
+ flags |= SPDK_MALLOC_SHARE;
+ }
+
+ /* cmd and cpl rings must be aligned on page size boundaries. */
+ if (ctrlr->opts.use_cmb_sqs) {
+ pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
+ page_align, &pqpair->cmd_bus_addr);
+ if (pqpair->cmd != NULL) {
+ pqpair->sq_in_cmb = true;
+ }
+ }
+
+ if (pqpair->sq_in_cmb == false) {
+ if (pqpair->sq_vaddr) {
+ pqpair->cmd = pqpair->sq_vaddr;
+ } else {
+ /* To ensure physical address contiguity we make each ring occupy
+ * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
+ */
+ queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd);
+ queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
+ pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags);
+ if (pqpair->cmd == NULL) {
+ SPDK_ERRLOG("alloc qpair_cmd failed\n");
+ return -ENOMEM;
+ }
+ }
+ if (sq_paddr) {
+ assert(pqpair->sq_vaddr != NULL);
+ pqpair->cmd_bus_addr = sq_paddr;
+ } else {
+ pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL);
+ if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) {
+ SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n");
+ return -EFAULT;
+ }
+ }
+ }
+
+ if (pqpair->cq_vaddr) {
+ pqpair->cpl = pqpair->cq_vaddr;
+ } else {
+ queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl);
+ queue_align = spdk_max(spdk_align32pow2(queue_len), page_align);
+ pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags);
+ if (pqpair->cpl == NULL) {
+ SPDK_ERRLOG("alloc qpair_cpl failed\n");
+ return -ENOMEM;
+ }
+ }
+ if (cq_paddr) {
+ assert(pqpair->cq_vaddr != NULL);
+ pqpair->cpl_bus_addr = cq_paddr;
+ } else {
+ pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL);
+ if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) {
+ SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n");
+ return -EFAULT;
+ }
+ }
+
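+ /*
+ * Doorbells come in SQ tail/CQ head pairs: queue N uses entries 2N and 2N + 1,
+ * scaled by the controller's doorbell stride.
+ */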
+ doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
+ pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
+ pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
+
+ /*
+ * Reserve space for all of the trackers in a single allocation.
+ * struct nvme_tracker must be padded so that its size is already a power of 2.
+ * This ensures the PRP list embedded in the nvme_tracker object will not span a
+ * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
+ */
+ pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (pqpair->tr == NULL) {
+ SPDK_ERRLOG("nvme_tr failed\n");
+ return -ENOMEM;
+ }
+
+ TAILQ_INIT(&pqpair->free_tr);
+ TAILQ_INIT(&pqpair->outstanding_tr);
+
+ for (i = 0; i < num_trackers; i++) {
+ tr = &pqpair->tr[i];
+ nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL));
+ TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
+ }
+
+ nvme_pcie_qpair_reset(qpair);
+
+ return 0;
+}
+
+/* Used when dst points to MMIO (i.e. CMB) in a virtual machine - in these cases we must
+ * not use wide instructions because QEMU will not emulate such instructions to MMIO space.
+ * So this function ensures we only copy 8 bytes at a time.
+ */
+static inline void
+nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
+{
+ uint64_t *dst64 = (uint64_t *)dst;
+ const uint64_t *src64 = (const uint64_t *)src;
+ uint32_t i;
+
+ for (i = 0; i < sizeof(*dst) / 8; i++) {
+ dst64[i] = src64[i];
+ }
+}
+
+static inline void
+nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
+{
+ /* dst and src are known to be non-overlapping and 64-byte aligned. */
+#if defined(__SSE2__)
+ __m128i *d128 = (__m128i *)dst;
+ const __m128i *s128 = (const __m128i *)src;
+
+ _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0]));
+ _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1]));
+ _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2]));
+ _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3]));
+#else
+ *dst = *src;
+#endif
+}
+
+/**
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req, struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct nvme_request *active_req = req;
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ /*
+ * The admin request is from another process. Move to the per
+ * process list for that process to handle it later.
+ */
+ assert(nvme_qpair_is_admin_queue(qpair));
+ assert(active_req->pid != getpid());
+
+ active_proc = nvme_ctrlr_get_process(ctrlr, active_req->pid);
+ if (active_proc) {
+ /* Save the original completion information */
+ memcpy(&active_req->cpl, cpl, sizeof(*cpl));
+ STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
+ } else {
+ SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
+ active_req->pid);
+
+ nvme_free_request(active_req);
+ }
+}
+
+/**
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct nvme_request *req, *tmp_req;
+ pid_t pid = getpid();
+ struct spdk_nvme_ctrlr_process *proc;
+
+ /*
+ * Check whether there is any pending admin request from
+ * other active processes.
+ */
+ assert(nvme_qpair_is_admin_queue(qpair));
+
+ proc = nvme_ctrlr_get_current_process(ctrlr);
+ if (!proc) {
+ SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
+ assert(proc);
+ return;
+ }
+
+ STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
+ STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
+
+ assert(req->pid == pid);
+
+ nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl);
+ nvme_free_request(req);
+ }
+}
+
+static inline int
+nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+ return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
+}
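+
+/*
+ * Illustrative sketch of the check above: the unsigned 16-bit subtractions
+ * ask whether event_idx lies within [old, new_idx] modulo 2^16, i.e. whether
+ * this shadow doorbell update moved past the index at which the controller
+ * asked to be notified. A few hypothetical values (example code only, kept
+ * out of the build):
+ */
+#if 0 /* example only */
+static void
+example_need_event(void)
+{
+ /* event_idx 12 is inside [10, 15]: an MMIO doorbell write is needed. */
+ assert(nvme_pcie_qpair_need_event(12, 15, 10) == 1);
+ /* event_idx 20 is outside [10, 15]: the MMIO write can be skipped. */
+ assert(nvme_pcie_qpair_need_event(20, 15, 10) == 0);
+ /* Wrap-around: [65534, 2] modulo 2^16 still contains event_idx 0. */
+ assert(nvme_pcie_qpair_need_event(0, 2, 65534) == 1);
+}
+#endif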
+
+static bool
+nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
+ volatile uint32_t *shadow_db,
+ volatile uint32_t *eventidx)
+{
+ uint16_t old;
+
+ if (!shadow_db) {
+ return true;
+ }
+
+ old = *shadow_db;
+ *shadow_db = value;
+
+ /*
+ * Ensure that the doorbell is updated before reading the EventIdx from
+ * memory
+ */
+ spdk_mb();
+
+ if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
+ return false;
+ }
+
+ return true;
+}
+
+static inline void
+nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
+ bool need_mmio = true;
+
+ if (qpair->first_fused_submitted) {
+ /* This is first cmd of two fused commands - don't ring doorbell */
+ qpair->first_fused_submitted = 0;
+ return;
+ }
+
+ if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
+ need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
+ pqpair->sq_tail,
+ pqpair->shadow_doorbell.sq_tdbl,
+ pqpair->shadow_doorbell.sq_eventidx);
+ }
+
+ if (spdk_likely(need_mmio)) {
+ spdk_wmb();
+ g_thread_mmio_ctrlr = pctrlr;
+ spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
+ g_thread_mmio_ctrlr = NULL;
+ }
+}
+
+static inline void
+nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
+ bool need_mmio = true;
+
+ if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) {
+ need_mmio = nvme_pcie_qpair_update_mmio_required(qpair,
+ pqpair->cq_head,
+ pqpair->shadow_doorbell.cq_hdbl,
+ pqpair->shadow_doorbell.cq_eventidx);
+ }
+
+ if (spdk_likely(need_mmio)) {
+ g_thread_mmio_ctrlr = pctrlr;
+ spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
+ g_thread_mmio_ctrlr = NULL;
+ }
+}
+
+static void
+nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
+{
+ struct nvme_request *req;
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+
+ req = tr->req;
+ assert(req != NULL);
+
+ if (req->cmd.fuse == SPDK_NVME_IO_FLAGS_FUSE_FIRST) {
+ /* This is first cmd of two fused commands - don't ring doorbell */
+ qpair->first_fused_submitted = 1;
+ }
+
+ /* Don't use wide instructions to copy the NVMe command; the QEMU virtual
+ * NVMe controller limits the maximum MMIO access width to 8 bytes at a time.
+ */
+ if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) {
+ nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
+ } else {
+ /* Copy the command from the tracker to the submission queue. */
+ nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
+ }
+
+ if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) {
+ pqpair->sq_tail = 0;
+ }
+
+ if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) {
+ SPDK_ERRLOG("sq_tail is passing sq_head!\n");
+ }
+
+ if (!pqpair->flags.delay_cmd_submit) {
+ nvme_pcie_qpair_ring_sq_doorbell(qpair);
+ }
+}
+
+static void
+nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
+ struct spdk_nvme_cpl *cpl, bool print_on_error)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_request *req;
+ bool retry, error;
+ bool req_from_current_proc = true;
+
+ req = tr->req;
+
+ assert(req != NULL);
+
+ error = spdk_nvme_cpl_is_error(cpl);
+ retry = error && nvme_completion_is_retry(cpl) &&
+ req->retries < pqpair->retry_count;
+
+ if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) {
+ spdk_nvme_qpair_print_command(qpair, &req->cmd);
+ spdk_nvme_qpair_print_completion(qpair, cpl);
+ }
+
+ assert(cpl->cid == req->cmd.cid);
+
+ if (retry) {
+ req->retries++;
+ nvme_pcie_qpair_submit_tracker(qpair, tr);
+ } else {
+ TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
+
+ /* Only check admin requests from different processes. */
+ if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
+ req_from_current_proc = false;
+ nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
+ } else {
+ nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl);
+ }
+
+ if (req_from_current_proc == true) {
+ nvme_qpair_free_request(qpair, req);
+ }
+
+ tr->req = NULL;
+
+ TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
+ }
+}
+
+static void
+nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
+ struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
+ bool print_on_error)
+{
+ struct spdk_nvme_cpl cpl;
+
+ memset(&cpl, 0, sizeof(cpl));
+ cpl.sqid = qpair->id;
+ cpl.cid = tr->cid;
+ cpl.status.sct = sct;
+ cpl.status.sc = sc;
+ cpl.status.dnr = dnr;
+ nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
+}
+
+static void
+nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_tracker *tr, *temp, *last;
+
+ last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head);
+
+ /* Abort previously submitted (outstanding) trs */
+ TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
+ if (!qpair->ctrlr->opts.disable_error_logging) {
+ SPDK_ERRLOG("aborting outstanding command\n");
+ }
+ nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
+
+ if (tr == last) {
+ break;
+ }
+ }
+}
+
+static int
+nvme_pcie_qpair_iterate_requests(struct spdk_nvme_qpair *qpair,
+ int (*iter_fn)(struct nvme_request *req, void *arg),
+ void *arg)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_tracker *tr, *tmp;
+ int rc;
+
+ assert(iter_fn != NULL);
+
+ TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
+ assert(tr->req != NULL);
+
+ rc = iter_fn(tr->req, arg);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static void
+nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_tracker *tr;
+
+ tr = TAILQ_FIRST(&pqpair->outstanding_tr);
+ while (tr != NULL) {
+ assert(tr->req != NULL);
+ if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
+ nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
+ SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
+ false);
+ tr = TAILQ_FIRST(&pqpair->outstanding_tr);
+ } else {
+ tr = TAILQ_NEXT(tr, tq_list);
+ }
+ }
+}
+
+static void
+nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
+{
+ nvme_pcie_admin_qpair_abort_aers(qpair);
+}
+
+static int
+nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvme_pcie_admin_qpair_destroy(qpair);
+ }
+ /*
+ * We check sq_vaddr and cq_vaddr to see if the user specified the memory
+ * buffers when creating the I/O queue.
+ * If the user specified them, we cannot free that memory.
+ * Nor do we free it if it's in the CMB.
+ */
+ if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) {
+ spdk_free(pqpair->cmd);
+ }
+ if (!pqpair->cq_vaddr && pqpair->cpl) {
+ spdk_free(pqpair->cpl);
+ }
+ if (pqpair->tr) {
+ spdk_free(pqpair->tr);
+ }
+
+ nvme_qpair_deinit(qpair);
+
+ spdk_free(pqpair);
+
+ return 0;
+}
+
+static void
+nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
+{
+ nvme_pcie_qpair_abort_trackers(qpair, dnr);
+}
+
+static int
+nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
+
+ cmd->cdw10_bits.create_io_q.qid = io_que->id;
+ cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
+
+ cmd->cdw11_bits.create_io_cq.pc = 1;
+ cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static int
+nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
+
+ cmd->cdw10_bits.create_io_q.qid = io_que->id;
+ cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1;
+ cmd->cdw11_bits.create_io_sq.pc = 1;
+ cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio;
+ cmd->cdw11_bits.create_io_sq.cqid = io_que->id;
+ cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static int
+nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
+ cmd->cdw10_bits.delete_io_q.qid = qpair->id;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static int
+nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
+ cmd->cdw10_bits.delete_io_q.qid = qpair->id;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static int
+_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ uint16_t qid)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_completion_poll_status *status;
+ int rc;
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
+ if (rc != 0) {
+ free(status);
+ return rc;
+ }
+
+ if (nvme_wait_for_completion(ctrlr->adminq, status)) {
+ SPDK_ERRLOG("nvme_create_io_cq failed!\n");
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -1;
+ }
+
+ memset(status, 0, sizeof(*status));
+ rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status);
+ if (rc != 0) {
+ free(status);
+ return rc;
+ }
+
+ if (nvme_wait_for_completion(ctrlr->adminq, status)) {
+ SPDK_ERRLOG("nvme_create_io_sq failed!\n");
+ if (status->timed_out) {
+ /* The request is still queued; its memory will be freed in the completion
+ * callback. Allocate a new status tracker. */
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+ }
+
+ memset(status, 0, sizeof(*status));
+ /* Attempt to delete the completion queue */
+ rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status);
+ if (rc != 0) {
+ /* The original or newly allocated status structure can be freed since
+ * the corresponding request has been completed or failed to submit. */
+ free(status);
+ return -1;
+ }
+ nvme_wait_for_completion(ctrlr->adminq, status);
+ if (!status->timed_out) {
+ /* status can be freed regardless of nvme_wait_for_completion return value */
+ free(status);
+ }
+ return -1;
+ }
+
+ if (ctrlr->shadow_doorbell) {
+ pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) *
+ pctrlr->doorbell_stride_u32;
+ pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) *
+ pctrlr->doorbell_stride_u32;
+ pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) *
+ pctrlr->doorbell_stride_u32;
+ pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) *
+ pctrlr->doorbell_stride_u32;
+ pqpair->flags.has_shadow_doorbell = 1;
+ } else {
+ pqpair->flags.has_shadow_doorbell = 0;
+ }
+ nvme_pcie_qpair_reset(qpair);
+ free(status);
+
+ return 0;
+}
+
+static struct spdk_nvme_qpair *
+nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ struct nvme_pcie_qpair *pqpair;
+ struct spdk_nvme_qpair *qpair;
+ int rc;
+
+ assert(ctrlr != NULL);
+
+ pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (pqpair == NULL) {
+ return NULL;
+ }
+
+ pqpair->num_entries = opts->io_queue_size;
+ pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit;
+
+ qpair = &pqpair->qpair;
+
+ rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests);
+ if (rc != 0) {
+ nvme_pcie_qpair_destroy(qpair);
+ return NULL;
+ }
+
+ rc = nvme_pcie_qpair_construct(qpair, opts);
+
+ if (rc != 0) {
+ nvme_pcie_qpair_destroy(qpair);
+ return NULL;
+ }
+
+ return qpair;
+}
+
+static int
+nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ return 0;
+ } else {
+ return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
+ }
+}
+
+static void
+nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+}
+
+static int
+nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_completion_poll_status *status;
+ int rc;
+
+ assert(ctrlr != NULL);
+
+ if (ctrlr->is_removed) {
+ goto free;
+ }
+
+ status = calloc(1, sizeof(*status));
+ if (!status) {
+ SPDK_ERRLOG("Failed to allocate status tracker\n");
+ return -ENOMEM;
+ }
+
+ /* Delete the I/O submission queue */
+ rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc);
+ free(status);
+ return rc;
+ }
+ if (nvme_wait_for_completion(ctrlr->adminq, status)) {
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -1;
+ }
+
+ memset(status, 0, sizeof(*status));
+ /* Delete the completion queue */
+ rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc);
+ free(status);
+ return rc;
+ }
+ if (nvme_wait_for_completion(ctrlr->adminq, status)) {
+ if (!status->timed_out) {
+ free(status);
+ }
+ return -1;
+ }
+ free(status);
+
+free:
+ if (qpair->no_deletion_notification_needed == 0) {
+ /* Abort the rest of the I/O */
+ nvme_pcie_qpair_abort_trackers(qpair, 1);
+ }
+
+ nvme_pcie_qpair_destroy(qpair);
+ return 0;
+}
+
+static void
+nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
+{
+ /*
+ * Bad vtophys translation, so abort this request and return
+ * immediately.
+ */
+ nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_INVALID_FIELD,
+ 1 /* do not retry */, true);
+}
+
+/*
+ * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
+ *
+ * *prp_index will be updated to account for the number of PRP entries used.
+ */
+static inline int
+nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len,
+ uint32_t page_size)
+{
+ struct spdk_nvme_cmd *cmd = &tr->req->cmd;
+ uintptr_t page_mask = page_size - 1;
+ uint64_t phys_addr;
+ uint32_t i;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n",
+ *prp_index, virt_addr, (uint32_t)len);
+
+ if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
+ SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
+ return -EFAULT;
+ }
+
+ i = *prp_index;
+ while (len) {
+ uint32_t seg_len;
+
+ /*
+ * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
+ * so prp_index == count is valid.
+ */
+ if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
+ SPDK_ERRLOG("out of PRP entries\n");
+ return -EFAULT;
+ }
+
+ phys_addr = spdk_vtophys(virt_addr, NULL);
+ if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
+ SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
+ return -EFAULT;
+ }
+
+ if (i == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr);
+ cmd->dptr.prp.prp1 = phys_addr;
+ seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
+ } else {
+ if ((phys_addr & page_mask) != 0) {
+ SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
+ return -EFAULT;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
+ tr->u.prp[i - 1] = phys_addr;
+ seg_len = page_size;
+ }
+
+ seg_len = spdk_min(seg_len, len);
+ virt_addr += seg_len;
+ len -= seg_len;
+ i++;
+ }
+
+ cmd->psdt = SPDK_NVME_PSDT_PRP;
+ if (i <= 1) {
+ cmd->dptr.prp.prp2 = 0;
+ } else if (i == 2) {
+ cmd->dptr.prp.prp2 = tr->u.prp[0];
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
+ } else {
+ cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
+ }
+
+ *prp_index = i;
+ return 0;
+}
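+
+/*
+ * Illustrative sketch of how the PRP fields above get filled in, using a
+ * hypothetical transfer of 12288 bytes starting 0x200 bytes into a 4 KiB
+ * page: prp1 covers the first 4096 - 0x200 = 3584 bytes, the remaining 8704
+ * bytes need ceil(8704 / 4096) = 3 page-aligned entries, so prp2 points at a
+ * 3-entry PRP list in the tracker. The pointer-count arithmetic, standalone
+ * (example code only, kept out of the build):
+ */
+#if 0 /* example only */
+static uint32_t
+example_prp_count(uint32_t page_size, uint32_t first_page_offset, uint32_t len)
+{
+ uint32_t n = 1; /* prp1 always holds the first chunk */
+ uint32_t first_seg = page_size - first_page_offset;
+
+ if (len > first_seg) {
+ /* Each further page-sized chunk costs one more PRP entry. */
+ n += (len - first_seg + page_size - 1) / page_size;
+ }
+
+ /*
+ * n == 1: prp2 is unused; n == 2: prp2 holds the second address directly;
+ * n > 2: prp2 points at the PRP list, exactly as in the function above.
+ */
+ return n;
+}
+#endif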
+
+static int
+nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned)
+{
+ assert(0);
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -EINVAL;
+}
+
+/**
+ * Build PRP list describing physically contiguous payload buffer.
+ */
+static int
+nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
+ struct nvme_tracker *tr, bool dword_aligned)
+{
+ uint32_t prp_index = 0;
+ int rc;
+
+ rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset,
+ req->payload_size, qpair->ctrlr->page_size);
+ if (rc) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ }
+
+ return rc;
+}
+
+/**
+ * Build an SGL describing a physically contiguous payload buffer.
+ *
+ * This is more efficient than using PRP because large buffers can be
+ * described this way.
+ */
+static int
+nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
+ struct nvme_tracker *tr, bool dword_aligned)
+{
+ void *virt_addr;
+ uint64_t phys_addr, mapping_length;
+ uint32_t length;
+ struct spdk_nvme_sgl_descriptor *sgl;
+ uint32_t nseg = 0;
+
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
+
+ sgl = tr->u.sgl;
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.unkeyed.subtype = 0;
+
+ length = req->payload_size;
+ virt_addr = req->payload.contig_or_cb_arg + req->payload_offset;
+ mapping_length = length;
+
+ while (length > 0) {
+ if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -EFAULT;
+ }
+
+ if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
+ SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -EFAULT;
+ }
+
+ phys_addr = spdk_vtophys(virt_addr, &mapping_length);
+ if (phys_addr == SPDK_VTOPHYS_ERROR) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -EFAULT;
+ }
+
+ mapping_length = spdk_min(length, mapping_length);
+
+ length -= mapping_length;
+ virt_addr += mapping_length;
+
+ sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ sgl->unkeyed.length = mapping_length;
+ sgl->address = phys_addr;
+ sgl->unkeyed.subtype = 0;
+
+ sgl++;
+ nseg++;
+ }
+
+ if (nseg == 1) {
+ /*
+ * The whole transfer can be described by a single SGL descriptor.
+ * Use the special case described by the spec where SGL1's type is Data Block.
+ * This means the SGL in the tracker is not used at all, so copy the first (and only)
+ * SGL element into SGL1.
+ */
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
+ req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
+ } else {
+ /* The SPDK NVMe driver currently supports only one SGL segment, which is
+ * enough because NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
+ */
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
+ req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
+ req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
+ }
+
+ return 0;
+}
+
+/**
+ * Build SGL list describing scattered payload buffer.
+ */
+static int
+nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
+ struct nvme_tracker *tr, bool dword_aligned)
+{
+ int rc;
+ void *virt_addr;
+ uint64_t phys_addr;
+ uint32_t remaining_transfer_len, remaining_user_sge_len, length;
+ struct spdk_nvme_sgl_descriptor *sgl;
+ uint32_t nseg = 0;
+
+ /*
+ * Build scattered payloads.
+ */
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ assert(req->payload.next_sge_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ sgl = tr->u.sgl;
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.unkeyed.subtype = 0;
+
+ remaining_transfer_len = req->payload_size;
+
+ while (remaining_transfer_len > 0) {
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
+ &virt_addr, &remaining_user_sge_len);
+ if (rc) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -EFAULT;
+ }
+
+ /* Bit Bucket SGL descriptor */
+ if ((uint64_t)virt_addr == UINT64_MAX) {
+ /* TODO: enable WRITE and COMPARE when necessary */
+ if (req->cmd.opc != SPDK_NVME_OPC_READ) {
+ SPDK_ERRLOG("Only READ command can be supported\n");
+ goto exit;
+ }
+ if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
+ SPDK_ERRLOG("Too many SGL entries\n");
+ goto exit;
+ }
+
+ sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET;
+ /* If the SGL describes a destination data buffer (i.e. a read), the
+ * controller discards that many bytes of data, and the length is
+ * included in the Number of Logical Blocks (NLB) parameter. Otherwise,
+ * the length is not included in the NLB parameter.
+ */
+ remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
+ remaining_transfer_len -= remaining_user_sge_len;
+
+ sgl->unkeyed.length = remaining_user_sge_len;
+ sgl->address = 0;
+ sgl->unkeyed.subtype = 0;
+
+ sgl++;
+ nseg++;
+
+ continue;
+ }
+
+ remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
+ remaining_transfer_len -= remaining_user_sge_len;
+ while (remaining_user_sge_len > 0) {
+ if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
+ SPDK_ERRLOG("Too many SGL entries\n");
+ goto exit;
+ }
+
+ if (dword_aligned && ((uintptr_t)virt_addr & 3)) {
+ SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
+ goto exit;
+ }
+
+ phys_addr = spdk_vtophys(virt_addr, NULL);
+ if (phys_addr == SPDK_VTOPHYS_ERROR) {
+ goto exit;
+ }
+
+ length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr));
+ remaining_user_sge_len -= length;
+ virt_addr += length;
+
+ if (nseg > 0 && phys_addr ==
+ (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
+ /* extend previous entry */
+ (*(sgl - 1)).unkeyed.length += length;
+ continue;
+ }
+
+ sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ sgl->unkeyed.length = length;
+ sgl->address = phys_addr;
+ sgl->unkeyed.subtype = 0;
+
+ sgl++;
+ nseg++;
+ }
+ }
+
+ if (nseg == 1) {
+ /*
+ * The whole transfer can be described by a single SGL descriptor.
+ * Use the special case described by the spec where SGL1's type is Data Block.
+ * This means the SGL in the tracker is not used at all, so copy the first (and only)
+ * SGL element into SGL1.
+ */
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
+ req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
+ } else {
+ /* The SPDK NVMe driver currently supports only one SGL segment, which is
+ * enough because NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page.
+ */
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
+ req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
+ req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
+ }
+
+ return 0;
+
+exit:
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -EFAULT;
+}
+
+/**
+ * Build PRP list describing scattered payload buffer.
+ */
+static int
+nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
+ struct nvme_tracker *tr, bool dword_aligned)
+{
+ int rc;
+ void *virt_addr;
+ uint32_t remaining_transfer_len, length;
+ uint32_t prp_index = 0;
+ uint32_t page_size = qpair->ctrlr->page_size;
+
+ /*
+ * Build scattered payloads.
+ */
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ remaining_transfer_len = req->payload_size;
+ while (remaining_transfer_len > 0) {
+ assert(req->payload.next_sge_fn != NULL);
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
+ if (rc) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -EFAULT;
+ }
+
+ length = spdk_min(remaining_transfer_len, length);
+
+ /*
+ * Any incompatible sges should have been handled up in the splitting routine,
+ * but assert here as an additional check.
+ *
+ * All SGEs except last must end on a page boundary.
+ */
+ assert((length == remaining_transfer_len) ||
+ _is_page_aligned((uintptr_t)virt_addr + length, page_size));
+
+ rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
+ if (rc) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return rc;
+ }
+
+ remaining_transfer_len -= length;
+ }
+
+ return 0;
+}
+
+typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *,
+ bool);
+
+static build_req_fn const g_nvme_pcie_build_req_table[][2] = {
+ [NVME_PAYLOAD_TYPE_INVALID] = {
+ nvme_pcie_qpair_build_request_invalid, /* PRP */
+ nvme_pcie_qpair_build_request_invalid /* SGL */
+ },
+ [NVME_PAYLOAD_TYPE_CONTIG] = {
+ nvme_pcie_qpair_build_contig_request, /* PRP */
+ nvme_pcie_qpair_build_contig_hw_sgl_request /* SGL */
+ },
+ [NVME_PAYLOAD_TYPE_SGL] = {
+ nvme_pcie_qpair_build_prps_sgl_request, /* PRP */
+ nvme_pcie_qpair_build_hw_sgl_request /* SGL */
+ }
+};
+
+static int
+nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
+ bool sgl_supported, bool dword_aligned)
+{
+ void *md_payload;
+ struct nvme_request *req = tr->req;
+
+ if (req->payload.md) {
+ md_payload = req->payload.md + req->md_offset;
+ if (dword_aligned && ((uintptr_t)md_payload & 3)) {
+ SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload);
+ goto exit;
+ }
+
+ if (sgl_supported && dword_aligned) {
+ assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG);
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL;
+ tr->meta_sgl.address = spdk_vtophys(md_payload, NULL);
+ if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR) {
+ goto exit;
+ }
+ tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ tr->meta_sgl.unkeyed.length = req->md_size;
+ tr->meta_sgl.unkeyed.subtype = 0;
+ req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor);
+ } else {
+ req->cmd.mptr = spdk_vtophys(md_payload, NULL);
+ if (req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
+ goto exit;
+ }
+ }
+ }
+
+ return 0;
+
+exit:
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -EINVAL;
+}
+
+static int
+nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ struct nvme_tracker *tr;
+ int rc = 0;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ enum nvme_payload_type payload_type;
+ bool sgl_supported;
+ bool dword_aligned = true;
+
+ if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ }
+
+ tr = TAILQ_FIRST(&pqpair->free_tr);
+
+ if (tr == NULL) {
+ /* Inform the upper layer to try again later. */
+ rc = -EAGAIN;
+ goto exit;
+ }
+
+ TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
+ TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
+ tr->req = req;
+ tr->cb_fn = req->cb_fn;
+ tr->cb_arg = req->cb_arg;
+ req->cmd.cid = tr->cid;
+
+ if (req->payload_size != 0) {
+ payload_type = nvme_payload_type(&req->payload);
+ /* According to the specification, PRPs shall be used for all
+ * Admin commands for NVMe over PCIe implementations.
+ */
+ sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 &&
+ !nvme_qpair_is_admin_queue(qpair);
+
+ if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) {
+ dword_aligned = false;
+ }
+ rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned);
+ if (rc < 0) {
+ goto exit;
+ }
+
+ rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, dword_aligned);
+ if (rc < 0) {
+ goto exit;
+ }
+ }
+
+ nvme_pcie_qpair_submit_tracker(qpair, tr);
+
+exit:
+ if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ }
+
+ return rc;
+}
+
+static void
+nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
+{
+ uint64_t t02;
+ struct nvme_tracker *tr, *tmp;
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ /* Don't check timeouts during controller initialization. */
+ if (ctrlr->state != NVME_CTRLR_STATE_READY) {
+ return;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ } else {
+ active_proc = qpair->active_proc;
+ }
+
+ /* Only check timeouts if the current process has a timeout callback. */
+ if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
+ return;
+ }
+
+ t02 = spdk_get_ticks();
+ TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
+ assert(tr->req != NULL);
+
+ if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
+ /*
+ * The requests are in order, so as soon as one has not timed out,
+ * stop iterating.
+ */
+ break;
+ }
+ }
+}
+
+static int32_t
+nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_tracker *tr;
+ struct spdk_nvme_cpl *cpl, *next_cpl;
+ uint32_t num_completions = 0;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ uint16_t next_cq_head;
+ uint8_t next_phase;
+ bool next_is_valid = false;
+
+ if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ }
+
+ if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
+ /*
+ * max_completions == 0 means unlimited, but complete at most a batch of
+ * max_completions_cap I/Os at a time so that the completion queue
+ * doorbells don't wrap around.
+ */
+ max_completions = pqpair->max_completions_cap;
+ }
+
+ while (1) {
+ cpl = &pqpair->cpl[pqpair->cq_head];
+
+ if (!next_is_valid && cpl->status.p != pqpair->flags.phase) {
+ break;
+ }
+
+ if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) {
+ next_cq_head = pqpair->cq_head + 1;
+ next_phase = pqpair->flags.phase;
+ } else {
+ next_cq_head = 0;
+ next_phase = !pqpair->flags.phase;
+ }
+ next_cpl = &pqpair->cpl[next_cq_head];
+ next_is_valid = (next_cpl->status.p == next_phase);
+ if (next_is_valid) {
+ __builtin_prefetch(&pqpair->tr[next_cpl->cid]);
+ }
+
+#ifdef __PPC64__
+ /*
+ * This memory barrier prevents reordering of:
+ * - load after store from/to tr
+ * - load after load cpl phase and cpl cid
+ */
+ spdk_mb();
+#elif defined(__aarch64__)
+ __asm volatile("dmb oshld" ::: "memory");
+#endif
+
+ if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
+ pqpair->cq_head = 0;
+ pqpair->flags.phase = !pqpair->flags.phase;
+ }
+
+ tr = &pqpair->tr[cpl->cid];
+ /* Prefetch the req's STAILQ_ENTRY since we'll need to access it
+ * as part of putting the req back on the qpair's free list.
+ */
+ __builtin_prefetch(&tr->req->stailq);
+ pqpair->sq_head = cpl->sqhd;
+
+ if (tr->req) {
+ nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
+ } else {
+ SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
+ spdk_nvme_qpair_print_completion(qpair, cpl);
+ assert(0);
+ }
+
+ if (++num_completions == max_completions) {
+ break;
+ }
+ }
+
+ if (num_completions > 0) {
+ nvme_pcie_qpair_ring_cq_doorbell(qpair);
+ }
+
+ if (pqpair->flags.delay_cmd_submit) {
+ if (pqpair->last_sq_tail != pqpair->sq_tail) {
+ nvme_pcie_qpair_ring_sq_doorbell(qpair);
+ pqpair->last_sq_tail = pqpair->sq_tail;
+ }
+ }
+
+ if (spdk_unlikely(ctrlr->timeout_enabled)) {
+ /*
+ * User registered for timeout callback
+ */
+ nvme_pcie_qpair_check_timeout(qpair);
+ }
+
+ /* Before returning, complete any pending admin request. */
+ if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
+ nvme_pcie_qpair_complete_pending_admin_request(qpair);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ }
+
+ return num_completions;
+}
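+
+/*
+ * Illustrative sketch of the phase-tag logic above, reduced to its core: a
+ * completion entry is new only if its phase bit matches the phase the host
+ * expects, and the expected phase flips every time cq_head wraps, so entries
+ * left over from the previous pass are never consumed twice. A simplified
+ * standalone version (example code only, kept out of the build):
+ */
+#if 0 /* example only */
+struct example_cpl { uint16_t cid; uint8_t phase; };
+
+static uint32_t
+example_drain_cq(struct example_cpl *cq, uint32_t num_entries,
+ uint32_t *cq_head, uint8_t *expected_phase)
+{
+ uint32_t completed = 0;
+
+ while (cq[*cq_head].phase == *expected_phase) {
+ /* ... complete the tracker identified by cq[*cq_head].cid ... */
+ completed++;
+ if (++(*cq_head) == num_entries) {
+ *cq_head = 0;
+ *expected_phase = !*expected_phase;
+ }
+ }
+
+ return completed;
+}
+#endif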
+
+static struct spdk_nvme_transport_poll_group *
+nvme_pcie_poll_group_create(void)
+{
+ struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group));
+
+ if (group == NULL) {
+ SPDK_ERRLOG("Unable to allocate poll group.\n");
+ return NULL;
+ }
+
+ return &group->group;
+}
+
+static int
+nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ return 0;
+}
+
+static int
+nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ return 0;
+}
+
+static int
+nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair)
+{
+ return 0;
+}
+
+static int
+nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair)
+{
+ return 0;
+}
+
+static int64_t
+nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
+ uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
+{
+ struct spdk_nvme_qpair *qpair, *tmp_qpair;
+ int32_t local_completions = 0;
+ int64_t total_completions = 0;
+
+ STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
+ disconnected_qpair_cb(qpair, tgroup->group->ctx);
+ }
+
+ STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
+ local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair);
+ if (local_completions < 0) {
+ disconnected_qpair_cb(qpair, tgroup->group->ctx);
+ local_completions = 0;
+ }
+ total_completions += local_completions;
+ }
+
+ return total_completions;
+}
+
+static int
+nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
+{
+ if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
+ return -EBUSY;
+ }
+
+ free(tgroup);
+
+ return 0;
+}
+
+static struct spdk_pci_id nvme_pci_driver_id[] = {
+ {
+ .class_id = SPDK_PCI_CLASS_NVME,
+ .vendor_id = SPDK_PCI_ANY_ID,
+ .device_id = SPDK_PCI_ANY_ID,
+ .subvendor_id = SPDK_PCI_ANY_ID,
+ .subdevice_id = SPDK_PCI_ANY_ID,
+ },
+ { .vendor_id = 0, /* sentinel */ },
+};
+
+SPDK_PCI_DRIVER_REGISTER("nvme", nvme_pci_driver_id,
+ SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE);
+
+const struct spdk_nvme_transport_ops pcie_ops = {
+ .name = "PCIE",
+ .type = SPDK_NVME_TRANSPORT_PCIE,
+ .ctrlr_construct = nvme_pcie_ctrlr_construct,
+ .ctrlr_scan = nvme_pcie_ctrlr_scan,
+ .ctrlr_destruct = nvme_pcie_ctrlr_destruct,
+ .ctrlr_enable = nvme_pcie_ctrlr_enable,
+
+ .ctrlr_set_reg_4 = nvme_pcie_ctrlr_set_reg_4,
+ .ctrlr_set_reg_8 = nvme_pcie_ctrlr_set_reg_8,
+ .ctrlr_get_reg_4 = nvme_pcie_ctrlr_get_reg_4,
+ .ctrlr_get_reg_8 = nvme_pcie_ctrlr_get_reg_8,
+
+ .ctrlr_get_max_xfer_size = nvme_pcie_ctrlr_get_max_xfer_size,
+ .ctrlr_get_max_sges = nvme_pcie_ctrlr_get_max_sges,
+
+ .ctrlr_reserve_cmb = nvme_pcie_ctrlr_reserve_cmb,
+ .ctrlr_map_cmb = nvme_pcie_ctrlr_map_io_cmb,
+ .ctrlr_unmap_cmb = nvme_pcie_ctrlr_unmap_io_cmb,
+
+ .ctrlr_create_io_qpair = nvme_pcie_ctrlr_create_io_qpair,
+ .ctrlr_delete_io_qpair = nvme_pcie_ctrlr_delete_io_qpair,
+ .ctrlr_connect_qpair = nvme_pcie_ctrlr_connect_qpair,
+ .ctrlr_disconnect_qpair = nvme_pcie_ctrlr_disconnect_qpair,
+
+ .qpair_abort_reqs = nvme_pcie_qpair_abort_reqs,
+ .qpair_reset = nvme_pcie_qpair_reset,
+ .qpair_submit_request = nvme_pcie_qpair_submit_request,
+ .qpair_process_completions = nvme_pcie_qpair_process_completions,
+ .qpair_iterate_requests = nvme_pcie_qpair_iterate_requests,
+ .admin_qpair_abort_aers = nvme_pcie_admin_qpair_abort_aers,
+
+ .poll_group_create = nvme_pcie_poll_group_create,
+ .poll_group_connect_qpair = nvme_pcie_poll_group_connect_qpair,
+ .poll_group_disconnect_qpair = nvme_pcie_poll_group_disconnect_qpair,
+ .poll_group_add = nvme_pcie_poll_group_add,
+ .poll_group_remove = nvme_pcie_poll_group_remove,
+ .poll_group_process_completions = nvme_pcie_poll_group_process_completions,
+ .poll_group_destroy = nvme_pcie_poll_group_destroy,
+};
+
+SPDK_NVME_TRANSPORT_REGISTER(pcie, &pcie_ops);
diff --git a/src/spdk/lib/nvme/nvme_poll_group.c b/src/spdk/lib/nvme/nvme_poll_group.c
new file mode 100644
index 000000000..291f55e63
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_poll_group.c
@@ -0,0 +1,164 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "nvme_internal.h"
+
+struct spdk_nvme_poll_group *
+spdk_nvme_poll_group_create(void *ctx)
+{
+ struct spdk_nvme_poll_group *group;
+
+ group = calloc(1, sizeof(*group));
+ if (group == NULL) {
+ return NULL;
+ }
+
+ group->ctx = ctx;
+ STAILQ_INIT(&group->tgroups);
+
+ return group;
+}
+
+int
+spdk_nvme_poll_group_add(struct spdk_nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_transport_poll_group *tgroup;
+ const struct spdk_nvme_transport *transport;
+
+ if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) {
+ return -EINVAL;
+ }
+
+ STAILQ_FOREACH(tgroup, &group->tgroups, link) {
+ if (tgroup->transport == qpair->transport) {
+ break;
+ }
+ }
+
+ /* See if a new transport has been added (dlopen style) and we need to update the poll group */
+ if (!tgroup) {
+ transport = nvme_get_first_transport();
+ while (transport != NULL) {
+ if (transport == qpair->transport) {
+ tgroup = nvme_transport_poll_group_create(transport);
+ if (tgroup == NULL) {
+ return -ENOMEM;
+ }
+ tgroup->group = group;
+ STAILQ_INSERT_TAIL(&group->tgroups, tgroup, link);
+ break;
+ }
+ transport = nvme_get_next_transport(transport);
+ }
+ }
+
+ return tgroup ? nvme_transport_poll_group_add(tgroup, qpair) : -ENODEV;
+}
+
+int
+spdk_nvme_poll_group_remove(struct spdk_nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_transport_poll_group *tgroup;
+
+ STAILQ_FOREACH(tgroup, &group->tgroups, link) {
+ if (tgroup->transport == qpair->transport) {
+ return nvme_transport_poll_group_remove(tgroup, qpair);
+ }
+ }
+
+ return -ENODEV;
+}
+
+int
+nvme_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ return nvme_transport_poll_group_connect_qpair(qpair);
+}
+
+int
+nvme_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ return nvme_transport_poll_group_disconnect_qpair(qpair);
+}
+
+int64_t
+spdk_nvme_poll_group_process_completions(struct spdk_nvme_poll_group *group,
+ uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
+{
+ struct spdk_nvme_transport_poll_group *tgroup;
+ int64_t local_completions = 0, error_reason = 0, num_completions = 0;
+
+ if (disconnected_qpair_cb == NULL) {
+ return -EINVAL;
+ }
+
+ STAILQ_FOREACH(tgroup, &group->tgroups, link) {
+ local_completions = nvme_transport_poll_group_process_completions(tgroup, completions_per_qpair,
+ disconnected_qpair_cb);
+ if (local_completions < 0 && error_reason == 0) {
+ error_reason = local_completions;
+ } else {
+ num_completions += local_completions;
+ /* Just to be safe */
+ assert(num_completions >= 0);
+ }
+ }
+
+ return error_reason ? error_reason : num_completions;
+}
+
+void *
+spdk_nvme_poll_group_get_ctx(struct spdk_nvme_poll_group *group)
+{
+ return group->ctx;
+}
+
+int
+spdk_nvme_poll_group_destroy(struct spdk_nvme_poll_group *group)
+{
+ struct spdk_nvme_transport_poll_group *tgroup, *tmp_tgroup;
+
+ STAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp_tgroup) {
+ STAILQ_REMOVE(&group->tgroups, tgroup, spdk_nvme_transport_poll_group, link);
+ if (nvme_transport_poll_group_destroy(tgroup) != 0) {
+ STAILQ_INSERT_TAIL(&group->tgroups, tgroup, link);
+ return -EBUSY;
+ }
+
+ }
+
+ free(group);
+
+ return 0;
+}
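+
+/*
+ * Illustrative usage sketch for the poll group API above: create a group,
+ * add a qpair (which must still be in the disconnected state, as checked in
+ * spdk_nvme_poll_group_add()), poll it, then tear everything down. The qpair
+ * itself is assumed to have been created elsewhere, and error handling is
+ * reduced to early returns (example code only, kept out of the build):
+ */
+#if 0 /* example only */
+static void
+example_disconnected_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
+{
+ /* Reconnect or release the qpair; invoked for each disconnected qpair. */
+}
+
+static int
+example_poll_loop(struct spdk_nvme_qpair *qpair, void *app_ctx, volatile bool *running)
+{
+ struct spdk_nvme_poll_group *group = spdk_nvme_poll_group_create(app_ctx);
+
+ if (group == NULL) {
+ return -1;
+ }
+ if (spdk_nvme_poll_group_add(group, qpair) != 0) {
+ spdk_nvme_poll_group_destroy(group);
+ return -1;
+ }
+
+ while (*running) {
+ /* 0 lets each transport pick its own per-qpair completion batch size. */
+ spdk_nvme_poll_group_process_completions(group, 0, example_disconnected_cb);
+ }
+
+ spdk_nvme_poll_group_remove(group, qpair);
+ return spdk_nvme_poll_group_destroy(group);
+}
+#endif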
diff --git a/src/spdk/lib/nvme/nvme_qpair.c b/src/spdk/lib/nvme/nvme_qpair.c
new file mode 100644
index 000000000..a3fdc2169
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_qpair.c
@@ -0,0 +1,1064 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+#include "spdk/nvme_ocssd.h"
+
+#define NVME_CMD_DPTR_STR_SIZE 256
+
+static int nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req);
+
+struct nvme_string {
+ uint16_t value;
+ const char *str;
+};
+
+static const struct nvme_string admin_opcode[] = {
+ { SPDK_NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" },
+ { SPDK_NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" },
+ { SPDK_NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" },
+ { SPDK_NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" },
+ { SPDK_NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" },
+ { SPDK_NVME_OPC_IDENTIFY, "IDENTIFY" },
+ { SPDK_NVME_OPC_ABORT, "ABORT" },
+ { SPDK_NVME_OPC_SET_FEATURES, "SET FEATURES" },
+ { SPDK_NVME_OPC_GET_FEATURES, "GET FEATURES" },
+ { SPDK_NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" },
+ { SPDK_NVME_OPC_NS_MANAGEMENT, "NAMESPACE MANAGEMENT" },
+ { SPDK_NVME_OPC_FIRMWARE_COMMIT, "FIRMWARE COMMIT" },
+ { SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" },
+ { SPDK_NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" },
+ { SPDK_NVME_OPC_NS_ATTACHMENT, "NAMESPACE ATTACHMENT" },
+ { SPDK_NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" },
+ { SPDK_NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" },
+ { SPDK_NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" },
+ { SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" },
+ { SPDK_NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" },
+ { SPDK_NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" },
+ { SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" },
+ { SPDK_NVME_OPC_FABRIC, "FABRIC" },
+ { SPDK_NVME_OPC_FORMAT_NVM, "FORMAT NVM" },
+ { SPDK_NVME_OPC_SECURITY_SEND, "SECURITY SEND" },
+ { SPDK_NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" },
+ { SPDK_NVME_OPC_SANITIZE, "SANITIZE" },
+ { SPDK_NVME_OPC_GET_LBA_STATUS, "GET LBA STATUS" },
+ { SPDK_OCSSD_OPC_GEOMETRY, "OCSSD / GEOMETRY" },
+ { 0xFFFF, "ADMIN COMMAND" }
+};
+
+static const struct nvme_string fabric_opcode[] = {
+ { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET, "PROPERTY SET" },
+ { SPDK_NVMF_FABRIC_COMMAND_CONNECT, "CONNECT" },
+ { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET, "PROPERTY GET" },
+ { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_SEND, "AUTHENTICATION SEND" },
+ { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_RECV, "AUTHENTICATION RECV" },
+ { 0xFFFF, "RESERVED / VENDOR SPECIFIC" }
+};
+
+static const struct nvme_string feat_opcode[] = {
+ { SPDK_NVME_FEAT_ARBITRATION, "ARBITRATION" },
+ { SPDK_NVME_FEAT_POWER_MANAGEMENT, "POWER MANAGEMENT" },
+ { SPDK_NVME_FEAT_LBA_RANGE_TYPE, "LBA RANGE TYPE" },
+ { SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD, "TEMPERATURE THRESHOLD" },
+ { SPDK_NVME_FEAT_ERROR_RECOVERY, "ERROR RECOVERY" },
+ { SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE, "VOLATILE WRITE CACHE" },
+ { SPDK_NVME_FEAT_NUMBER_OF_QUEUES, "NUMBER OF QUEUES" },
+ { SPDK_NVME_FEAT_INTERRUPT_COALESCING, "INTERRUPT COALESCING" },
+ { SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION, "INTERRUPT VECTOR CONFIGURATION" },
+ { SPDK_NVME_FEAT_WRITE_ATOMICITY, "WRITE ATOMICITY" },
+ { SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, "ASYNC EVENT CONFIGURATION" },
+ { SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION, "AUTONOMOUS POWER STATE TRANSITION" },
+ { SPDK_NVME_FEAT_HOST_MEM_BUFFER, "HOST MEM BUFFER" },
+ { SPDK_NVME_FEAT_TIMESTAMP, "TIMESTAMP" },
+ { SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, "KEEP ALIVE TIMER" },
+ { SPDK_NVME_FEAT_HOST_CONTROLLED_THERMAL_MANAGEMENT, "HOST CONTROLLED THERMAL MANAGEMENT" },
+ { SPDK_NVME_FEAT_NON_OPERATIONAL_POWER_STATE_CONFIG, "NON OPERATIONAL POWER STATE CONFIG" },
+ { SPDK_NVME_FEAT_SOFTWARE_PROGRESS_MARKER, "SOFTWARE PROGRESS MARKER" },
+ { SPDK_NVME_FEAT_HOST_IDENTIFIER, "HOST IDENTIFIER" },
+ { SPDK_NVME_FEAT_HOST_RESERVE_MASK, "HOST RESERVE MASK" },
+ { SPDK_NVME_FEAT_HOST_RESERVE_PERSIST, "HOST RESERVE PERSIST" },
+ { 0xFFFF, "RESERVED" }
+};
+
+static const struct nvme_string io_opcode[] = {
+ { SPDK_NVME_OPC_FLUSH, "FLUSH" },
+ { SPDK_NVME_OPC_WRITE, "WRITE" },
+ { SPDK_NVME_OPC_READ, "READ" },
+ { SPDK_NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" },
+ { SPDK_NVME_OPC_COMPARE, "COMPARE" },
+ { SPDK_NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" },
+ { SPDK_NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" },
+ { SPDK_NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" },
+ { SPDK_NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" },
+ { SPDK_NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" },
+ { SPDK_NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" },
+ { SPDK_OCSSD_OPC_VECTOR_RESET, "OCSSD / VECTOR RESET" },
+ { SPDK_OCSSD_OPC_VECTOR_WRITE, "OCSSD / VECTOR WRITE" },
+ { SPDK_OCSSD_OPC_VECTOR_READ, "OCSSD / VECTOR READ" },
+ { SPDK_OCSSD_OPC_VECTOR_COPY, "OCSSD / VECTOR COPY" },
+ { 0xFFFF, "IO COMMAND" }
+};
+
+static const struct nvme_string sgl_type[] = {
+ { SPDK_NVME_SGL_TYPE_DATA_BLOCK, "DATA BLOCK" },
+ { SPDK_NVME_SGL_TYPE_BIT_BUCKET, "BIT BUCKET" },
+ { SPDK_NVME_SGL_TYPE_SEGMENT, "SEGMENT" },
+ { SPDK_NVME_SGL_TYPE_LAST_SEGMENT, "LAST SEGMENT" },
+ { SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK, "TRANSPORT DATA BLOCK" },
+ { SPDK_NVME_SGL_TYPE_VENDOR_SPECIFIC, "VENDOR SPECIFIC" },
+ { 0xFFFF, "RESERVED" }
+};
+
+static const struct nvme_string sgl_subtype[] = {
+ { SPDK_NVME_SGL_SUBTYPE_ADDRESS, "ADDRESS" },
+ { SPDK_NVME_SGL_SUBTYPE_OFFSET, "OFFSET" },
+ { SPDK_NVME_SGL_SUBTYPE_TRANSPORT, "TRANSPORT" },
+ { SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY, "INVALIDATE KEY" },
+ { 0xFFFF, "RESERVED" }
+};
+
+static const char *
+nvme_get_string(const struct nvme_string *strings, uint16_t value)
+{
+ const struct nvme_string *entry;
+
+ entry = strings;
+
+ while (entry->value != 0xFFFF) {
+ if (entry->value == value) {
+ return entry->str;
+ }
+ entry++;
+ }
+ return entry->str;
+}
+
+static void
+nvme_get_sgl_unkeyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd)
+{
+ struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1;
+
+ snprintf(buf, size, " len:0x%x", sgl->unkeyed.length);
+}
+
+static void
+nvme_get_sgl_keyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd)
+{
+ struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1;
+
+ snprintf(buf, size, " len:0x%x key:0x%x", sgl->keyed.length, sgl->keyed.key);
+}
+
+static void
+nvme_get_sgl(char *buf, size_t size, struct spdk_nvme_cmd *cmd)
+{
+ struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1;
+ int c;
+
+ c = snprintf(buf, size, "SGL %s %s 0x%" PRIx64, nvme_get_string(sgl_type, sgl->generic.type),
+ nvme_get_string(sgl_subtype, sgl->generic.subtype), sgl->address);
+ assert(c >= 0 && (size_t)c < size);
+
+ if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) {
+ nvme_get_sgl_keyed(buf + c, size - c, cmd);
+ }
+
+ if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) {
+ nvme_get_sgl_unkeyed(buf + c, size - c, cmd);
+ }
+}
+
+static void
+nvme_get_prp(char *buf, size_t size, struct spdk_nvme_cmd *cmd)
+{
+ snprintf(buf, size, "PRP1 0x%" PRIx64 " PRP2 0x%" PRIx64, cmd->dptr.prp.prp1, cmd->dptr.prp.prp2);
+}
+
+static void
+nvme_get_dptr(char *buf, size_t size, struct spdk_nvme_cmd *cmd)
+{
+ if (spdk_nvme_opc_get_data_transfer(cmd->opc) != SPDK_NVME_DATA_NONE) {
+ switch (cmd->psdt) {
+ case SPDK_NVME_PSDT_PRP:
+ nvme_get_prp(buf, size, cmd);
+ break;
+ case SPDK_NVME_PSDT_SGL_MPTR_CONTIG:
+ case SPDK_NVME_PSDT_SGL_MPTR_SGL:
+ nvme_get_sgl(buf, size, cmd);
+ break;
+ default:
+ ;
+ }
+ }
+}
+
+static void
+nvme_admin_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd)
+{
+ struct spdk_nvmf_capsule_cmd *fcmd = (void *)cmd;
+ char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'};
+
+ assert(cmd != NULL);
+
+ nvme_get_dptr(dptr, sizeof(dptr), cmd);
+
+ switch ((int)cmd->opc) {
+ case SPDK_NVME_OPC_SET_FEATURES:
+ case SPDK_NVME_OPC_GET_FEATURES:
+ SPDK_NOTICELOG("%s %s cid:%d cdw10:%08x %s\n",
+ nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(feat_opcode,
+ cmd->cdw10_bits.set_features.fid), cmd->cid, cmd->cdw10, dptr);
+ break;
+ case SPDK_NVME_OPC_FABRIC:
+ SPDK_NOTICELOG("%s %s qid:%d cid:%d %s\n",
+ nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(fabric_opcode, fcmd->fctype), qid,
+ fcmd->cid, dptr);
+ break;
+ default:
+ SPDK_NOTICELOG("%s (%02x) qid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x %s\n",
+ nvme_get_string(admin_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid, cmd->cdw10,
+ cmd->cdw11, dptr);
+ }
+}
+
+static void
+nvme_io_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd)
+{
+ char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'};
+
+ assert(cmd != NULL);
+
+ nvme_get_dptr(dptr, sizeof(dptr), cmd);
+
+ switch ((int)cmd->opc) {
+ case SPDK_NVME_OPC_WRITE:
+ case SPDK_NVME_OPC_READ:
+ case SPDK_NVME_OPC_WRITE_UNCORRECTABLE:
+ case SPDK_NVME_OPC_COMPARE:
+ SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d "
+ "lba:%llu len:%d %s\n",
+ nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid,
+ ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10,
+ (cmd->cdw12 & 0xFFFF) + 1, dptr);
+ break;
+ case SPDK_NVME_OPC_FLUSH:
+ case SPDK_NVME_OPC_DATASET_MANAGEMENT:
+ SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d\n",
+ nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid);
+ break;
+ default:
+ SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%d\n",
+ nvme_get_string(io_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid);
+ break;
+ }
+}
+
+void
+spdk_nvme_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd)
+{
+ assert(cmd != NULL);
+
+ if (qid == 0 || cmd->opc == SPDK_NVME_OPC_FABRIC) {
+ nvme_admin_qpair_print_command(qid, cmd);
+ } else {
+ nvme_io_qpair_print_command(qid, cmd);
+ }
+}
+
+void
+spdk_nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd)
+{
+ assert(qpair != NULL);
+ assert(cmd != NULL);
+
+ spdk_nvme_print_command(qpair->id, cmd);
+}
+
+static const struct nvme_string generic_status[] = {
+ { SPDK_NVME_SC_SUCCESS, "SUCCESS" },
+ { SPDK_NVME_SC_INVALID_OPCODE, "INVALID OPCODE" },
+ { SPDK_NVME_SC_INVALID_FIELD, "INVALID FIELD" },
+ { SPDK_NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" },
+ { SPDK_NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" },
+ { SPDK_NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" },
+ { SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" },
+ { SPDK_NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" },
+ { SPDK_NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" },
+ { SPDK_NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" },
+ { SPDK_NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" },
+ { SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" },
+ { SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" },
+ { SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR, "INVALID SGL SEGMENT DESCRIPTOR" },
+ { SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS, "INVALID NUMBER OF SGL DESCRIPTORS" },
+ { SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" },
+ { SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" },
+ { SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" },
+ { SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF, "INVALID CONTROLLER MEMORY BUFFER" },
+ { SPDK_NVME_SC_INVALID_PRP_OFFSET, "INVALID PRP OFFSET" },
+ { SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" },
+ { SPDK_NVME_SC_OPERATION_DENIED, "OPERATION DENIED" },
+ { SPDK_NVME_SC_INVALID_SGL_OFFSET, "INVALID SGL OFFSET" },
+ { SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT, "HOSTID INCONSISTENT FORMAT" },
+ { SPDK_NVME_SC_KEEP_ALIVE_EXPIRED, "KEEP ALIVE EXPIRED" },
+ { SPDK_NVME_SC_KEEP_ALIVE_INVALID, "KEEP ALIVE INVALID" },
+ { SPDK_NVME_SC_ABORTED_PREEMPT, "ABORTED - PREEMPT AND ABORT" },
+ { SPDK_NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" },
+ { SPDK_NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" },
+ { SPDK_NVME_SC_SGL_DATA_BLOCK_GRANULARITY_INVALID, "DATA BLOCK GRANULARITY INVALID" },
+ { SPDK_NVME_SC_COMMAND_INVALID_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" },
+ { SPDK_NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" },
+ { SPDK_NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" },
+ { SPDK_NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" },
+ { SPDK_NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" },
+ { SPDK_NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" },
+ { 0xFFFF, "GENERIC" }
+};
+
+static const struct nvme_string command_specific_status[] = {
+ { SPDK_NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" },
+ { SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" },
+ { SPDK_NVME_SC_INVALID_QUEUE_SIZE, "INVALID QUEUE SIZE" },
+ { SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" },
+ { SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
+ { SPDK_NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" },
+ { SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" },
+ { SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" },
+ { SPDK_NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" },
+ { SPDK_NVME_SC_INVALID_FORMAT, "INVALID FORMAT" },
+ { SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET, "FIRMWARE REQUIRES CONVENTIONAL RESET" },
+ { SPDK_NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" },
+ { SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE, "FEATURE ID NOT SAVEABLE" },
+ { SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" },
+ { SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
+ { SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET, "FIRMWARE REQUIRES NVM RESET" },
+ { SPDK_NVME_SC_FIRMWARE_REQ_RESET, "FIRMWARE REQUIRES RESET" },
+ { SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION, "FIRMWARE REQUIRES MAX TIME VIOLATION" },
+ { SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
+ { SPDK_NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" },
+ { SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
+ { SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE, "NAMESPACE ID UNAVAILABLE" },
+ { SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" },
+ { SPDK_NVME_SC_NAMESPACE_IS_PRIVATE, "NAMESPACE IS PRIVATE" },
+ { SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED, "NAMESPACE NOT ATTACHED" },
+ { SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED, "THINPROVISIONING NOT SUPPORTED" },
+ { SPDK_NVME_SC_CONTROLLER_LIST_INVALID, "CONTROLLER LIST INVALID" },
+ { SPDK_NVME_SC_DEVICE_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" },
+ { SPDK_NVME_SC_BOOT_PARTITION_WRITE_PROHIBITED, "BOOT PARTITION WRITE PROHIBITED" },
+ { SPDK_NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER ID" },
+ { SPDK_NVME_SC_INVALID_SECONDARY_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" },
+ { SPDK_NVME_SC_INVALID_NUM_CTRLR_RESOURCES, "INVALID NUMBER OF CONTROLLER RESOURCES" },
+ { SPDK_NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" },
+ { SPDK_NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" },
+ { SPDK_NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" },
+ { SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_RANGE, "WRITE TO RO RANGE" },
+ { 0xFFFF, "COMMAND SPECIFIC" }
+};
+
+static const struct nvme_string media_error_status[] = {
+ { SPDK_NVME_SC_WRITE_FAULTS, "WRITE FAULTS" },
+ { SPDK_NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" },
+ { SPDK_NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" },
+ { SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" },
+ { SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" },
+ { SPDK_NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" },
+ { SPDK_NVME_SC_ACCESS_DENIED, "ACCESS DENIED" },
+ { SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" },
+ { SPDK_OCSSD_SC_OFFLINE_CHUNK, "RESET OFFLINE CHUNK" },
+ { SPDK_OCSSD_SC_INVALID_RESET, "INVALID RESET" },
+ { SPDK_OCSSD_SC_WRITE_FAIL_WRITE_NEXT_UNIT, "WRITE FAIL WRITE NEXT UNIT" },
+ { SPDK_OCSSD_SC_WRITE_FAIL_CHUNK_EARLY_CLOSE, "WRITE FAIL CHUNK EARLY CLOSE" },
+ { SPDK_OCSSD_SC_OUT_OF_ORDER_WRITE, "OUT OF ORDER WRITE" },
+ { SPDK_OCSSD_SC_READ_HIGH_ECC, "READ HIGH ECC" },
+ { 0xFFFF, "MEDIA ERROR" }
+};
+
+static const struct nvme_string path_status[] = {
+ { SPDK_NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" },
+ { SPDK_NVME_SC_CONTROLLER_PATH_ERROR, "CONTROLLER PATH ERROR" },
+ { SPDK_NVME_SC_HOST_PATH_ERROR, "HOST PATH ERROR" },
+ { SPDK_NVME_SC_ABORTED_BY_HOST, "ABORTED BY HOST" },
+ { 0xFFFF, "PATH ERROR" }
+};
+
+const char *
+spdk_nvme_cpl_get_status_string(const struct spdk_nvme_status *status)
+{
+ const struct nvme_string *entry;
+
+ switch (status->sct) {
+ case SPDK_NVME_SCT_GENERIC:
+ entry = generic_status;
+ break;
+ case SPDK_NVME_SCT_COMMAND_SPECIFIC:
+ entry = command_specific_status;
+ break;
+ case SPDK_NVME_SCT_MEDIA_ERROR:
+ entry = media_error_status;
+ break;
+ case SPDK_NVME_SCT_PATH:
+ entry = path_status;
+ break;
+ case SPDK_NVME_SCT_VENDOR_SPECIFIC:
+ return "VENDOR SPECIFIC";
+ default:
+ return "RESERVED";
+ }
+
+ return nvme_get_string(entry, status->sc);
+}
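+
+/*
+ * Illustrative usage sketch (not additional logic in this file): an
+ * application's I/O completion callback could translate the status into a
+ * readable string. The callback name and context type are hypothetical; the
+ * helpers are the public spdk_nvme_cpl_is_error() and
+ * spdk_nvme_cpl_get_status_string() APIs.
+ *
+ *   static void
+ *   example_io_done(void *ctx, const struct spdk_nvme_cpl *cpl)
+ *   {
+ *           if (spdk_nvme_cpl_is_error(cpl)) {
+ *                   fprintf(stderr, "I/O failed: %s (sct %d, sc %d)\n",
+ *                           spdk_nvme_cpl_get_status_string(&cpl->status),
+ *                           cpl->status.sct, cpl->status.sc);
+ *           }
+ *   }
+ */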
+
+void
+spdk_nvme_print_completion(uint16_t qid, struct spdk_nvme_cpl *cpl)
+{
+ assert(cpl != NULL);
+
+ /* Check that sqid matches qid. Note that sqid is reserved
+ * for fabrics so don't print an error when sqid is 0. */
+ if (cpl->sqid != qid && cpl->sqid != 0) {
+ SPDK_ERRLOG("sqid %u doesn't match qid\n", cpl->sqid);
+ }
+
+ SPDK_NOTICELOG("%s (%02x/%02x) qid:%d cid:%d cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n",
+ spdk_nvme_cpl_get_status_string(&cpl->status),
+ cpl->status.sct, cpl->status.sc, qid, cpl->cid, cpl->cdw0,
+ cpl->sqhd, cpl->status.p, cpl->status.m, cpl->status.dnr);
+}
+
+void
+spdk_nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cpl *cpl)
+{
+ spdk_nvme_print_completion(qpair->id, cpl);
+}
+
+bool
+nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl)
+{
+ /*
+ * TODO: The spec is not clear on how commands aborted due to TLER
+ * will be marked, so for now NAMESPACE_NOT_READY seems to be the
+ * only case where we should look at the DNR bit.
+ */
+ switch ((int)cpl->status.sct) {
+ case SPDK_NVME_SCT_GENERIC:
+ switch ((int)cpl->status.sc) {
+ case SPDK_NVME_SC_NAMESPACE_NOT_READY:
+ case SPDK_NVME_SC_FORMAT_IN_PROGRESS:
+ if (cpl->status.dnr) {
+ return false;
+ } else {
+ return true;
+ }
+ case SPDK_NVME_SC_INVALID_OPCODE:
+ case SPDK_NVME_SC_INVALID_FIELD:
+ case SPDK_NVME_SC_COMMAND_ID_CONFLICT:
+ case SPDK_NVME_SC_DATA_TRANSFER_ERROR:
+ case SPDK_NVME_SC_ABORTED_POWER_LOSS:
+ case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR:
+ case SPDK_NVME_SC_ABORTED_BY_REQUEST:
+ case SPDK_NVME_SC_ABORTED_SQ_DELETION:
+ case SPDK_NVME_SC_ABORTED_FAILED_FUSED:
+ case SPDK_NVME_SC_ABORTED_MISSING_FUSED:
+ case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
+ case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR:
+ case SPDK_NVME_SC_LBA_OUT_OF_RANGE:
+ case SPDK_NVME_SC_CAPACITY_EXCEEDED:
+ default:
+ return false;
+ }
+ case SPDK_NVME_SCT_PATH:
+ /*
+ * Per NVMe TP 4028 (Path and Transport Error Enhancements), retries should be
+ * based on the setting of the DNR bit for Internal Path Error
+ */
+ switch ((int)cpl->status.sc) {
+ case SPDK_NVME_SC_INTERNAL_PATH_ERROR:
+ return !cpl->status.dnr;
+ default:
+ return false;
+ }
+ case SPDK_NVME_SCT_COMMAND_SPECIFIC:
+ case SPDK_NVME_SCT_MEDIA_ERROR:
+ case SPDK_NVME_SCT_VENDOR_SPECIFIC:
+ default:
+ return false;
+ }
+}
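+
+/*
+ * Examples of the decision above (descriptive only, no additional logic):
+ *   - GENERIC / NAMESPACE_NOT_READY or FORMAT_IN_PROGRESS with DNR=0 -> retried.
+ *   - The same statuses with DNR=1 -> not retried.
+ *   - PATH / INTERNAL_PATH_ERROR -> retried only when DNR=0.
+ *   - Any command-specific, media error, or vendor-specific status -> not retried.
+ */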
+
+static void
+nvme_qpair_manual_complete_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req, uint32_t sct, uint32_t sc,
+ uint32_t dnr, bool print_on_error)
+{
+ struct spdk_nvme_cpl cpl;
+ bool error;
+
+ memset(&cpl, 0, sizeof(cpl));
+ cpl.sqid = qpair->id;
+ cpl.status.sct = sct;
+ cpl.status.sc = sc;
+ cpl.status.dnr = dnr;
+
+ error = spdk_nvme_cpl_is_error(&cpl);
+
+ if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) {
+ SPDK_NOTICELOG("Command completed manually:\n");
+ spdk_nvme_qpair_print_command(qpair, &req->cmd);
+ spdk_nvme_qpair_print_completion(qpair, &cpl);
+ }
+
+ nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &cpl);
+ nvme_free_request(req);
+}
+
+static void
+_nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
+{
+ struct nvme_request *req;
+
+ while (!STAILQ_EMPTY(&qpair->queued_req)) {
+ req = STAILQ_FIRST(&qpair->queued_req);
+ STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+ if (!qpair->ctrlr->opts.disable_error_logging) {
+ SPDK_ERRLOG("aborting queued i/o\n");
+ }
+ nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
+ }
+}
+
+/* A request's completion callback may submit the next queued request, and that
+ * same callback may then abort it immediately; this pattern could recurse without
+ * bound. Hence, move requests being aborted onto a separate list here and
+ * complete them later, during resubmission.
+ */
+static void
+_nvme_qpair_complete_abort_queued_reqs(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_request *req;
+
+ while (!STAILQ_EMPTY(&qpair->aborting_queued_req)) {
+ req = STAILQ_FIRST(&qpair->aborting_queued_req);
+ STAILQ_REMOVE_HEAD(&qpair->aborting_queued_req, stailq);
+ nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_ABORTED_BY_REQUEST, 1, true);
+ }
+}
+
+uint32_t
+nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg)
+{
+ struct nvme_request *req, *tmp;
+ uint32_t aborting = 0;
+
+ STAILQ_FOREACH_SAFE(req, &qpair->queued_req, stailq, tmp) {
+ if (req->cb_arg == cmd_cb_arg) {
+ STAILQ_REMOVE(&qpair->queued_req, req, nvme_request, stailq);
+ STAILQ_INSERT_TAIL(&qpair->aborting_queued_req, req, stailq);
+ if (!qpair->ctrlr->opts.disable_error_logging) {
+ SPDK_ERRLOG("aborting queued i/o\n");
+ }
+ aborting++;
+ }
+ }
+
+ return aborting;
+}
+
+static inline bool
+nvme_qpair_check_enabled(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_request *req;
+
+ /*
+ * Either during initial connect or reset, the qpair should follow this state machine:
+ * QPAIR_DISABLED->QPAIR_CONNECTING->QPAIR_CONNECTED->QPAIR_ENABLING->QPAIR_ENABLED. In the
+ * reset case, once the qpair is properly connected, we need to abort any outstanding requests
+ * from the old transport connection and encourage the application to retry them. We also need
+ * to submit any queued requests that built up while we were in the connected or enabling state.
+ */
+ if (nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTED && !qpair->ctrlr->is_resetting) {
+ nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLING);
+ /*
+ * PCIe is special: for fabrics transports we can abort requests before the disconnect during
+ * reset, but PCIe qpairs have historically not been disconnected during reset, so we have to
+ * abort their requests here.
+ */
+ if (qpair->ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ nvme_qpair_abort_reqs(qpair, 0);
+ }
+ nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLED);
+ while (!STAILQ_EMPTY(&qpair->queued_req)) {
+ req = STAILQ_FIRST(&qpair->queued_req);
+ STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+ if (nvme_qpair_resubmit_request(qpair, req)) {
+ break;
+ }
+ }
+ }
+
+ /*
+ * When doing a reset, we must disconnect the qpair on the proper core.
+ * Note, reset is the only case where we set the failure reason without
+ * setting the qpair state since reset is done at the generic layer on the
+ * controller thread and we can't disconnect I/O qpairs from the controller
+ * thread.
+ */
+ if (qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE &&
+ nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED) {
+ /* Don't disconnect PCIe qpairs. They are a special case for reset. */
+ if (qpair->ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ nvme_ctrlr_disconnect_qpair(qpair);
+ }
+ return false;
+ }
+
+ return nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED;
+}
+
+void
+nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests)
+{
+ uint32_t i;
+ int resubmit_rc;
+ struct nvme_request *req;
+
+ for (i = 0; i < num_requests; i++) {
+ if (qpair->ctrlr->is_resetting) {
+ break;
+ }
+ if ((req = STAILQ_FIRST(&qpair->queued_req)) == NULL) {
+ break;
+ }
+ STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+ resubmit_rc = nvme_qpair_resubmit_request(qpair, req);
+ if (spdk_unlikely(resubmit_rc != 0)) {
+ SPDK_ERRLOG("Unable to resubmit as many requests as we completed.\n");
+ break;
+ }
+ }
+
+ _nvme_qpair_complete_abort_queued_reqs(qpair);
+}
+
+int32_t
+spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
+{
+ int32_t ret;
+ struct nvme_request *req, *tmp;
+
+ if (spdk_unlikely(qpair->ctrlr->is_failed)) {
+ if (qpair->ctrlr->is_removed) {
+ nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING);
+ nvme_qpair_abort_reqs(qpair, 1 /* Do not retry */);
+ }
+ return -ENXIO;
+ }
+
+ if (spdk_unlikely(!nvme_qpair_check_enabled(qpair) &&
+ !(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING))) {
+ /*
+ * qpair is not enabled, likely because a controller reset is
+ * in progress.
+ */
+ return -ENXIO;
+ }
+
+ /* Complete queued error-injection requests whose timeout has expired. */
+ if (spdk_unlikely(!STAILQ_EMPTY(&qpair->err_req_head))) {
+ STAILQ_FOREACH_SAFE(req, &qpair->err_req_head, stailq, tmp) {
+ if (spdk_get_ticks() - req->submit_tick > req->timeout_tsc) {
+ STAILQ_REMOVE(&qpair->err_req_head, req, nvme_request, stailq);
+ nvme_qpair_manual_complete_request(qpair, req,
+ req->cpl.status.sct,
+ req->cpl.status.sc, 0, true);
+ }
+ }
+ }
+
+ qpair->in_completion_context = 1;
+ ret = nvme_transport_qpair_process_completions(qpair, max_completions);
+ if (ret < 0) {
+ SPDK_ERRLOG("CQ error, abort requests after transport retry counter exceeded\n");
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvme_ctrlr_fail(qpair->ctrlr, false);
+ }
+ }
+ qpair->in_completion_context = 0;
+ if (qpair->delete_after_completion_context) {
+ /*
+ * A request to delete this qpair was made in the context of this completion
+ * routine - so it is safe to delete it now.
+ */
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ return ret;
+ }
+
+ /*
+ * At this point, ret must represent the number of completions we reaped.
+ * Submit as many queued requests as we completed.
+ */
+ nvme_qpair_resubmit_requests(qpair, ret);
+
+ return ret;
+}
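+
+/*
+ * Illustrative usage sketch (not additional logic in this file): a typical
+ * caller polls the qpair until its completion callback fires. Passing 0 as
+ * max_completions lets the transport reap as many completions as it can in
+ * one pass; "done" is a hypothetical flag set by the callback.
+ *
+ *   while (!done) {
+ *           int32_t rc = spdk_nvme_qpair_process_completions(qpair, 0);
+ *
+ *           if (rc < 0) {
+ *                   break;   // e.g. -ENXIO: qpair failed or reset in progress
+ *           }
+ *   }
+ */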
+
+spdk_nvme_qp_failure_reason
+spdk_nvme_qpair_get_failure_reason(struct spdk_nvme_qpair *qpair)
+{
+ return qpair->transport_failure_reason;
+}
+
+int
+nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id,
+ struct spdk_nvme_ctrlr *ctrlr,
+ enum spdk_nvme_qprio qprio,
+ uint32_t num_requests)
+{
+ size_t req_size_padded;
+ uint32_t i;
+
+ qpair->id = id;
+ qpair->qprio = qprio;
+
+ qpair->in_completion_context = 0;
+ qpair->delete_after_completion_context = 0;
+ qpair->no_deletion_notification_needed = 0;
+
+ qpair->ctrlr = ctrlr;
+ qpair->trtype = ctrlr->trid.trtype;
+
+ STAILQ_INIT(&qpair->free_req);
+ STAILQ_INIT(&qpair->queued_req);
+ STAILQ_INIT(&qpair->aborting_queued_req);
+ TAILQ_INIT(&qpair->err_cmd_head);
+ STAILQ_INIT(&qpair->err_req_head);
+
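+ /* Round the request size up to a multiple of 64 bytes so each nvme_request in
+ * req_buf starts cache-line aligned (req_buf itself is 64-byte aligned below).
+ */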
+ req_size_padded = (sizeof(struct nvme_request) + 63) & ~(size_t)63;
+
+ qpair->req_buf = spdk_zmalloc(req_size_padded * num_requests, 64, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (qpair->req_buf == NULL) {
+ SPDK_ERRLOG("no memory to allocate qpair(cntlid:0x%x sqid:%d) req_buf with %d request\n",
+ ctrlr->cntlid, qpair->id, num_requests);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < num_requests; i++) {
+ struct nvme_request *req = qpair->req_buf + i * req_size_padded;
+
+ req->qpair = qpair;
+ STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq);
+ }
+
+ return 0;
+}
+
+void
+nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_request *req;
+
+ while (!STAILQ_EMPTY(&qpair->err_req_head)) {
+ req = STAILQ_FIRST(&qpair->err_req_head);
+ STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq);
+ nvme_qpair_manual_complete_request(qpair, req,
+ req->cpl.status.sct,
+ req->cpl.status.sc, 0, true);
+ }
+}
+
+void
+nvme_qpair_deinit(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_error_cmd *cmd, *entry;
+
+ _nvme_qpair_abort_queued_reqs(qpair, 1);
+ _nvme_qpair_complete_abort_queued_reqs(qpair);
+ nvme_qpair_complete_error_reqs(qpair);
+
+ TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) {
+ TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link);
+ spdk_free(cmd);
+ }
+
+ spdk_free(qpair->req_buf);
+}
+
+static inline int
+_nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ int rc = 0;
+ struct nvme_request *child_req, *tmp;
+ struct nvme_error_cmd *cmd;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ bool child_req_failed = false;
+
+ nvme_qpair_check_enabled(qpair);
+
+ if (req->num_children) {
+ /*
+ * This is a split (parent) request. Submit all of the children but not the parent
+ * request itself, since the parent is the original unsplit request.
+ */
+ TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
+ if (spdk_likely(!child_req_failed)) {
+ rc = nvme_qpair_submit_request(qpair, child_req);
+ if (spdk_unlikely(rc != 0)) {
+ child_req_failed = true;
+ }
+ } else { /* free the remaining child_reqs since one child_req failed */
+ nvme_request_remove_child(req, child_req);
+ nvme_request_free_children(child_req);
+ nvme_free_request(child_req);
+ }
+ }
+
+ if (spdk_unlikely(child_req_failed)) {
+ /* Some of the child requests have already been submitted, so return
+ * success; we must wait for those children to complete. Mark the parent
+ * request as failed so it completes with an error.
+ */
+ if (req->num_children) {
+ req->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return 0;
+ }
+ goto error;
+ }
+
+ return rc;
+ }
+
+ /* Queue requests whose opcode matches an entry in the err_cmd list. */
+ if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head))) {
+ TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) {
+ if (!cmd->do_not_submit) {
+ continue;
+ }
+
+ if ((cmd->opc == req->cmd.opc) && cmd->err_count) {
+ /* add to error request list and set cpl */
+ req->timeout_tsc = cmd->timeout_tsc;
+ req->submit_tick = spdk_get_ticks();
+ req->cpl.status.sct = cmd->status.sct;
+ req->cpl.status.sc = cmd->status.sc;
+ STAILQ_INSERT_TAIL(&qpair->err_req_head, req, stailq);
+ cmd->err_count--;
+ return 0;
+ }
+ }
+ }
+
+ if (spdk_unlikely(ctrlr->is_failed)) {
+ rc = -ENXIO;
+ goto error;
+ }
+
+ /* assign submit_tick before submitting req to specific transport */
+ if (spdk_unlikely(ctrlr->timeout_enabled)) {
+ if (req->submit_tick == 0) { /* req submitted for the first time */
+ req->submit_tick = spdk_get_ticks();
+ req->timed_out = false;
+ }
+ } else {
+ req->submit_tick = 0;
+ }
+
+ /* Allow two cases:
+ * 1. NVMe qpair is enabled.
+ * 2. Always allow fabrics commands through - these get
+ * the controller out of reset state.
+ */
+ if (spdk_likely(nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED) ||
+ (req->cmd.opc == SPDK_NVME_OPC_FABRIC &&
+ nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) {
+ rc = nvme_transport_qpair_submit_request(qpair, req);
+ } else {
+ /* The controller is being reset - queue this request and
+ * submit it later when the reset is completed.
+ */
+ return -EAGAIN;
+ }
+
+ if (spdk_likely(rc == 0)) {
+ req->queued = false;
+ return 0;
+ }
+
+ if (rc == -EAGAIN) {
+ return -EAGAIN;
+ }
+
+error:
+ if (req->parent != NULL) {
+ nvme_request_remove_child(req->parent, req);
+ }
+
+ /* The request came from the queued_req list, so complete it manually to trigger its callback. */
+ if (spdk_unlikely(req->queued)) {
+ nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, true, true);
+ return rc;
+ }
+
+ nvme_free_request(req);
+
+ return rc;
+}
+
+int
+nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ int rc;
+
+ /* This prevents us from entering an infinite loop when freeing queued I/O in disconnect. */
+ if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING ||
+ nvme_qpair_get_state(qpair) == NVME_QPAIR_DESTROYING)) {
+ if (req->parent != NULL) {
+ nvme_request_remove_child(req->parent, req);
+ }
+ nvme_free_request(req);
+ return -ENXIO;
+ }
+
+ if (spdk_unlikely(!STAILQ_EMPTY(&qpair->queued_req) && req->num_children == 0)) {
+ /*
+ * Requests that have no children should be sent to the transport after all
+ * currently queued requests. Requests with children will be split and go back
+ * through this path.
+ */
+ STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
+ req->queued = true;
+ return 0;
+ }
+
+ rc = _nvme_qpair_submit_request(qpair, req);
+ if (rc == -EAGAIN) {
+ STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
+ req->queued = true;
+ rc = 0;
+ }
+
+ return rc;
+}
+
+static int
+nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ int rc;
+
+ /*
+ * We should never have a request with children on the queue.
+ * This is necessary to preserve the 1:1 relationship between
+ * completions and resubmissions.
+ */
+ assert(req->num_children == 0);
+ assert(req->queued);
+ rc = _nvme_qpair_submit_request(qpair, req);
+ if (spdk_unlikely(rc == -EAGAIN)) {
+ STAILQ_INSERT_HEAD(&qpair->queued_req, req, stailq);
+ }
+
+ return rc;
+}
+
+void
+nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
+{
+ nvme_qpair_complete_error_reqs(qpair);
+ _nvme_qpair_abort_queued_reqs(qpair, dnr);
+ _nvme_qpair_complete_abort_queued_reqs(qpair);
+ nvme_transport_qpair_abort_reqs(qpair, dnr);
+}
+
+int
+spdk_nvme_qpair_add_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ uint8_t opc, bool do_not_submit,
+ uint64_t timeout_in_us,
+ uint32_t err_count,
+ uint8_t sct, uint8_t sc)
+{
+ struct nvme_error_cmd *entry, *cmd = NULL;
+
+ if (qpair == NULL) {
+ qpair = ctrlr->adminq;
+ }
+
+ TAILQ_FOREACH(entry, &qpair->err_cmd_head, link) {
+ if (entry->opc == opc) {
+ cmd = entry;
+ break;
+ }
+ }
+
+ if (cmd == NULL) {
+ cmd = spdk_zmalloc(sizeof(*cmd), 64, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (!cmd) {
+ return -ENOMEM;
+ }
+ TAILQ_INSERT_TAIL(&qpair->err_cmd_head, cmd, link);
+ }
+
+ cmd->do_not_submit = do_not_submit;
+ cmd->err_count = err_count;
+ cmd->timeout_tsc = timeout_in_us * spdk_get_ticks_hz() / 1000000ULL;
+ cmd->opc = opc;
+ cmd->status.sct = sct;
+ cmd->status.sc = sc;
+
+ return 0;
+}
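+
+/*
+ * Illustrative usage sketch (not additional logic in this file): make the
+ * next READ on an I/O qpair fail locally with a media error, then remove the
+ * injection. The ctrlr/io_qpair variables and the chosen values are arbitrary
+ * examples.
+ *
+ *   spdk_nvme_qpair_add_cmd_error_injection(ctrlr, io_qpair,
+ *                                           SPDK_NVME_OPC_READ,
+ *                                           true,  // do not submit; complete locally
+ *                                           0,     // complete on the next poll
+ *                                           1,     // inject a single failure
+ *                                           SPDK_NVME_SCT_MEDIA_ERROR,
+ *                                           SPDK_NVME_SC_UNRECOVERED_READ_ERROR);
+ *   ...
+ *   spdk_nvme_qpair_remove_cmd_error_injection(ctrlr, io_qpair, SPDK_NVME_OPC_READ);
+ */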
+
+void
+spdk_nvme_qpair_remove_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ uint8_t opc)
+{
+ struct nvme_error_cmd *cmd, *entry;
+
+ if (qpair == NULL) {
+ qpair = ctrlr->adminq;
+ }
+
+ TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) {
+ if (cmd->opc == opc) {
+ TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link);
+ spdk_free(cmd);
+ return;
+ }
+ }
+
+ return;
+}
diff --git a/src/spdk/lib/nvme/nvme_quirks.c b/src/spdk/lib/nvme/nvme_quirks.c
new file mode 100644
index 000000000..38c8f0eae
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_quirks.c
@@ -0,0 +1,155 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+struct nvme_quirk {
+ struct spdk_pci_id id;
+ uint64_t flags;
+};
+
+static const struct nvme_quirk nvme_quirks[] = {
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0953, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_INTEL_QUIRK_READ_LATENCY |
+ NVME_INTEL_QUIRK_WRITE_LATENCY |
+ NVME_INTEL_QUIRK_STRIPING |
+ NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE |
+ NVME_QUIRK_DELAY_BEFORE_INIT |
+ NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A53, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_INTEL_QUIRK_READ_LATENCY |
+ NVME_INTEL_QUIRK_WRITE_LATENCY |
+ NVME_INTEL_QUIRK_STRIPING |
+ NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE |
+ NVME_QUIRK_DELAY_BEFORE_INIT |
+ NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A54, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_INTEL_QUIRK_READ_LATENCY |
+ NVME_INTEL_QUIRK_WRITE_LATENCY |
+ NVME_INTEL_QUIRK_STRIPING |
+ NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE |
+ NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A55, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_INTEL_QUIRK_READ_LATENCY |
+ NVME_INTEL_QUIRK_WRITE_LATENCY |
+ NVME_INTEL_QUIRK_STRIPING |
+ NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE |
+ NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_MEMBLAZE, 0x0540, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_DELAY_BEFORE_CHK_RDY
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa821, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_DELAY_BEFORE_CHK_RDY
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa822, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_DELAY_BEFORE_CHK_RDY
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VIRTUALBOX, 0x4e56, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x5845, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_IDENTIFY_CNS |
+ NVME_INTEL_QUIRK_NO_LOG_PAGES |
+ NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_CNEXLABS, 0x1f1f, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_IDENTIFY_CNS |
+ NVME_QUIRK_OCSSD
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VMWARE, 0x07f0, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_SHST_COMPLETE
+ },
+ { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x2700, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_OACS_SECURITY
+ },
+ { {0x000000, 0x0000, 0x0000, 0x0000, 0x0000}, 0}
+};
+
+/* Compare each field. SPDK_PCI_ANY_ID in s1 matches everything */
+static bool
+pci_id_match(const struct spdk_pci_id *s1, const struct spdk_pci_id *s2)
+{
+ if ((s1->class_id == SPDK_PCI_CLASS_ANY_ID || s1->class_id == s2->class_id) &&
+ (s1->vendor_id == SPDK_PCI_ANY_ID || s1->vendor_id == s2->vendor_id) &&
+ (s1->device_id == SPDK_PCI_ANY_ID || s1->device_id == s2->device_id) &&
+ (s1->subvendor_id == SPDK_PCI_ANY_ID || s1->subvendor_id == s2->subvendor_id) &&
+ (s1->subdevice_id == SPDK_PCI_ANY_ID || s1->subdevice_id == s2->subdevice_id)) {
+ return true;
+ }
+ return false;
+}
+
+uint64_t
+nvme_get_quirks(const struct spdk_pci_id *id)
+{
+ const struct nvme_quirk *quirk = nvme_quirks;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Searching for %04x:%04x [%04x:%04x]...\n",
+ id->vendor_id, id->device_id,
+ id->subvendor_id, id->subdevice_id);
+
+ while (quirk->id.vendor_id) {
+ if (pci_id_match(&quirk->id, id)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Matched quirk %04x:%04x [%04x:%04x]:\n",
+ quirk->id.vendor_id, quirk->id.device_id,
+ quirk->id.subvendor_id, quirk->id.subdevice_id);
+
+#define PRINT_QUIRK(quirk_flag) \
+ do { \
+ if (quirk->flags & (quirk_flag)) { \
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Quirk enabled: %s\n", #quirk_flag); \
+ } \
+ } while (0)
+
+ PRINT_QUIRK(NVME_INTEL_QUIRK_READ_LATENCY);
+ PRINT_QUIRK(NVME_INTEL_QUIRK_WRITE_LATENCY);
+ PRINT_QUIRK(NVME_QUIRK_DELAY_BEFORE_CHK_RDY);
+ PRINT_QUIRK(NVME_INTEL_QUIRK_STRIPING);
+ PRINT_QUIRK(NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC);
+ PRINT_QUIRK(NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE);
+ PRINT_QUIRK(NVME_QUIRK_IDENTIFY_CNS);
+ PRINT_QUIRK(NVME_QUIRK_OCSSD);
+
+ return quirk->flags;
+ }
+ quirk++;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "No quirks found.\n");
+
+ return 0;
+}
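+
+/*
+ * Illustrative usage sketch (not additional logic in this file): how a
+ * transport might consult the quirk table for an attached device. "dev" is a
+ * hypothetical struct spdk_pci_device pointer.
+ *
+ *   struct spdk_pci_id pci_id = spdk_pci_device_get_id(dev);
+ *   uint64_t quirks = nvme_get_quirks(&pci_id);
+ *
+ *   if (quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) {
+ *           // wait before polling CSTS.RDY after toggling CC.EN
+ *   }
+ */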
diff --git a/src/spdk/lib/nvme/nvme_rdma.c b/src/spdk/lib/nvme/nvme_rdma.c
new file mode 100644
index 000000000..84537c4a1
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_rdma.c
@@ -0,0 +1,2852 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe over RDMA transport
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/assert.h"
+#include "spdk/log.h"
+#include "spdk/trace.h"
+#include "spdk/queue.h"
+#include "spdk/nvme.h"
+#include "spdk/nvmf_spec.h"
+#include "spdk/string.h"
+#include "spdk/endian.h"
+#include "spdk/likely.h"
+#include "spdk/config.h"
+
+#include "nvme_internal.h"
+#include "spdk_internal/rdma.h"
+
+#define NVME_RDMA_TIME_OUT_IN_MS 2000
+#define NVME_RDMA_RW_BUFFER_SIZE 131072
+
+/*
+ * NVME RDMA qpair Resource Defaults
+ */
+#define NVME_RDMA_DEFAULT_TX_SGE 2
+#define NVME_RDMA_DEFAULT_RX_SGE 1
+
+/* Max number of NVMe-oF SGL descriptors supported by the host */
+#define NVME_RDMA_MAX_SGL_DESCRIPTORS 16
+
+/* number of STAILQ entries for holding pending RDMA CM events. */
+#define NVME_RDMA_NUM_CM_EVENTS 256
+
+/* CM event processing timeout */
+#define NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US 1000000
+
+/* The default size for a shared rdma completion queue. */
+#define DEFAULT_NVME_RDMA_CQ_SIZE 4096
+
+/*
+ * In the special case of a stale connection we don't expose a mechanism
+ * for the user to retry the connection so we need to handle it internally.
+ */
+#define NVME_RDMA_STALE_CONN_RETRY_MAX 5
+#define NVME_RDMA_STALE_CONN_RETRY_DELAY_US 10000
+
+/*
+ * Maximum value of transport_retry_count used by RDMA controller
+ */
+#define NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT 7
+
+/*
+ * Maximum value of transport_ack_timeout used by RDMA controller
+ */
+#define NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT 31
+
+/*
+ * Number of poller cycles to keep a pointer to destroyed qpairs
+ * in the poll group.
+ */
+#define NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES 50
+
+/*
+ * The max length of a keyed SGL data block; the length field is 3 bytes, so the limit is (1u << 24) - 1 bytes.
+ */
+#define NVME_RDMA_MAX_KEYED_SGL_LENGTH ((1u << 24u) - 1)
+
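+/* Each queue entry can have both a send and a receive work completion outstanding,
+ * hence twice the queue depth worth of WC slots per qpair.
+ */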
+#define WC_PER_QPAIR(queue_depth) ((queue_depth) * 2)
+
+enum nvme_rdma_wr_type {
+ RDMA_WR_TYPE_RECV,
+ RDMA_WR_TYPE_SEND,
+};
+
+struct nvme_rdma_wr {
+ /* Using this instead of the enum allows this struct to only occupy one byte. */
+ uint8_t type;
+};
+
+struct spdk_nvmf_cmd {
+ struct spdk_nvme_cmd cmd;
+ struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS];
+};
+
+struct spdk_nvme_rdma_hooks g_nvme_hooks = {};
+
+/* Mapping from virtual address to ibv_mr pointer for a protection domain */
+struct spdk_nvme_rdma_mr_map {
+ struct ibv_pd *pd;
+ struct spdk_mem_map *map;
+ uint64_t ref;
+ LIST_ENTRY(spdk_nvme_rdma_mr_map) link;
+};
+
+/* STAILQ wrapper for cm events. */
+struct nvme_rdma_cm_event_entry {
+ struct rdma_cm_event *evt;
+ STAILQ_ENTRY(nvme_rdma_cm_event_entry) link;
+};
+
+/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */
+struct nvme_rdma_ctrlr {
+ struct spdk_nvme_ctrlr ctrlr;
+
+ struct ibv_pd *pd;
+
+ uint16_t max_sge;
+
+ struct rdma_event_channel *cm_channel;
+
+ STAILQ_HEAD(, nvme_rdma_cm_event_entry) pending_cm_events;
+
+ STAILQ_HEAD(, nvme_rdma_cm_event_entry) free_cm_events;
+
+ struct nvme_rdma_cm_event_entry *cm_events;
+};
+
+struct nvme_rdma_destroyed_qpair {
+ struct nvme_rdma_qpair *destroyed_qpair_tracker;
+ uint32_t completed_cycles;
+ STAILQ_ENTRY(nvme_rdma_destroyed_qpair) link;
+};
+
+struct nvme_rdma_poller {
+ struct ibv_context *device;
+ struct ibv_cq *cq;
+ int required_num_wc;
+ int current_num_wc;
+ STAILQ_ENTRY(nvme_rdma_poller) link;
+};
+
+struct nvme_rdma_poll_group {
+ struct spdk_nvme_transport_poll_group group;
+ STAILQ_HEAD(, nvme_rdma_poller) pollers;
+ int num_pollers;
+ STAILQ_HEAD(, nvme_rdma_destroyed_qpair) destroyed_qpairs;
+};
+
+struct spdk_nvme_send_wr_list {
+ struct ibv_send_wr *first;
+ struct ibv_send_wr *last;
+};
+
+struct spdk_nvme_recv_wr_list {
+ struct ibv_recv_wr *first;
+ struct ibv_recv_wr *last;
+};
+
+/* Memory regions */
+union nvme_rdma_mr {
+ struct ibv_mr *mr;
+ uint64_t key;
+};
+
+/* NVMe RDMA qpair extensions for spdk_nvme_qpair */
+struct nvme_rdma_qpair {
+ struct spdk_nvme_qpair qpair;
+
+ struct spdk_rdma_qp *rdma_qp;
+ struct rdma_cm_id *cm_id;
+ struct ibv_cq *cq;
+
+ struct spdk_nvme_rdma_req *rdma_reqs;
+
+ uint32_t max_send_sge;
+
+ uint32_t max_recv_sge;
+
+ uint16_t num_entries;
+
+ bool delay_cmd_submit;
+
+ bool poll_group_disconnect_in_progress;
+
+ uint32_t num_completions;
+
+ /* Parallel arrays of response buffers + response SGLs of size num_entries */
+ struct ibv_sge *rsp_sgls;
+ struct spdk_nvme_rdma_rsp *rsps;
+
+ struct ibv_recv_wr *rsp_recv_wrs;
+
+ struct spdk_nvme_send_wr_list sends_to_post;
+ struct spdk_nvme_recv_wr_list recvs_to_post;
+
+ /* Memory region describing all rsps for this qpair */
+ union nvme_rdma_mr rsp_mr;
+
+ /*
+ * Array of num_entries NVMe commands registered as RDMA message buffers.
+ * Indexed by rdma_req->id.
+ */
+ struct spdk_nvmf_cmd *cmds;
+
+ /* Memory region describing all cmds for this qpair */
+ union nvme_rdma_mr cmd_mr;
+
+ struct spdk_nvme_rdma_mr_map *mr_map;
+
+ TAILQ_HEAD(, spdk_nvme_rdma_req) free_reqs;
+ TAILQ_HEAD(, spdk_nvme_rdma_req) outstanding_reqs;
+
+ /* Counts of outstanding send and recv objects */
+ uint16_t current_num_recvs;
+ uint16_t current_num_sends;
+
+ /* Placed at the end of the struct since it is not used frequently */
+ struct rdma_cm_event *evt;
+
+ /* Used by poll group to keep the qpair around until it is ready to remove it. */
+ bool defer_deletion_to_pg;
+};
+
+enum NVME_RDMA_COMPLETION_FLAGS {
+ NVME_RDMA_SEND_COMPLETED = 1u << 0,
+ NVME_RDMA_RECV_COMPLETED = 1u << 1,
+};
+
+struct spdk_nvme_rdma_req {
+ uint16_t id;
+ uint16_t completion_flags: 2;
+ uint16_t reserved: 14;
+ /* If the RDMA_RECV completion arrives before the RDMA_SEND completion, the nvme request
+ * is completed while processing RDMA_SEND. To complete the request we must know the index
+ * of the nvme_cpl received in RDMA_RECV, so store it in this field */
+ uint16_t rsp_idx;
+
+ struct nvme_rdma_wr rdma_wr;
+
+ struct ibv_send_wr send_wr;
+
+ struct nvme_request *req;
+
+ struct ibv_sge send_sgl[NVME_RDMA_DEFAULT_TX_SGE];
+
+ TAILQ_ENTRY(spdk_nvme_rdma_req) link;
+};
+
+enum nvme_rdma_key_type {
+ NVME_RDMA_MR_RKEY,
+ NVME_RDMA_MR_LKEY
+};
+
+struct spdk_nvme_rdma_rsp {
+ struct spdk_nvme_cpl cpl;
+ struct nvme_rdma_qpair *rqpair;
+ uint16_t idx;
+ struct nvme_rdma_wr rdma_wr;
+};
+
+static const char *rdma_cm_event_str[] = {
+ "RDMA_CM_EVENT_ADDR_RESOLVED",
+ "RDMA_CM_EVENT_ADDR_ERROR",
+ "RDMA_CM_EVENT_ROUTE_RESOLVED",
+ "RDMA_CM_EVENT_ROUTE_ERROR",
+ "RDMA_CM_EVENT_CONNECT_REQUEST",
+ "RDMA_CM_EVENT_CONNECT_RESPONSE",
+ "RDMA_CM_EVENT_CONNECT_ERROR",
+ "RDMA_CM_EVENT_UNREACHABLE",
+ "RDMA_CM_EVENT_REJECTED",
+ "RDMA_CM_EVENT_ESTABLISHED",
+ "RDMA_CM_EVENT_DISCONNECTED",
+ "RDMA_CM_EVENT_DEVICE_REMOVAL",
+ "RDMA_CM_EVENT_MULTICAST_JOIN",
+ "RDMA_CM_EVENT_MULTICAST_ERROR",
+ "RDMA_CM_EVENT_ADDR_CHANGE",
+ "RDMA_CM_EVENT_TIMEWAIT_EXIT"
+};
+
+static LIST_HEAD(, spdk_nvme_rdma_mr_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps);
+static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;
+struct nvme_rdma_qpair *nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group,
+ uint32_t qp_num);
+
+static inline void *
+nvme_rdma_calloc(size_t nmemb, size_t size)
+{
+ if (!g_nvme_hooks.get_rkey) {
+ return calloc(nmemb, size);
+ } else {
+ return spdk_zmalloc(nmemb * size, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ }
+}
+
+static inline void
+nvme_rdma_free(void *buf)
+{
+ if (!g_nvme_hooks.get_rkey) {
+ free(buf);
+ } else {
+ spdk_free(buf);
+ }
+}
+
+static int nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair);
+
+static inline struct nvme_rdma_qpair *
+nvme_rdma_qpair(struct spdk_nvme_qpair *qpair)
+{
+ assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA);
+ return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair);
+}
+
+static inline struct nvme_rdma_poll_group *
+nvme_rdma_poll_group(struct spdk_nvme_transport_poll_group *group)
+{
+ return (SPDK_CONTAINEROF(group, struct nvme_rdma_poll_group, group));
+}
+
+static inline struct nvme_rdma_ctrlr *
+nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
+{
+ assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA);
+ return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr);
+}
+
+static struct spdk_nvme_rdma_req *
+nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair)
+{
+ struct spdk_nvme_rdma_req *rdma_req;
+
+ rdma_req = TAILQ_FIRST(&rqpair->free_reqs);
+ if (rdma_req) {
+ TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link);
+ TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link);
+ }
+
+ return rdma_req;
+}
+
+static void
+nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req)
+{
+ rdma_req->completion_flags = 0;
+ rdma_req->req = NULL;
+ TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link);
+}
+
+static void
+nvme_rdma_req_complete(struct spdk_nvme_rdma_req *rdma_req,
+ struct spdk_nvme_cpl *rsp)
+{
+ struct nvme_request *req = rdma_req->req;
+ struct nvme_rdma_qpair *rqpair;
+
+ assert(req != NULL);
+
+ rqpair = nvme_rdma_qpair(req->qpair);
+ TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link);
+
+ nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp);
+ nvme_free_request(req);
+}
+
+static const char *
+nvme_rdma_cm_event_str_get(uint32_t event)
+{
+ if (event < SPDK_COUNTOF(rdma_cm_event_str)) {
+ return rdma_cm_event_str[event];
+ } else {
+ return "Undefined";
+ }
+}
+
+
+static int
+nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair)
+{
+ struct rdma_cm_event *event = rqpair->evt;
+ struct spdk_nvmf_rdma_accept_private_data *accept_data;
+ int rc = 0;
+
+ if (event) {
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ break;
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ break;
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ break;
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ break;
+ case RDMA_CM_EVENT_CONNECT_RESPONSE:
+ rc = spdk_rdma_qp_complete_connect(rqpair->rdma_qp);
+ /* fall through */
+ case RDMA_CM_EVENT_ESTABLISHED:
+ accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data;
+ if (accept_data == NULL) {
+ rc = -1;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested queue depth %d. Actually got queue depth %d.\n",
+ rqpair->num_entries, accept_data->crqsize);
+ rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize);
+ }
+ break;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE;
+ break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
+ break;
+ case RDMA_CM_EVENT_MULTICAST_JOIN:
+ case RDMA_CM_EVENT_MULTICAST_ERROR:
+ break;
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL;
+ break;
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ break;
+ default:
+ SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
+ break;
+ }
+ rqpair->evt = NULL;
+ rdma_ack_cm_event(event);
+ }
+
+ return rc;
+}
+
+/*
+ * This function must be called under the nvme controller's lock
+ * because it touches global controller variables. The lock is taken
+ * by the generic transport code before invoking a few of the functions
+ * in this file: nvme_rdma_ctrlr_connect_qpair, nvme_rdma_ctrlr_delete_io_qpair,
+ * and conditionally nvme_rdma_qpair_process_completions when it is calling
+ * completions on the admin qpair. When adding a new call to this function, please
+ * verify that it is in a situation where it falls under the lock.
+ */
+static int
+nvme_rdma_poll_events(struct nvme_rdma_ctrlr *rctrlr)
+{
+ struct nvme_rdma_cm_event_entry *entry, *tmp;
+ struct nvme_rdma_qpair *event_qpair;
+ struct rdma_cm_event *event;
+ struct rdma_event_channel *channel = rctrlr->cm_channel;
+
+ STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) {
+ event_qpair = nvme_rdma_qpair(entry->evt->id->context);
+ if (event_qpair->evt == NULL) {
+ event_qpair->evt = entry->evt;
+ STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link);
+ STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link);
+ }
+ }
+
+ while (rdma_get_cm_event(channel, &event) == 0) {
+ event_qpair = nvme_rdma_qpair(event->id->context);
+ if (event_qpair->evt == NULL) {
+ event_qpair->evt = event;
+ } else {
+ assert(rctrlr == nvme_rdma_ctrlr(event_qpair->qpair.ctrlr));
+ entry = STAILQ_FIRST(&rctrlr->free_cm_events);
+ if (entry == NULL) {
+ rdma_ack_cm_event(event);
+ return -ENOMEM;
+ }
+ STAILQ_REMOVE(&rctrlr->free_cm_events, entry, nvme_rdma_cm_event_entry, link);
+ entry->evt = event;
+ STAILQ_INSERT_TAIL(&rctrlr->pending_cm_events, entry, link);
+ }
+ }
+
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return 0;
+ } else {
+ return errno;
+ }
+}
+
+static int
+nvme_rdma_validate_cm_event(enum rdma_cm_event_type expected_evt_type,
+ struct rdma_cm_event *reaped_evt)
+{
+ int rc = -EBADMSG;
+
+ if (expected_evt_type == reaped_evt->event) {
+ return 0;
+ }
+
+ switch (expected_evt_type) {
+ case RDMA_CM_EVENT_ESTABLISHED:
+ /*
+ * There is an enum ib_cm_rej_reason in the kernel headers that sets 10 as
+ * IB_CM_REJ_STALE_CONN. I can't find the corresponding userspace constant,
+ * but the same value is reported here.
+ */
+ if (reaped_evt->event == RDMA_CM_EVENT_REJECTED && reaped_evt->status == 10) {
+ rc = -ESTALE;
+ } else if (reaped_evt->event == RDMA_CM_EVENT_CONNECT_RESPONSE) {
+ /*
+ * If we are using a qpair that was not created with the rdma cm API,
+ * we will receive RDMA_CM_EVENT_CONNECT_RESPONSE instead of
+ * RDMA_CM_EVENT_ESTABLISHED.
+ */
+ return 0;
+ }
+ break;
+ default:
+ break;
+ }
+
+ SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n",
+ nvme_rdma_cm_event_str_get(expected_evt_type),
+ nvme_rdma_cm_event_str_get(reaped_evt->event), reaped_evt->event,
+ reaped_evt->status);
+ return rc;
+}
+
+static int
+nvme_rdma_process_event(struct nvme_rdma_qpair *rqpair,
+ struct rdma_event_channel *channel,
+ enum rdma_cm_event_type evt)
+{
+ struct nvme_rdma_ctrlr *rctrlr;
+ uint64_t timeout_ticks;
+ int rc = 0, rc2;
+
+ if (rqpair->evt != NULL) {
+ rc = nvme_rdma_qpair_process_cm_event(rqpair);
+ if (rc) {
+ return rc;
+ }
+ }
+
+ timeout_ticks = (NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC +
+ spdk_get_ticks();
+ rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
+ assert(rctrlr != NULL);
+
+ while (!rqpair->evt && spdk_get_ticks() < timeout_ticks && rc == 0) {
+ rc = nvme_rdma_poll_events(rctrlr);
+ }
+
+ if (rc) {
+ return rc;
+ }
+
+ if (rqpair->evt == NULL) {
+ return -EADDRNOTAVAIL;
+ }
+
+ rc = nvme_rdma_validate_cm_event(evt, rqpair->evt);
+
+ rc2 = nvme_rdma_qpair_process_cm_event(rqpair);
+ /* A bad message takes precedence over other error codes from processing the event. */
+ return rc == 0 ? rc2 : rc;
+}
+
+static int
+nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
+{
+ int rc;
+ struct spdk_rdma_qp_init_attr attr = {};
+ struct ibv_device_attr dev_attr;
+ struct nvme_rdma_ctrlr *rctrlr;
+
+ rc = ibv_query_device(rqpair->cm_id->verbs, &dev_attr);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
+ return -1;
+ }
+
+ if (rqpair->qpair.poll_group) {
+ assert(!rqpair->cq);
+ rc = nvme_poll_group_connect_qpair(&rqpair->qpair);
+ if (rc) {
+ SPDK_ERRLOG("Unable to activate the rdmaqpair.\n");
+ return -1;
+ }
+ assert(rqpair->cq);
+ } else {
+ rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0);
+ if (!rqpair->cq) {
+ SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno));
+ return -1;
+ }
+ }
+
+ rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
+ if (g_nvme_hooks.get_ibv_pd) {
+ rctrlr->pd = g_nvme_hooks.get_ibv_pd(&rctrlr->ctrlr.trid, rqpair->cm_id->verbs);
+ } else {
+ rctrlr->pd = NULL;
+ }
+
+ attr.pd = rctrlr->pd;
+ attr.send_cq = rqpair->cq;
+ attr.recv_cq = rqpair->cq;
+ attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */
+ attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */
+ attr.cap.max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge);
+ attr.cap.max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge);
+
+ rqpair->rdma_qp = spdk_rdma_qp_create(rqpair->cm_id, &attr);
+
+ if (!rqpair->rdma_qp) {
+ return -1;
+ }
+
+ /* ibv_create_qp will change the values in attr.cap. Make sure we store the proper value. */
+ rqpair->max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, attr.cap.max_send_sge);
+ rqpair->max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, attr.cap.max_recv_sge);
+ rqpair->current_num_recvs = 0;
+ rqpair->current_num_sends = 0;
+
+ rctrlr->pd = rqpair->rdma_qp->qp->pd;
+
+ rqpair->cm_id->context = &rqpair->qpair;
+
+ return 0;
+}
+
+static inline int
+nvme_rdma_qpair_submit_sends(struct nvme_rdma_qpair *rqpair)
+{
+ struct ibv_send_wr *bad_send_wr;
+ int rc;
+
+ rc = spdk_rdma_qp_flush_send_wrs(rqpair->rdma_qp, &bad_send_wr);
+
+ if (spdk_unlikely(rc)) {
+ SPDK_ERRLOG("Failed to post WRs on send queue, errno %d (%s), bad_wr %p\n",
+ rc, spdk_strerror(rc), bad_send_wr);
+ while (bad_send_wr != NULL) {
+ assert(rqpair->current_num_sends > 0);
+ rqpair->current_num_sends--;
+ bad_send_wr = bad_send_wr->next;
+ }
+ return rc;
+ }
+
+ return 0;
+}
+
+static inline int
+nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair)
+{
+ struct ibv_recv_wr *bad_recv_wr;
+ int rc = 0;
+
+ if (rqpair->recvs_to_post.first) {
+ rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->recvs_to_post.first, &bad_recv_wr);
+ if (spdk_unlikely(rc)) {
+ SPDK_ERRLOG("Failed to post WRs on receive queue, errno %d (%s), bad_wr %p\n",
+ rc, spdk_strerror(rc), bad_recv_wr);
+ while (bad_recv_wr != NULL) {
+ assert(rqpair->current_num_recvs > 0);
+ rqpair->current_num_recvs--;
+ bad_recv_wr = bad_recv_wr->next;
+ }
+ }
+
+ rqpair->recvs_to_post.first = NULL;
+ }
+ return rc;
+}
+
+/* Append the given send wr structure to the qpair's outstanding sends list. */
+/* This function accepts only a single wr. */
+static inline int
+nvme_rdma_qpair_queue_send_wr(struct nvme_rdma_qpair *rqpair, struct ibv_send_wr *wr)
+{
+ assert(wr->next == NULL);
+
+ assert(rqpair->current_num_sends < rqpair->num_entries);
+
+ rqpair->current_num_sends++;
+ spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, wr);
+
+ if (!rqpair->delay_cmd_submit) {
+ return nvme_rdma_qpair_submit_sends(rqpair);
+ }
+
+ return 0;
+}
+
+/* Append the given recv wr structure to the qpair's outstanding recvs list. */
+/* This function accepts only a single wr. */
+static inline int
+nvme_rdma_qpair_queue_recv_wr(struct nvme_rdma_qpair *rqpair, struct ibv_recv_wr *wr)
+{
+
+ assert(wr->next == NULL);
+ assert(rqpair->current_num_recvs < rqpair->num_entries);
+
+ rqpair->current_num_recvs++;
+ if (rqpair->recvs_to_post.first == NULL) {
+ rqpair->recvs_to_post.first = wr;
+ } else {
+ rqpair->recvs_to_post.last->next = wr;
+ }
+
+ rqpair->recvs_to_post.last = wr;
+
+ if (!rqpair->delay_cmd_submit) {
+ return nvme_rdma_qpair_submit_recvs(rqpair);
+ }
+
+ return 0;
+}
+
+#define nvme_rdma_trace_ibv_sge(sg_list) \
+ do { \
+ if (sg_list) { \
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "local addr %p length 0x%x lkey 0x%x\n", \
+ (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \
+ } \
+ } while (0)
+
+static int
+nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx)
+{
+ struct ibv_recv_wr *wr;
+
+ wr = &rqpair->rsp_recv_wrs[rsp_idx];
+ wr->next = NULL;
+ nvme_rdma_trace_ibv_sge(wr->sg_list);
+ return nvme_rdma_qpair_queue_recv_wr(rqpair, wr);
+}
+
+static int
+nvme_rdma_reg_mr(struct rdma_cm_id *cm_id, union nvme_rdma_mr *mr, void *mem, size_t length)
+{
+ if (!g_nvme_hooks.get_rkey) {
+ mr->mr = rdma_reg_msgs(cm_id, mem, length);
+ if (mr->mr == NULL) {
+ SPDK_ERRLOG("Unable to register mr: %s (%d)\n",
+ spdk_strerror(errno), errno);
+ return -1;
+ }
+ } else {
+ mr->key = g_nvme_hooks.get_rkey(cm_id->pd, mem, length);
+ }
+
+ return 0;
+}
+
+static void
+nvme_rdma_dereg_mr(union nvme_rdma_mr *mr)
+{
+ if (!g_nvme_hooks.get_rkey) {
+ if (mr->mr && rdma_dereg_mr(mr->mr)) {
+ SPDK_ERRLOG("Unable to de-register mr\n");
+ }
+ } else {
+ if (mr->key) {
+ g_nvme_hooks.put_rkey(mr->key);
+ }
+ }
+ memset(mr, 0, sizeof(*mr));
+}
+
+static uint32_t
+nvme_rdma_mr_get_lkey(union nvme_rdma_mr *mr)
+{
+ uint32_t lkey;
+
+ if (!g_nvme_hooks.get_rkey) {
+ lkey = mr->mr->lkey;
+ } else {
+ lkey = *((uint64_t *) mr->key);
+ }
+
+ return lkey;
+}
+
+static void
+nvme_rdma_unregister_rsps(struct nvme_rdma_qpair *rqpair)
+{
+ nvme_rdma_dereg_mr(&rqpair->rsp_mr);
+}
+
+static void
+nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair)
+{
+ nvme_rdma_free(rqpair->rsps);
+ rqpair->rsps = NULL;
+ nvme_rdma_free(rqpair->rsp_sgls);
+ rqpair->rsp_sgls = NULL;
+ nvme_rdma_free(rqpair->rsp_recv_wrs);
+ rqpair->rsp_recv_wrs = NULL;
+}
+
+static int
+nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair)
+{
+ rqpair->rsps = NULL;
+ rqpair->rsp_recv_wrs = NULL;
+
+ rqpair->rsp_sgls = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls));
+ if (!rqpair->rsp_sgls) {
+ SPDK_ERRLOG("Failed to allocate rsp_sgls\n");
+ goto fail;
+ }
+
+ rqpair->rsp_recv_wrs = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_recv_wrs));
+ if (!rqpair->rsp_recv_wrs) {
+ SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n");
+ goto fail;
+ }
+
+ rqpair->rsps = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsps));
+ if (!rqpair->rsps) {
+ SPDK_ERRLOG("can not allocate rdma rsps\n");
+ goto fail;
+ }
+
+ return 0;
+fail:
+ nvme_rdma_free_rsps(rqpair);
+ return -ENOMEM;
+}
+
+static int
+nvme_rdma_register_rsps(struct nvme_rdma_qpair *rqpair)
+{
+ uint16_t i;
+ int rc;
+ uint32_t lkey;
+
+ rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->rsp_mr,
+ rqpair->rsps, rqpair->num_entries * sizeof(*rqpair->rsps));
+
+ if (rc < 0) {
+ goto fail;
+ }
+
+ lkey = nvme_rdma_mr_get_lkey(&rqpair->rsp_mr);
+
+ for (i = 0; i < rqpair->num_entries; i++) {
+ struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i];
+ struct spdk_nvme_rdma_rsp *rsp = &rqpair->rsps[i];
+
+ rsp->rqpair = rqpair;
+ rsp->rdma_wr.type = RDMA_WR_TYPE_RECV;
+ rsp->idx = i;
+ rsp_sgl->addr = (uint64_t)&rqpair->rsps[i];
+ rsp_sgl->length = sizeof(struct spdk_nvme_cpl);
+ rsp_sgl->lkey = lkey;
+
+ rqpair->rsp_recv_wrs[i].wr_id = (uint64_t)&rsp->rdma_wr;
+ rqpair->rsp_recv_wrs[i].next = NULL;
+ rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl;
+ rqpair->rsp_recv_wrs[i].num_sge = 1;
+
+ rc = nvme_rdma_post_recv(rqpair, i);
+ if (rc) {
+ goto fail;
+ }
+ }
+
+ rc = nvme_rdma_qpair_submit_recvs(rqpair);
+ if (rc) {
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ nvme_rdma_unregister_rsps(rqpair);
+ return rc;
+}
+
+static void
+nvme_rdma_unregister_reqs(struct nvme_rdma_qpair *rqpair)
+{
+ nvme_rdma_dereg_mr(&rqpair->cmd_mr);
+}
+
+static void
+nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair)
+{
+ if (!rqpair->rdma_reqs) {
+ return;
+ }
+
+ nvme_rdma_free(rqpair->cmds);
+ rqpair->cmds = NULL;
+
+ nvme_rdma_free(rqpair->rdma_reqs);
+ rqpair->rdma_reqs = NULL;
+}
+
+static int
+nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair)
+{
+ uint16_t i;
+
+ rqpair->rdma_reqs = nvme_rdma_calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req));
+ if (rqpair->rdma_reqs == NULL) {
+ SPDK_ERRLOG("Failed to allocate rdma_reqs\n");
+ goto fail;
+ }
+
+ rqpair->cmds = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->cmds));
+ if (!rqpair->cmds) {
+ SPDK_ERRLOG("Failed to allocate RDMA cmds\n");
+ goto fail;
+ }
+
+
+ TAILQ_INIT(&rqpair->free_reqs);
+ TAILQ_INIT(&rqpair->outstanding_reqs);
+ for (i = 0; i < rqpair->num_entries; i++) {
+ struct spdk_nvme_rdma_req *rdma_req;
+ struct spdk_nvmf_cmd *cmd;
+
+ rdma_req = &rqpair->rdma_reqs[i];
+ rdma_req->rdma_wr.type = RDMA_WR_TYPE_SEND;
+ cmd = &rqpair->cmds[i];
+
+ rdma_req->id = i;
+
+ /* The first RDMA sgl element will always point
+ * at this data structure. Depending on whether
+ * an NVMe-oF SGL is required, the length of
+ * this element may change. */
+ rdma_req->send_sgl[0].addr = (uint64_t)cmd;
+ rdma_req->send_wr.wr_id = (uint64_t)&rdma_req->rdma_wr;
+ rdma_req->send_wr.next = NULL;
+ rdma_req->send_wr.opcode = IBV_WR_SEND;
+ rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED;
+ rdma_req->send_wr.sg_list = rdma_req->send_sgl;
+ rdma_req->send_wr.imm_data = 0;
+
+ TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link);
+ }
+
+ return 0;
+fail:
+ nvme_rdma_free_reqs(rqpair);
+ return -ENOMEM;
+}
+
+static int
+nvme_rdma_register_reqs(struct nvme_rdma_qpair *rqpair)
+{
+ int i;
+ int rc;
+ uint32_t lkey;
+
+ rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->cmd_mr,
+ rqpair->cmds, rqpair->num_entries * sizeof(*rqpair->cmds));
+
+ if (rc < 0) {
+ goto fail;
+ }
+
+ lkey = nvme_rdma_mr_get_lkey(&rqpair->cmd_mr);
+
+ for (i = 0; i < rqpair->num_entries; i++) {
+ rqpair->rdma_reqs[i].send_sgl[0].lkey = lkey;
+ }
+
+ return 0;
+
+fail:
+ nvme_rdma_unregister_reqs(rqpair);
+ return -ENOMEM;
+}
+
+static int
+nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair,
+ struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_event_channel *cm_channel)
+{
+ int ret;
+
+ ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr,
+ NVME_RDMA_TIME_OUT_IN_MS);
+ if (ret) {
+ SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno);
+ return ret;
+ }
+
+ ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED);
+ if (ret) {
+ SPDK_ERRLOG("RDMA address resolution error\n");
+ return -1;
+ }
+
+ if (rqpair->qpair.ctrlr->opts.transport_ack_timeout != SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED) {
+#ifdef SPDK_CONFIG_RDMA_SET_ACK_TIMEOUT
+ uint8_t timeout = rqpair->qpair.ctrlr->opts.transport_ack_timeout;
+ ret = rdma_set_option(rqpair->cm_id, RDMA_OPTION_ID,
+ RDMA_OPTION_ID_ACK_TIMEOUT,
+ &timeout, sizeof(timeout));
+ if (ret) {
+ SPDK_NOTICELOG("Can't apply RDMA_OPTION_ID_ACK_TIMEOUT %d, ret %d\n", timeout, ret);
+ }
+#else
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport_ack_timeout is not supported\n");
+#endif
+ }
+
+
+ ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS);
+ if (ret) {
+ SPDK_ERRLOG("rdma_resolve_route\n");
+ return ret;
+ }
+
+ ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED);
+ if (ret) {
+ SPDK_ERRLOG("RDMA route resolution error\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
+{
+ struct rdma_conn_param param = {};
+ struct spdk_nvmf_rdma_request_private_data request_data = {};
+ struct ibv_device_attr attr;
+ int ret;
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct nvme_rdma_ctrlr *rctrlr;
+
+ ret = ibv_query_device(rqpair->cm_id->verbs, &attr);
+ if (ret != 0) {
+ SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
+ return ret;
+ }
+
+ param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom);
+
+ ctrlr = rqpair->qpair.ctrlr;
+ if (!ctrlr) {
+ return -1;
+ }
+ rctrlr = nvme_rdma_ctrlr(ctrlr);
+ assert(rctrlr != NULL);
+
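+ /* hrqsize and hsqsize are the host receive/send queue sizes carried in the
+ * RDMA CM private data; hsqsize is specified as a 0-based value, hence
+ * num_entries - 1. */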
+ request_data.qid = rqpair->qpair.id;
+ request_data.hrqsize = rqpair->num_entries;
+ request_data.hsqsize = rqpair->num_entries - 1;
+ request_data.cntlid = ctrlr->cntlid;
+
+ param.private_data = &request_data;
+ param.private_data_len = sizeof(request_data);
+ param.retry_count = ctrlr->opts.transport_retry_count;
+ param.rnr_retry_count = 7;
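+ /* An RNR retry count of 7 is the special value meaning "retry
+ * indefinitely" when the peer reports Receiver Not Ready. */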
+
+ /* Fields below are ignored by rdma cm if qpair has been
+ * created using rdma cm API. */
+ param.srq = 0;
+ param.qp_num = rqpair->rdma_qp->qp->qp_num;
+
+ ret = rdma_connect(rqpair->cm_id, &param);
+ if (ret) {
+ SPDK_ERRLOG("nvme rdma connect error\n");
+ return ret;
+ }
+
+ ret = nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_ESTABLISHED);
+ if (ret == -ESTALE) {
+ SPDK_NOTICELOG("Received a stale connection notice during connection.\n");
+ return -EAGAIN;
+ } else if (ret) {
+ SPDK_ERRLOG("RDMA connect error %d\n", ret);
+ return ret;
+ } else {
+ return 0;
+ }
+}
+
+static int
+nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
+{
+ struct addrinfo *res;
+ struct addrinfo hints;
+ int ret;
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = family;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_protocol = 0;
+
+ ret = getaddrinfo(addr, service, &hints, &res);
+ if (ret) {
+ SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret);
+ return ret;
+ }
+
+ if (res->ai_addrlen > sizeof(*sa)) {
+ SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen);
+ ret = EINVAL;
+ } else {
+ memcpy(sa, res->ai_addr, res->ai_addrlen);
+ }
+
+ freeaddrinfo(res);
+ return ret;
+}
+
+static int
+nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t size)
+{
+ struct ibv_pd *pd = cb_ctx;
+ struct ibv_mr *mr;
+ int rc;
+
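+ /* Register an MR covering the new region (or consult the application's
+ * get_rkey hook) when memory is added to the SPDK mem map, and drop the
+ * translation and MR when it is removed. */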
+ switch (action) {
+ case SPDK_MEM_MAP_NOTIFY_REGISTER:
+ if (!g_nvme_hooks.get_rkey) {
+ mr = ibv_reg_mr(pd, vaddr, size,
+ IBV_ACCESS_LOCAL_WRITE |
+ IBV_ACCESS_REMOTE_READ |
+ IBV_ACCESS_REMOTE_WRITE);
+ if (mr == NULL) {
+ SPDK_ERRLOG("ibv_reg_mr() failed\n");
+ return -EFAULT;
+ } else {
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
+ }
+ } else {
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
+ g_nvme_hooks.get_rkey(pd, vaddr, size));
+ }
+ break;
+ case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
+ if (!g_nvme_hooks.get_rkey) {
+ mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
+ if (mr) {
+ ibv_dereg_mr(mr);
+ }
+ }
+ rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ }
+
+ return rc;
+}
+
+static int
+nvme_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
+{
+ /* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
+ return addr_1 == addr_2;
+}
+
+static int
+nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair)
+{
+ struct ibv_pd *pd = rqpair->rdma_qp->qp->pd;
+ struct spdk_nvme_rdma_mr_map *mr_map;
+ const struct spdk_mem_map_ops nvme_rdma_map_ops = {
+ .notify_cb = nvme_rdma_mr_map_notify,
+ .are_contiguous = nvme_rdma_check_contiguous_entries
+ };
+
+ pthread_mutex_lock(&g_rdma_mr_maps_mutex);
+
+ /* Look up existing mem map registration for this pd */
+ LIST_FOREACH(mr_map, &g_rdma_mr_maps, link) {
+ if (mr_map->pd == pd) {
+ mr_map->ref++;
+ rqpair->mr_map = mr_map;
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+ return 0;
+ }
+ }
+
+ mr_map = nvme_rdma_calloc(1, sizeof(*mr_map));
+ if (mr_map == NULL) {
+ SPDK_ERRLOG("Failed to allocate mr_map\n");
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+ return -1;
+ }
+
+ mr_map->ref = 1;
+ mr_map->pd = pd;
+ mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd);
+ if (mr_map->map == NULL) {
+ SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
+ nvme_rdma_free(mr_map);
+
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+ return -1;
+ }
+
+ rqpair->mr_map = mr_map;
+ LIST_INSERT_HEAD(&g_rdma_mr_maps, mr_map, link);
+
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+
+ return 0;
+}
+
+static void
+nvme_rdma_unregister_mem(struct nvme_rdma_qpair *rqpair)
+{
+ struct spdk_nvme_rdma_mr_map *mr_map;
+
+ mr_map = rqpair->mr_map;
+ rqpair->mr_map = NULL;
+
+ if (mr_map == NULL) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_rdma_mr_maps_mutex);
+
+ assert(mr_map->ref > 0);
+ mr_map->ref--;
+ if (mr_map->ref == 0) {
+ LIST_REMOVE(mr_map, link);
+ spdk_mem_map_free(&mr_map->map);
+ nvme_rdma_free(mr_map);
+ }
+
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+}
+
+static int
+_nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ struct sockaddr_storage dst_addr;
+ struct sockaddr_storage src_addr;
+ bool src_addr_specified;
+ int rc;
+ struct nvme_rdma_ctrlr *rctrlr;
+ struct nvme_rdma_qpair *rqpair;
+ int family;
+
+ rqpair = nvme_rdma_qpair(qpair);
+ rctrlr = nvme_rdma_ctrlr(ctrlr);
+ assert(rctrlr != NULL);
+
+ switch (ctrlr->trid.adrfam) {
+ case SPDK_NVMF_ADRFAM_IPV4:
+ family = AF_INET;
+ break;
+ case SPDK_NVMF_ADRFAM_IPV6:
+ family = AF_INET6;
+ break;
+ default:
+ SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam);
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family);
+
+ memset(&dst_addr, 0, sizeof(dst_addr));
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid);
+ rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid);
+ if (rc != 0) {
+ SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n");
+ return -1;
+ }
+
+ if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) {
+ memset(&src_addr, 0, sizeof(src_addr));
+ rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid);
+ if (rc != 0) {
+ SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n");
+ return -1;
+ }
+ src_addr_specified = true;
+ } else {
+ src_addr_specified = false;
+ }
+
+ rc = rdma_create_id(rctrlr->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP);
+ if (rc < 0) {
+ SPDK_ERRLOG("rdma_create_id() failed\n");
+ return -1;
+ }
+
+ rc = nvme_rdma_resolve_addr(rqpair,
+ src_addr_specified ? (struct sockaddr *)&src_addr : NULL,
+ (struct sockaddr *)&dst_addr, rctrlr->cm_channel);
+ if (rc < 0) {
+ SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n");
+ return -1;
+ }
+
+ rc = nvme_rdma_qpair_init(rqpair);
+ if (rc < 0) {
+ SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n");
+ return -1;
+ }
+
+ rc = nvme_rdma_connect(rqpair);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to connect the rqpair\n");
+ return rc;
+ }
+
+ rc = nvme_rdma_register_reqs(rqpair);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
+ if (rc) {
+ SPDK_ERRLOG("Unable to register rqpair RDMA requests\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests registered\n");
+
+ rc = nvme_rdma_register_rsps(rqpair);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to register rqpair RDMA responses\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses registered\n");
+
+ rc = nvme_rdma_register_mem(rqpair);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to register memory for RDMA\n");
+ return -1;
+ }
+
+ rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries);
+ if (rc < 0) {
+ rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
+ SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ int rc;
+ int retry_count = 0;
+
+ rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair);
+
+ /*
+ * -EAGAIN represents the special case where the target side still thought it was connected.
+ * Most NICs will fail the first connection attempt, and the NICs will clean up whatever
+ * state they need to. After that, subsequent connection attempts will succeed.
+ */
+ if (rc == -EAGAIN) {
+ SPDK_NOTICELOG("Detected stale connection on Target side for qpid: %d\n", qpair->id);
+ do {
+ nvme_delay(NVME_RDMA_STALE_CONN_RETRY_DELAY_US);
+ nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
+ rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair);
+ retry_count++;
+ } while (rc == -EAGAIN && retry_count < NVME_RDMA_STALE_CONN_RETRY_MAX);
+ }
+
+ return rc;
+}
+
+/*
+ * Build SGL describing empty payload.
+ */
+static int
+nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ /* The RDMA SGL needs one element describing the NVMe command. */
+ rdma_req->send_wr.num_sge = 1;
+
+ req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
+ req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
+ req->cmd.dptr.sgl1.keyed.length = 0;
+ req->cmd.dptr.sgl1.keyed.key = 0;
+ req->cmd.dptr.sgl1.address = 0;
+
+ return 0;
+}
+
+static inline bool
+nvme_rdma_get_key(struct spdk_mem_map *map, void *payload, uint64_t size,
+ enum nvme_rdma_key_type key_type, uint32_t *key)
+{
+ struct ibv_mr *mr;
+ uint64_t real_size = size;
+ uint32_t _key = 0;
+
+ if (!g_nvme_hooks.get_rkey) {
+ mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)payload, &real_size);
+
+ if (spdk_unlikely(!mr)) {
+ SPDK_ERRLOG("No translation for ptr %p, size %lu\n", payload, size);
+ return false;
+ }
+ switch (key_type) {
+ case NVME_RDMA_MR_RKEY:
+ _key = mr->rkey;
+ break;
+ case NVME_RDMA_MR_LKEY:
+ _key = mr->lkey;
+ break;
+ default:
+ SPDK_ERRLOG("Invalid key type %d\n", key_type);
+ assert(0);
+ return false;
+ }
+ } else {
+ _key = spdk_mem_map_translate(map, (uint64_t)payload, &real_size);
+ }
+
+ if (spdk_unlikely(real_size < size)) {
+ SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n");
+ return false;
+ }
+
+ *key = _key;
+ return true;
+}
+
+/*
+ * Build inline SGL describing contiguous payload buffer.
+ */
+static int
+nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+ uint32_t lkey = 0;
+ void *payload;
+
+ payload = req->payload.contig_or_cb_arg + req->payload_offset;
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
+
+ if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size,
+ NVME_RDMA_MR_LKEY, &lkey))) {
+ return -1;
+ }
+
+ rdma_req->send_sgl[1].lkey = lkey;
+
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ rdma_req->send_sgl[1].addr = (uint64_t)payload;
+ rdma_req->send_sgl[1].length = (uint32_t)req->payload_size;
+
+ /* The RDMA SGL contains two elements. The first describes
+ * the NVMe command and the second describes the data
+ * payload. */
+ rdma_req->send_wr.num_sge = 2;
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
+ req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size;
+ /* Inline only supported for icdoff == 0 currently. This function will
+ * not get called for controllers with other values. */
+ req->cmd.dptr.sgl1.address = (uint64_t)0;
+
+ return 0;
+}
+
+/*
+ * Build SGL describing contiguous payload buffer.
+ */
+static int
+nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+ void *payload = req->payload.contig_or_cb_arg + req->payload_offset;
+ uint32_t rkey = 0;
+
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
+
+ if (spdk_unlikely(req->payload_size > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) {
+ SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n",
+ req->payload_size, NVME_RDMA_MAX_KEYED_SGL_LENGTH);
+ return -1;
+ }
+
+ if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size,
+ NVME_RDMA_MR_RKEY, &rkey))) {
+ return -1;
+ }
+
+ req->cmd.dptr.sgl1.keyed.key = rkey;
+
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ /* The RDMA SGL needs one element describing the NVMe command. */
+ rdma_req->send_wr.num_sge = 1;
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
+ req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
+ req->cmd.dptr.sgl1.keyed.length = req->payload_size;
+ req->cmd.dptr.sgl1.address = (uint64_t)payload;
+
+ return 0;
+}
+
+/*
+ * Build SGL describing scattered payload buffer.
+ */
+static int
+nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+ struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id];
+ void *virt_addr;
+ uint32_t remaining_size;
+ uint32_t sge_length;
+ int rc, max_num_sgl, num_sgl_desc;
+ uint32_t rkey = 0;
+
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ assert(req->payload.next_sge_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ max_num_sgl = req->qpair->ctrlr->max_sges;
+
+ remaining_size = req->payload_size;
+ num_sgl_desc = 0;
+ do {
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length);
+ if (rc) {
+ return -1;
+ }
+
+ sge_length = spdk_min(remaining_size, sge_length);
+
+ if (spdk_unlikely(sge_length > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) {
+ SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n",
+ sge_length, NVME_RDMA_MAX_KEYED_SGL_LENGTH);
+ return -1;
+ }
+
+ if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, sge_length,
+ NVME_RDMA_MR_RKEY, &rkey))) {
+ return -1;
+ }
+
+ cmd->sgl[num_sgl_desc].keyed.key = rkey;
+ cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
+ cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
+ cmd->sgl[num_sgl_desc].keyed.length = sge_length;
+ cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr;
+
+ remaining_size -= sge_length;
+ num_sgl_desc++;
+ } while (remaining_size > 0 && num_sgl_desc < max_num_sgl);
+
+
+ /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */
+ if (remaining_size > 0) {
+ return -1;
+ }
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+
+ /* The RDMA SGL needs one element describing some portion
+ * of the spdk_nvmf_cmd structure. */
+ rdma_req->send_wr.num_sge = 1;
+
+ /*
+ * If only one SGL descriptor is required, it can be embedded directly in the command
+ * as a data block descriptor.
+ */
+ if (num_sgl_desc == 1) {
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ req->cmd.dptr.sgl1.keyed.type = cmd->sgl[0].keyed.type;
+ req->cmd.dptr.sgl1.keyed.subtype = cmd->sgl[0].keyed.subtype;
+ req->cmd.dptr.sgl1.keyed.length = cmd->sgl[0].keyed.length;
+ req->cmd.dptr.sgl1.keyed.key = cmd->sgl[0].keyed.key;
+ req->cmd.dptr.sgl1.address = cmd->sgl[0].address;
+ } else {
+ /*
+ * Otherwise, the SGL descriptor embedded in the command must point to the list of
+ * SGL descriptors used to describe the operation. In that case it is a last segment descriptor.
+ */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct
+ spdk_nvme_sgl_descriptor) * num_sgl_desc;
+
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
+ req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
+ req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor);
+ req->cmd.dptr.sgl1.address = (uint64_t)0;
+ }
+
+ return 0;
+}
+
+/*
+ * Build inline SGL describing sgl payload buffer.
+ */
+static int
+nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+ uint32_t lkey = 0;
+ uint32_t length;
+ void *virt_addr;
+ int rc;
+
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ assert(req->payload.next_sge_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
+ if (rc) {
+ return -1;
+ }
+
+ if (length < req->payload_size) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Inline SGL request split so sending separately.\n");
+ return nvme_rdma_build_sgl_request(rqpair, rdma_req);
+ }
+
+ if (length > req->payload_size) {
+ length = req->payload_size;
+ }
+
+ if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, length,
+ NVME_RDMA_MR_LKEY, &lkey))) {
+ return -1;
+ }
+
+ rdma_req->send_sgl[1].addr = (uint64_t)virt_addr;
+ rdma_req->send_sgl[1].length = length;
+ rdma_req->send_sgl[1].lkey = lkey;
+
+ rdma_req->send_wr.num_sge = 2;
+
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
+ req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size;
+ /* Inline only supported for icdoff == 0 currently. This function will
+ * not get called for controllers with other values. */
+ req->cmd.dptr.sgl1.address = (uint64_t)0;
+
+ return 0;
+}
+
+static int
+nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr;
+ enum nvme_payload_type payload_type;
+ bool icd_supported;
+ int rc;
+
+ assert(rdma_req->req == NULL);
+ rdma_req->req = req;
+ req->cmd.cid = rdma_req->id;
+ payload_type = nvme_payload_type(&req->payload);
+ /*
+ * Check if icdoff is non zero, to avoid interop conflicts with
+ * targets with non-zero icdoff. Both SPDK and the Linux kernel
+ * targets use icdoff = 0. For targets with non-zero icdoff, we
+ * will currently just not use inline data for now.
+ */
+ icd_supported = spdk_nvme_opc_get_data_transfer(req->cmd.opc) == SPDK_NVME_DATA_HOST_TO_CONTROLLER
+ && req->payload_size <= ctrlr->ioccsz_bytes && ctrlr->icdoff == 0;
+
+ if (req->payload_size == 0) {
+ rc = nvme_rdma_build_null_request(rdma_req);
+ } else if (payload_type == NVME_PAYLOAD_TYPE_CONTIG) {
+ if (icd_supported) {
+ rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req);
+ } else {
+ rc = nvme_rdma_build_contig_request(rqpair, rdma_req);
+ }
+ } else if (payload_type == NVME_PAYLOAD_TYPE_SGL) {
+ if (icd_supported) {
+ rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req);
+ } else {
+ rc = nvme_rdma_build_sgl_request(rqpair, rdma_req);
+ }
+ } else {
+ rc = -1;
+ }
+
+ if (rc) {
+ rdma_req->req = NULL;
+ return rc;
+ }
+
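+ /* send_sgl[0] of each request points at rqpair->cmds[id] (set up in
+ * nvme_rdma_alloc_reqs), so copy the finished command there before the
+ * request is posted to the send queue. */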
+ memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd));
+ return 0;
+}
+
+static struct spdk_nvme_qpair *
+nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ uint16_t qid, uint32_t qsize,
+ enum spdk_nvme_qprio qprio,
+ uint32_t num_requests,
+ bool delay_cmd_submit)
+{
+ struct nvme_rdma_qpair *rqpair;
+ struct spdk_nvme_qpair *qpair;
+ int rc;
+
+ rqpair = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_qpair));
+ if (!rqpair) {
+ SPDK_ERRLOG("failed to get create rqpair\n");
+ return NULL;
+ }
+
+ rqpair->num_entries = qsize;
+ rqpair->delay_cmd_submit = delay_cmd_submit;
+ qpair = &rqpair->qpair;
+ rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests);
+ if (rc != 0) {
+ nvme_rdma_free(rqpair);
+ return NULL;
+ }
+
+ rc = nvme_rdma_alloc_reqs(rqpair);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
+ if (rc) {
+ SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n");
+ nvme_rdma_free(rqpair);
+ return NULL;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests allocated\n");
+
+ rc = nvme_rdma_alloc_rsps(rqpair);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n");
+ nvme_rdma_free_reqs(rqpair);
+ nvme_rdma_free(rqpair);
+ return NULL;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n");
+
+ return qpair;
+}
+
+static void
+nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+ struct nvme_rdma_ctrlr *rctrlr = NULL;
+ struct nvme_rdma_cm_event_entry *entry, *tmp;
+
+ nvme_rdma_unregister_mem(rqpair);
+ nvme_rdma_unregister_reqs(rqpair);
+ nvme_rdma_unregister_rsps(rqpair);
+
+ if (rqpair->evt) {
+ rdma_ack_cm_event(rqpair->evt);
+ rqpair->evt = NULL;
+ }
+
+ /*
+ * This works because we have the controller lock both in
+ * this function and in the function where we add new events.
+ */
+ if (qpair->ctrlr != NULL) {
+ rctrlr = nvme_rdma_ctrlr(qpair->ctrlr);
+ STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) {
+ if (nvme_rdma_qpair(entry->evt->id->context) == rqpair) {
+ STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link);
+ rdma_ack_cm_event(entry->evt);
+ STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link);
+ }
+ }
+ }
+
+ if (rqpair->cm_id) {
+ if (rqpair->rdma_qp) {
+ spdk_rdma_qp_disconnect(rqpair->rdma_qp);
+ if (rctrlr != NULL) {
+ if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Target did not respond to qpair disconnect.\n");
+ }
+ }
+ spdk_rdma_qp_destroy(rqpair->rdma_qp);
+ rqpair->rdma_qp = NULL;
+ }
+
+ rdma_destroy_id(rqpair->cm_id);
+ rqpair->cm_id = NULL;
+ }
+
+ if (rqpair->cq) {
+ ibv_destroy_cq(rqpair->cq);
+ rqpair->cq = NULL;
+ }
+}
+
+static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
+
+static int
+nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_rdma_qpair *rqpair;
+
+ rqpair = nvme_rdma_qpair(qpair);
+ nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
+ if (rqpair->defer_deletion_to_pg) {
+ nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING);
+ return 0;
+ }
+
+ nvme_rdma_qpair_abort_reqs(qpair, 1);
+ nvme_qpair_deinit(qpair);
+
+ nvme_rdma_free_reqs(rqpair);
+ nvme_rdma_free_rsps(rqpair);
+ nvme_rdma_free(rqpair);
+
+ return 0;
+}
+
+static struct spdk_nvme_qpair *
+nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio,
+ opts->io_queue_requests,
+ opts->delay_cmd_submit);
+}
+
+static int
+nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ /* do nothing here */
+ return 0;
+}
+
+static int nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr);
+
+static struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ void *devhandle)
+{
+ struct nvme_rdma_ctrlr *rctrlr;
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_vs_register vs;
+ struct ibv_context **contexts;
+ struct ibv_device_attr dev_attr;
+ int i, flag, rc;
+
+ rctrlr = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_ctrlr));
+ if (rctrlr == NULL) {
+ SPDK_ERRLOG("could not allocate ctrlr\n");
+ return NULL;
+ }
+
+ rctrlr->ctrlr.opts = *opts;
+ rctrlr->ctrlr.trid = *trid;
+
+ if (opts->transport_retry_count > NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT) {
+ SPDK_NOTICELOG("transport_retry_count exceeds max value %d, use max value\n",
+ NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT);
+ rctrlr->ctrlr.opts.transport_retry_count = NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT;
+ }
+
+ if (opts->transport_ack_timeout > NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT) {
+ SPDK_NOTICELOG("transport_ack_timeout exceeds max value %d, use max value\n",
+ NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT);
+ rctrlr->ctrlr.opts.transport_ack_timeout = NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT;
+ }
+
+ contexts = rdma_get_devices(NULL);
+ if (contexts == NULL) {
+ SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
+ nvme_rdma_free(rctrlr);
+ return NULL;
+ }
+
+ i = 0;
+ rctrlr->max_sge = NVME_RDMA_MAX_SGL_DESCRIPTORS;
+
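+ /* Start from the transport maximum and clamp max_sge to the smallest value
+ * reported by any RDMA device, since queue pairs may be created on any of
+ * them. */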
+ while (contexts[i] != NULL) {
+ rc = ibv_query_device(contexts[i], &dev_attr);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
+ rdma_free_devices(contexts);
+ nvme_rdma_free(rctrlr);
+ return NULL;
+ }
+ rctrlr->max_sge = spdk_min(rctrlr->max_sge, (uint16_t)dev_attr.max_sge);
+ i++;
+ }
+
+ rdma_free_devices(contexts);
+
+ rc = nvme_ctrlr_construct(&rctrlr->ctrlr);
+ if (rc != 0) {
+ nvme_rdma_free(rctrlr);
+ return NULL;
+ }
+
+ STAILQ_INIT(&rctrlr->pending_cm_events);
+ STAILQ_INIT(&rctrlr->free_cm_events);
+ rctrlr->cm_events = nvme_rdma_calloc(NVME_RDMA_NUM_CM_EVENTS, sizeof(*rctrlr->cm_events));
+ if (rctrlr->cm_events == NULL) {
+ SPDK_ERRLOG("unable to allocat buffers to hold CM events.\n");
+ goto destruct_ctrlr;
+ }
+
+ for (i = 0; i < NVME_RDMA_NUM_CM_EVENTS; i++) {
+ STAILQ_INSERT_TAIL(&rctrlr->free_cm_events, &rctrlr->cm_events[i], link);
+ }
+
+ rctrlr->cm_channel = rdma_create_event_channel();
+ if (rctrlr->cm_channel == NULL) {
+ SPDK_ERRLOG("rdma_create_event_channel() failed\n");
+ goto destruct_ctrlr;
+ }
+
+ flag = fcntl(rctrlr->cm_channel->fd, F_GETFL);
+ if (fcntl(rctrlr->cm_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("Cannot set event channel to non blocking\n");
+ goto destruct_ctrlr;
+ }
+
+ rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0,
+ rctrlr->ctrlr.opts.admin_queue_size, 0,
+ rctrlr->ctrlr.opts.admin_queue_size, false);
+ if (!rctrlr->ctrlr.adminq) {
+ SPDK_ERRLOG("failed to create admin qpair\n");
+ goto destruct_ctrlr;
+ }
+
+ rc = nvme_transport_ctrlr_connect_qpair(&rctrlr->ctrlr, rctrlr->ctrlr.adminq);
+ if (rc < 0) {
+ SPDK_ERRLOG("failed to connect admin qpair\n");
+ goto destruct_ctrlr;
+ }
+
+ if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) {
+ SPDK_ERRLOG("get_cap() failed\n");
+ goto destruct_ctrlr;
+ }
+
+ if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) {
+ SPDK_ERRLOG("get_vs() failed\n");
+ goto destruct_ctrlr;
+ }
+
+ if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) {
+ SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n");
+ goto destruct_ctrlr;
+ }
+
+ nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n");
+ return &rctrlr->ctrlr;
+
+destruct_ctrlr:
+ nvme_ctrlr_destruct(&rctrlr->ctrlr);
+ return NULL;
+}
+
+static int
+nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr);
+ struct nvme_rdma_cm_event_entry *entry;
+
+ if (ctrlr->adminq) {
+ nvme_rdma_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq);
+ }
+
+ STAILQ_FOREACH(entry, &rctrlr->pending_cm_events, link) {
+ rdma_ack_cm_event(entry->evt);
+ }
+
+ STAILQ_INIT(&rctrlr->free_cm_events);
+ STAILQ_INIT(&rctrlr->pending_cm_events);
+ nvme_rdma_free(rctrlr->cm_events);
+
+ if (rctrlr->cm_channel) {
+ rdma_destroy_event_channel(rctrlr->cm_channel);
+ rctrlr->cm_channel = NULL;
+ }
+
+ nvme_ctrlr_destruct_finish(ctrlr);
+
+ nvme_rdma_free(rctrlr);
+
+ return 0;
+}
+
+static int
+nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req)
+{
+ struct nvme_rdma_qpair *rqpair;
+ struct spdk_nvme_rdma_req *rdma_req;
+ struct ibv_send_wr *wr;
+
+ rqpair = nvme_rdma_qpair(qpair);
+ assert(rqpair != NULL);
+ assert(req != NULL);
+
+ rdma_req = nvme_rdma_req_get(rqpair);
+ if (!rdma_req) {
+ /* Inform the upper layer to try again later. */
+ return -EAGAIN;
+ }
+
+ if (nvme_rdma_req_init(rqpair, req, rdma_req)) {
+ SPDK_ERRLOG("nvme_rdma_req_init() failed\n");
+ TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link);
+ nvme_rdma_req_put(rqpair, rdma_req);
+ return -1;
+ }
+
+ wr = &rdma_req->send_wr;
+ wr->next = NULL;
+ nvme_rdma_trace_ibv_sge(wr->sg_list);
+ return nvme_rdma_qpair_queue_send_wr(rqpair, wr);
+}
+
+static int
+nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair)
+{
+ /* Currently, doing nothing here */
+ return 0;
+}
+
+static void
+nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
+{
+ struct spdk_nvme_rdma_req *rdma_req, *tmp;
+ struct spdk_nvme_cpl cpl;
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+
+ cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
+ cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ cpl.status.dnr = dnr;
+
+ /*
+ * We cannot abort requests at the RDMA layer without
+ * unregistering them. If we do, we can still get error
+ * free completions on the shared completion queue.
+ */
+ if (nvme_qpair_get_state(qpair) > NVME_QPAIR_DISCONNECTING &&
+ nvme_qpair_get_state(qpair) != NVME_QPAIR_DESTROYING) {
+ nvme_ctrlr_disconnect_qpair(qpair);
+ }
+
+ TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
+ nvme_rdma_req_complete(rdma_req, &cpl);
+ nvme_rdma_req_put(rqpair, rdma_req);
+ }
+}
+
+static void
+nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
+{
+ uint64_t t02;
+ struct spdk_nvme_rdma_req *rdma_req, *tmp;
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ /* Don't check timeouts during controller initialization. */
+ if (ctrlr->state != NVME_CTRLR_STATE_READY) {
+ return;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ } else {
+ active_proc = qpair->active_proc;
+ }
+
+ /* Only check timeouts if the current process has a timeout callback. */
+ if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
+ return;
+ }
+
+ t02 = spdk_get_ticks();
+ TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
+ assert(rdma_req->req != NULL);
+
+ if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) {
+ /*
+ * The requests are in order, so as soon as one has not timed out,
+ * stop iterating.
+ */
+ break;
+ }
+ }
+}
+
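+/*
+ * Called once both the send and recv completions of a request have been
+ * reaped: complete the request back to the caller, recycle it, and re-post
+ * its receive buffer.
+ */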
+static inline int
+nvme_rdma_request_ready(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req)
+{
+ nvme_rdma_req_complete(rdma_req, &rqpair->rsps[rdma_req->rsp_idx].cpl);
+ nvme_rdma_req_put(rqpair, rdma_req);
+ return nvme_rdma_post_recv(rqpair, rdma_req->rsp_idx);
+}
+
+#define MAX_COMPLETIONS_PER_POLL 128
+
+static void
+nvme_rdma_fail_qpair(struct spdk_nvme_qpair *qpair, int failure_reason)
+{
+ if (failure_reason == IBV_WC_RETRY_EXC_ERR) {
+ qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE;
+ } else if (qpair->transport_failure_reason == SPDK_NVME_QPAIR_FAILURE_NONE) {
+ qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
+ }
+
+ nvme_ctrlr_disconnect_qpair(qpair);
+}
+
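+/*
+ * Fail the qpair unless the poll group is already tracking it as a destroyed
+ * qpair, in which case the error completion is just late flush traffic for a
+ * qpair that is already being torn down.
+ */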
+static void
+nvme_rdma_conditional_fail_qpair(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poll_group *group)
+{
+ struct nvme_rdma_destroyed_qpair *qpair_tracker;
+
+ assert(rqpair);
+ if (group) {
+ STAILQ_FOREACH(qpair_tracker, &group->destroyed_qpairs, link) {
+ if (qpair_tracker->destroyed_qpair_tracker == rqpair) {
+ return;
+ }
+ }
+ }
+ nvme_rdma_fail_qpair(&rqpair->qpair, 0);
+}
+
+static int
+nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size,
+ struct nvme_rdma_poll_group *group,
+ struct nvme_rdma_qpair *rdma_qpair)
+{
+ struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL];
+ struct nvme_rdma_qpair *rqpair;
+ struct spdk_nvme_rdma_req *rdma_req;
+ struct spdk_nvme_rdma_rsp *rdma_rsp;
+ struct nvme_rdma_wr *rdma_wr;
+ uint32_t reaped = 0;
+ int completion_rc = 0;
+ int rc, i;
+
+ rc = ibv_poll_cq(cq, batch_size, wc);
+ if (rc < 0) {
+ SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
+ errno, spdk_strerror(errno));
+ return -ECANCELED;
+ } else if (rc == 0) {
+ return 0;
+ }
+
+ for (i = 0; i < rc; i++) {
+ rdma_wr = (struct nvme_rdma_wr *)wc[i].wr_id;
+ switch (rdma_wr->type) {
+ case RDMA_WR_TYPE_RECV:
+ rdma_rsp = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_rsp, rdma_wr);
+ rqpair = rdma_rsp->rqpair;
+ assert(rqpair->current_num_recvs > 0);
+ rqpair->current_num_recvs--;
+
+ if (wc[i].status) {
+ SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n",
+ rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
+ nvme_rdma_conditional_fail_qpair(rqpair, group);
+ completion_rc = -ENXIO;
+ continue;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CQ recv completion\n");
+
+ if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) {
+ SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len);
+ nvme_rdma_conditional_fail_qpair(rqpair, group);
+ completion_rc = -ENXIO;
+ continue;
+ }
+ rdma_req = &rqpair->rdma_reqs[rdma_rsp->cpl.cid];
+ rdma_req->completion_flags |= NVME_RDMA_RECV_COMPLETED;
+ rdma_req->rsp_idx = rdma_rsp->idx;
+
+ if ((rdma_req->completion_flags & NVME_RDMA_SEND_COMPLETED) != 0) {
+ if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) {
+ SPDK_ERRLOG("Unable to re-post rx descriptor\n");
+ nvme_rdma_conditional_fail_qpair(rqpair, group);
+ completion_rc = -ENXIO;
+ continue;
+ }
+ reaped++;
+ rqpair->num_completions++;
+ }
+ break;
+
+ case RDMA_WR_TYPE_SEND:
+ rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr);
+
+ /* If we are flushing I/O */
+ if (wc[i].status) {
+ rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL;
+ if (!rqpair) {
+ rqpair = rdma_qpair != NULL ? rdma_qpair : nvme_rdma_poll_group_get_qpair_by_id(group,
+ wc[i].qp_num);
+ }
+ assert(rqpair);
+ assert(rqpair->current_num_sends > 0);
+ rqpair->current_num_sends--;
+ nvme_rdma_conditional_fail_qpair(rqpair, group);
+ SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n",
+ rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
+ completion_rc = -ENXIO;
+ continue;
+ }
+
+ rqpair = nvme_rdma_qpair(rdma_req->req->qpair);
+ rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED;
+ rqpair->current_num_sends--;
+
+ if ((rdma_req->completion_flags & NVME_RDMA_RECV_COMPLETED) != 0) {
+ if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) {
+ SPDK_ERRLOG("Unable to re-post rx descriptor\n");
+ nvme_rdma_conditional_fail_qpair(rqpair, group);
+ completion_rc = -ENXIO;
+ continue;
+ }
+ reaped++;
+ rqpair->num_completions++;
+ }
+ break;
+
+ default:
+ SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", rdma_wr->type);
+ return -ECANCELED;
+ }
+ }
+
+ if (completion_rc) {
+ return completion_rc;
+ }
+
+ return reaped;
+}
+
+static void
+dummy_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
+{
+
+}
+
+static int
+nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair,
+ uint32_t max_completions)
+{
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+ int rc = 0, batch_size;
+ struct ibv_cq *cq;
+ struct nvme_rdma_ctrlr *rctrlr;
+
+ /*
+ * This is used during the connection phase. It's possible that we are still reaping error completions
+ * from other qpairs so we need to call the poll group function. Also, it's more correct since the cq
+ * is shared.
+ */
+ if (qpair->poll_group != NULL) {
+ return spdk_nvme_poll_group_process_completions(qpair->poll_group->group, max_completions,
+ dummy_disconnected_qpair_cb);
+ }
+
+ if (max_completions == 0) {
+ max_completions = rqpair->num_entries;
+ } else {
+ max_completions = spdk_min(max_completions, rqpair->num_entries);
+ }
+
+ if (nvme_qpair_is_admin_queue(&rqpair->qpair)) {
+ rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
+ nvme_rdma_poll_events(rctrlr);
+ }
+ nvme_rdma_qpair_process_cm_event(rqpair);
+
+ if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) {
+ nvme_rdma_fail_qpair(qpair, 0);
+ return -ENXIO;
+ }
+
+ cq = rqpair->cq;
+
+ rqpair->num_completions = 0;
+ do {
+ batch_size = spdk_min((max_completions - rqpair->num_completions), MAX_COMPLETIONS_PER_POLL);
+ rc = nvme_rdma_cq_process_completions(cq, batch_size, NULL, rqpair);
+
+ if (rc == 0) {
+ break;
+ /* Handle the case where we fail to poll the cq. */
+ } else if (rc == -ECANCELED) {
+ nvme_rdma_fail_qpair(qpair, 0);
+ return -ENXIO;
+ } else if (rc == -ENXIO) {
+ return rc;
+ }
+ } while (rqpair->num_completions < max_completions);
+
+ if (spdk_unlikely(nvme_rdma_qpair_submit_sends(rqpair) ||
+ nvme_rdma_qpair_submit_recvs(rqpair))) {
+ nvme_rdma_fail_qpair(qpair, 0);
+ return -ENXIO;
+ }
+
+ if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) {
+ nvme_rdma_qpair_check_timeout(qpair);
+ }
+
+ return rqpair->num_completions;
+}
+
+static uint32_t
+nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
+{
+ /* max_mr_size by ibv_query_device indicates the largest value that we can
+ * set for a registered memory region. It is independent from the actual
+ * I/O size and is very likely to be larger than 2 MiB, which is the
+ * granularity at which we currently register memory regions. Hence return
+ * UINT32_MAX here and let the generic layer use the controller data to
+ * moderate this value.
+ */
+ return UINT32_MAX;
+}
+
+static uint16_t
+nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr);
+
+ return rctrlr->max_sge;
+}
+
+static int
+nvme_rdma_qpair_iterate_requests(struct spdk_nvme_qpair *qpair,
+ int (*iter_fn)(struct nvme_request *req, void *arg),
+ void *arg)
+{
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+ struct spdk_nvme_rdma_req *rdma_req, *tmp;
+ int rc;
+
+ assert(iter_fn != NULL);
+
+ TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
+ assert(rdma_req->req != NULL);
+
+ rc = iter_fn(rdma_req->req, arg);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static void
+nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_rdma_req *rdma_req, *tmp;
+ struct spdk_nvme_cpl cpl;
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+
+ cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
+ cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+
+ TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
+ assert(rdma_req->req != NULL);
+
+ if (rdma_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
+ continue;
+ }
+
+ nvme_rdma_req_complete(rdma_req, &cpl);
+ nvme_rdma_req_put(rqpair, rdma_req);
+ }
+}
+
+static int
+nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx)
+{
+ struct nvme_rdma_poller *poller;
+
+ poller = calloc(1, sizeof(*poller));
+ if (poller == NULL) {
+ SPDK_ERRLOG("Unable to allocate poller.\n");
+ return -ENOMEM;
+ }
+
+ poller->device = ctx;
+ poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0);
+
+ if (poller->cq == NULL) {
+ free(poller);
+ return -EINVAL;
+ }
+
+ STAILQ_INSERT_HEAD(&group->pollers, poller, link);
+ group->num_pollers++;
+ poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE;
+ poller->required_num_wc = 0;
+ return 0;
+}
+
+static void
+nvme_rdma_poll_group_free_pollers(struct nvme_rdma_poll_group *group)
+{
+ struct nvme_rdma_poller *poller, *tmp_poller;
+
+ STAILQ_FOREACH_SAFE(poller, &group->pollers, link, tmp_poller) {
+ if (poller->cq) {
+ ibv_destroy_cq(poller->cq);
+ }
+ STAILQ_REMOVE(&group->pollers, poller, nvme_rdma_poller, link);
+ free(poller);
+ }
+}
+
+static struct spdk_nvme_transport_poll_group *
+nvme_rdma_poll_group_create(void)
+{
+ struct nvme_rdma_poll_group *group;
+ struct ibv_context **contexts;
+ int i = 0;
+
+ group = calloc(1, sizeof(*group));
+ if (group == NULL) {
+ SPDK_ERRLOG("Unable to allocate poll group.\n");
+ return NULL;
+ }
+
+ STAILQ_INIT(&group->pollers);
+
+ contexts = rdma_get_devices(NULL);
+ if (contexts == NULL) {
+ SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
+ free(group);
+ return NULL;
+ }
+
+ while (contexts[i] != NULL) {
+ if (nvme_rdma_poller_create(group, contexts[i])) {
+ nvme_rdma_poll_group_free_pollers(group);
+ free(group);
+ rdma_free_devices(contexts);
+ return NULL;
+ }
+ i++;
+ }
+
+ rdma_free_devices(contexts);
+ STAILQ_INIT(&group->destroyed_qpairs);
+ return &group->group;
+}
+
+struct nvme_rdma_qpair *
+nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, uint32_t qp_num)
+{
+ struct spdk_nvme_qpair *qpair;
+ struct nvme_rdma_destroyed_qpair *rqpair_tracker;
+ struct nvme_rdma_qpair *rqpair;
+
+ STAILQ_FOREACH(qpair, &group->group.disconnected_qpairs, poll_group_stailq) {
+ rqpair = nvme_rdma_qpair(qpair);
+ if (rqpair->rdma_qp->qp->qp_num == qp_num) {
+ return rqpair;
+ }
+ }
+
+ STAILQ_FOREACH(qpair, &group->group.connected_qpairs, poll_group_stailq) {
+ rqpair = nvme_rdma_qpair(qpair);
+ if (rqpair->rdma_qp->qp->qp_num == qp_num) {
+ return rqpair;
+ }
+ }
+
+ STAILQ_FOREACH(rqpair_tracker, &group->destroyed_qpairs, link) {
+ rqpair = rqpair_tracker->destroyed_qpair_tracker;
+ if (rqpair->rdma_qp->qp->qp_num == qp_num) {
+ return rqpair;
+ }
+ }
+
+ return NULL;
+}
+
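+/*
+ * Each poller's CQ is shared by all qpairs on the same device, so grow it
+ * (at least doubling) whenever a new qpair would exceed its current capacity.
+ */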
+static int
+nvme_rdma_resize_cq(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poller *poller)
+{
+ int current_num_wc, required_num_wc;
+
+ required_num_wc = poller->required_num_wc + WC_PER_QPAIR(rqpair->num_entries);
+ current_num_wc = poller->current_num_wc;
+ if (current_num_wc < required_num_wc) {
+ current_num_wc = spdk_max(current_num_wc * 2, required_num_wc);
+ }
+
+ if (poller->current_num_wc != current_num_wc) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Resize RDMA CQ from %d to %d\n", poller->current_num_wc,
+ current_num_wc);
+ if (ibv_resize_cq(poller->cq, current_num_wc)) {
+ SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno));
+ return -1;
+ }
+
+ poller->current_num_wc = current_num_wc;
+ }
+
+ poller->required_num_wc = required_num_wc;
+ return 0;
+}
+
+static int
+nvme_rdma_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+ struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(qpair->poll_group);
+ struct nvme_rdma_poller *poller;
+
+ assert(rqpair->cq == NULL);
+
+ STAILQ_FOREACH(poller, &group->pollers, link) {
+ if (poller->device == rqpair->cm_id->verbs) {
+ if (nvme_rdma_resize_cq(rqpair, poller)) {
+ return -EPROTO;
+ }
+ rqpair->cq = poller->cq;
+ break;
+ }
+ }
+
+ if (rqpair->cq == NULL) {
+ SPDK_ERRLOG("Unable to find a cq for qpair %p on poll group %p\n", qpair, qpair->poll_group);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+nvme_rdma_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+ struct nvme_rdma_poll_group *group;
+ struct nvme_rdma_destroyed_qpair *destroyed_qpair;
+ enum nvme_qpair_state state;
+
+ if (rqpair->poll_group_disconnect_in_progress) {
+ return -EINPROGRESS;
+ }
+
+ rqpair->poll_group_disconnect_in_progress = true;
+ state = nvme_qpair_get_state(qpair);
+ group = nvme_rdma_poll_group(qpair->poll_group);
+ rqpair->cq = NULL;
+
+ /*
+ * We want to guard against an endless recursive loop while making
+ * sure the qpair is disconnected before we remove it from the poll group.
+ */
+ if (state > NVME_QPAIR_DISCONNECTING && state != NVME_QPAIR_DESTROYING) {
+ nvme_ctrlr_disconnect_qpair(qpair);
+ }
+
+ /*
+ * If this fails, the system is in serious trouble,
+ * just let the qpair get cleaned up immediately.
+ */
+ destroyed_qpair = calloc(1, sizeof(*destroyed_qpair));
+ if (destroyed_qpair == NULL) {
+ return 0;
+ }
+
+ destroyed_qpair->destroyed_qpair_tracker = rqpair;
+ destroyed_qpair->completed_cycles = 0;
+ STAILQ_INSERT_TAIL(&group->destroyed_qpairs, destroyed_qpair, link);
+
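+ /* Defer freeing the qpair to the poll group so that completions still in
+ * flight on the shared CQ can be matched back to it. */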
+ rqpair->defer_deletion_to_pg = true;
+
+ rqpair->poll_group_disconnect_in_progress = false;
+ return 0;
+}
+
+static int
+nvme_rdma_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair)
+{
+ return 0;
+}
+
+static int
+nvme_rdma_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair)
+{
+ if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) {
+ return nvme_poll_group_disconnect_qpair(qpair);
+ }
+
+ return 0;
+}
+
+static void
+nvme_rdma_poll_group_delete_qpair(struct nvme_rdma_poll_group *group,
+ struct nvme_rdma_destroyed_qpair *qpair_tracker)
+{
+ struct nvme_rdma_qpair *rqpair = qpair_tracker->destroyed_qpair_tracker;
+
+ rqpair->defer_deletion_to_pg = false;
+ if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) {
+ nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair);
+ }
+ STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link);
+ free(qpair_tracker);
+}
+
+static int64_t
+nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
+ uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
+{
+ struct spdk_nvme_qpair *qpair, *tmp_qpair;
+ struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker;
+ struct nvme_rdma_qpair *rqpair;
+ struct nvme_rdma_poll_group *group;
+ struct nvme_rdma_poller *poller;
+ int num_qpairs = 0, batch_size, rc;
+ int64_t total_completions = 0;
+ uint64_t completions_allowed = 0;
+ uint64_t completions_per_poller = 0;
+ uint64_t poller_completions = 0;
+
+
+ if (completions_per_qpair == 0) {
+ completions_per_qpair = MAX_COMPLETIONS_PER_POLL;
+ }
+
+ group = nvme_rdma_poll_group(tgroup);
+ STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
+ disconnected_qpair_cb(qpair, tgroup->group->ctx);
+ }
+
+ STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
+ rqpair = nvme_rdma_qpair(qpair);
+ rqpair->num_completions = 0;
+ nvme_rdma_qpair_process_cm_event(rqpair);
+
+ if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) {
+ nvme_rdma_fail_qpair(qpair, 0);
+ disconnected_qpair_cb(qpair, tgroup->group->ctx);
+ continue;
+ }
+ num_qpairs++;
+ }
+
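+ /* Spread the total completion budget evenly across the pollers, always
+ * allowing each poller to reap at least one completion. */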
+ completions_allowed = completions_per_qpair * num_qpairs;
+ completions_per_poller = spdk_max(completions_allowed / group->num_pollers, 1);
+
+ STAILQ_FOREACH(poller, &group->pollers, link) {
+ poller_completions = 0;
+ do {
+ batch_size = spdk_min((completions_per_poller - poller_completions), MAX_COMPLETIONS_PER_POLL);
+ rc = nvme_rdma_cq_process_completions(poller->cq, batch_size, group, NULL);
+ if (rc <= 0) {
+ if (rc == -ECANCELED) {
+ return -EIO;
+ }
+ break;
+ }
+
+ poller_completions += rc;
+ } while (poller_completions < completions_per_poller);
+ total_completions += poller_completions;
+ }
+
+ STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) {
+ rqpair = nvme_rdma_qpair(qpair);
+ if (spdk_unlikely(qpair->ctrlr->timeout_enabled)) {
+ nvme_rdma_qpair_check_timeout(qpair);
+ }
+
+ nvme_rdma_qpair_submit_sends(rqpair);
+ nvme_rdma_qpair_submit_recvs(rqpair);
+ nvme_qpair_resubmit_requests(&rqpair->qpair, rqpair->num_completions);
+ }
+
+ /*
+ * Once a qpair is disconnected, we can still get flushed completions for those disconnected qpairs.
+ * For most pieces of hardware, those requests will complete immediately. However, there are certain
+ * cases where flushed requests will linger. The default is to destroy the qpair once all of its
+ * completions have been reaped, but fall back to a cycle count for cases where we never get all of
+ * our completions back.
+ */
+ STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) {
+ qpair_tracker->completed_cycles++;
+ rqpair = qpair_tracker->destroyed_qpair_tracker;
+ if ((rqpair->current_num_sends == 0 && rqpair->current_num_recvs == 0) ||
+ qpair_tracker->completed_cycles > NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES) {
+ nvme_rdma_poll_group_delete_qpair(group, qpair_tracker);
+ }
+ }
+
+ return total_completions;
+}
+
+static int
+nvme_rdma_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
+{
+ struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(tgroup);
+ struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker;
+ struct nvme_rdma_qpair *rqpair;
+
+ if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
+ return -EBUSY;
+ }
+
+ STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) {
+ rqpair = qpair_tracker->destroyed_qpair_tracker;
+ if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) {
+ rqpair->defer_deletion_to_pg = false;
+ nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair);
+ }
+
+ STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link);
+ free(qpair_tracker);
+ }
+
+ nvme_rdma_poll_group_free_pollers(group);
+ free(group);
+
+ return 0;
+}
+
+void
+spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks)
+{
+ g_nvme_hooks = *hooks;
+}
+
+const struct spdk_nvme_transport_ops rdma_ops = {
+ .name = "RDMA",
+ .type = SPDK_NVME_TRANSPORT_RDMA,
+ .ctrlr_construct = nvme_rdma_ctrlr_construct,
+ .ctrlr_scan = nvme_fabric_ctrlr_scan,
+ .ctrlr_destruct = nvme_rdma_ctrlr_destruct,
+ .ctrlr_enable = nvme_rdma_ctrlr_enable,
+
+ .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4,
+ .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8,
+ .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4,
+ .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8,
+
+ .ctrlr_get_max_xfer_size = nvme_rdma_ctrlr_get_max_xfer_size,
+ .ctrlr_get_max_sges = nvme_rdma_ctrlr_get_max_sges,
+
+ .ctrlr_create_io_qpair = nvme_rdma_ctrlr_create_io_qpair,
+ .ctrlr_delete_io_qpair = nvme_rdma_ctrlr_delete_io_qpair,
+ .ctrlr_connect_qpair = nvme_rdma_ctrlr_connect_qpair,
+ .ctrlr_disconnect_qpair = nvme_rdma_ctrlr_disconnect_qpair,
+
+ .qpair_abort_reqs = nvme_rdma_qpair_abort_reqs,
+ .qpair_reset = nvme_rdma_qpair_reset,
+ .qpair_submit_request = nvme_rdma_qpair_submit_request,
+ .qpair_process_completions = nvme_rdma_qpair_process_completions,
+ .qpair_iterate_requests = nvme_rdma_qpair_iterate_requests,
+ .admin_qpair_abort_aers = nvme_rdma_admin_qpair_abort_aers,
+
+ .poll_group_create = nvme_rdma_poll_group_create,
+ .poll_group_connect_qpair = nvme_rdma_poll_group_connect_qpair,
+ .poll_group_disconnect_qpair = nvme_rdma_poll_group_disconnect_qpair,
+ .poll_group_add = nvme_rdma_poll_group_add,
+ .poll_group_remove = nvme_rdma_poll_group_remove,
+ .poll_group_process_completions = nvme_rdma_poll_group_process_completions,
+ .poll_group_destroy = nvme_rdma_poll_group_destroy,
+
+};
+
+SPDK_NVME_TRANSPORT_REGISTER(rdma, &rdma_ops);
diff --git a/src/spdk/lib/nvme/nvme_tcp.c b/src/spdk/lib/nvme/nvme_tcp.c
new file mode 100644
index 000000000..98e8c6827
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_tcp.c
@@ -0,0 +1,1973 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe/TCP transport
+ */
+
+#include "nvme_internal.h"
+
+#include "spdk/endian.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/stdinc.h"
+#include "spdk/crc32.h"
+#include "spdk/endian.h"
+#include "spdk/assert.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/trace.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/nvme_tcp.h"
+
+#define NVME_TCP_RW_BUFFER_SIZE 131072
+#define NVME_TCP_TIME_OUT_IN_SECONDS 2
+
+#define NVME_TCP_HPDA_DEFAULT 0
+#define NVME_TCP_MAX_R2T_DEFAULT 1
+#define NVME_TCP_PDU_H2C_MIN_DATA_SIZE 4096
+#define NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE 8192
+
+/* NVMe TCP transport extensions for spdk_nvme_ctrlr */
+struct nvme_tcp_ctrlr {
+ struct spdk_nvme_ctrlr ctrlr;
+};
+
+struct nvme_tcp_poll_group {
+ struct spdk_nvme_transport_poll_group group;
+ struct spdk_sock_group *sock_group;
+ uint32_t completions_per_qpair;
+ int64_t num_completions;
+};
+
+/* NVMe TCP qpair extensions for spdk_nvme_qpair */
+struct nvme_tcp_qpair {
+ struct spdk_nvme_qpair qpair;
+ struct spdk_sock *sock;
+
+ TAILQ_HEAD(, nvme_tcp_req) free_reqs;
+ TAILQ_HEAD(, nvme_tcp_req) outstanding_reqs;
+
+ TAILQ_HEAD(, nvme_tcp_pdu) send_queue;
+ struct nvme_tcp_pdu recv_pdu;
+ struct nvme_tcp_pdu send_pdu; /* only for error pdu and init pdu */
+ struct nvme_tcp_pdu *send_pdus; /* Used by tcp_reqs */
+ enum nvme_tcp_pdu_recv_state recv_state;
+
+ struct nvme_tcp_req *tcp_reqs;
+
+ uint16_t num_entries;
+
+ bool host_hdgst_enable;
+ bool host_ddgst_enable;
+
+ /** Specifies the maximum number of PDU-Data bytes per H2C Data Transfer PDU */
+ uint32_t maxh2cdata;
+
+ uint32_t maxr2t;
+
+ /* 0 based value, which is used to guide the padding */
+ uint8_t cpda;
+
+ enum nvme_tcp_qpair_state state;
+};
+
+enum nvme_tcp_req_state {
+ NVME_TCP_REQ_FREE,
+ NVME_TCP_REQ_ACTIVE,
+ NVME_TCP_REQ_ACTIVE_R2T,
+};
+
+struct nvme_tcp_req {
+ struct nvme_request *req;
+ enum nvme_tcp_req_state state;
+ uint16_t cid;
+ uint16_t ttag;
+ uint32_t datao;
+ uint32_t r2tl_remain;
+ uint32_t active_r2ts;
+ bool in_capsule_data;
+ /* It is used to track whether the req can be safely freed */
+ struct {
+ uint8_t send_ack : 1;
+ uint8_t data_recv : 1;
+ uint8_t r2t_recv : 1;
+ uint8_t reserved : 5;
+ } ordering;
+ struct nvme_tcp_pdu *send_pdu;
+ struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS];
+ uint32_t iovcnt;
+ struct nvme_tcp_qpair *tqpair;
+ TAILQ_ENTRY(nvme_tcp_req) link;
+};
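+
+/*
+ * Request lifecycle note: a tcp_req is only returned to free_reqs once both
+ * the capsule send has been acknowledged by the socket layer
+ * (ordering.send_ack, set in the send-complete callbacks) and the matching
+ * completion or C2H data has been received (ordering.data_recv).
+ * nvme_tcp_req_put_safe() below performs roughly this check:
+ *
+ *	if (tcp_req->ordering.send_ack && tcp_req->ordering.data_recv) {
+ *		nvme_tcp_req_put(tcp_req->tqpair, tcp_req);
+ *	}
+ *
+ * ordering.r2t_recv records an R2T that arrived before the send ack, so the
+ * H2C data transfer can then be started from the send-complete callback.
+ */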
+
+static void nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req);
+
+static inline struct nvme_tcp_qpair *
+nvme_tcp_qpair(struct spdk_nvme_qpair *qpair)
+{
+ assert(qpair->trtype == SPDK_NVME_TRANSPORT_TCP);
+ return SPDK_CONTAINEROF(qpair, struct nvme_tcp_qpair, qpair);
+}
+
+static inline struct nvme_tcp_poll_group *
+nvme_tcp_poll_group(struct spdk_nvme_transport_poll_group *group)
+{
+ return SPDK_CONTAINEROF(group, struct nvme_tcp_poll_group, group);
+}
+
+static inline struct nvme_tcp_ctrlr *
+nvme_tcp_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
+{
+ assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP);
+ return SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr);
+}
+
+static struct nvme_tcp_req *
+nvme_tcp_req_get(struct nvme_tcp_qpair *tqpair)
+{
+ struct nvme_tcp_req *tcp_req;
+
+ tcp_req = TAILQ_FIRST(&tqpair->free_reqs);
+ if (!tcp_req) {
+ return NULL;
+ }
+
+ assert(tcp_req->state == NVME_TCP_REQ_FREE);
+ tcp_req->state = NVME_TCP_REQ_ACTIVE;
+ TAILQ_REMOVE(&tqpair->free_reqs, tcp_req, link);
+ tcp_req->datao = 0;
+ tcp_req->req = NULL;
+ tcp_req->in_capsule_data = false;
+ tcp_req->r2tl_remain = 0;
+ tcp_req->active_r2ts = 0;
+ tcp_req->iovcnt = 0;
+ tcp_req->ordering.send_ack = 0;
+ tcp_req->ordering.data_recv = 0;
+ tcp_req->ordering.r2t_recv = 0;
+ memset(tcp_req->send_pdu, 0, sizeof(struct nvme_tcp_pdu));
+ TAILQ_INSERT_TAIL(&tqpair->outstanding_reqs, tcp_req, link);
+
+ return tcp_req;
+}
+
+static void
+nvme_tcp_req_put(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req)
+{
+ assert(tcp_req->state != NVME_TCP_REQ_FREE);
+ tcp_req->state = NVME_TCP_REQ_FREE;
+ TAILQ_INSERT_HEAD(&tqpair->free_reqs, tcp_req, link);
+}
+
+static int
+nvme_tcp_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
+{
+ struct addrinfo *res;
+ struct addrinfo hints;
+ int ret;
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = family;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_protocol = 0;
+
+ ret = getaddrinfo(addr, service, &hints, &res);
+ if (ret) {
+ SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret);
+ return ret;
+ }
+
+ if (res->ai_addrlen > sizeof(*sa)) {
+ SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen);
+ ret = EINVAL;
+ } else {
+ memcpy(sa, res->ai_addr, res->ai_addrlen);
+ }
+
+ freeaddrinfo(res);
+ return ret;
+}
+
+static void
+nvme_tcp_free_reqs(struct nvme_tcp_qpair *tqpair)
+{
+ free(tqpair->tcp_reqs);
+ tqpair->tcp_reqs = NULL;
+
+ spdk_free(tqpair->send_pdus);
+ tqpair->send_pdus = NULL;
+}
+
+static int
+nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair)
+{
+ uint16_t i;
+ struct nvme_tcp_req *tcp_req;
+
+ tqpair->tcp_reqs = calloc(tqpair->num_entries, sizeof(struct nvme_tcp_req));
+ if (tqpair->tcp_reqs == NULL) {
+ SPDK_ERRLOG("Failed to allocate tcp_reqs on tqpair=%p\n", tqpair);
+ goto fail;
+ }
+
+ tqpair->send_pdus = spdk_zmalloc(tqpair->num_entries * sizeof(struct nvme_tcp_pdu),
+ 0x1000, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+
+ if (tqpair->send_pdus == NULL) {
+ SPDK_ERRLOG("Failed to allocate send_pdus on tqpair=%p\n", tqpair);
+ goto fail;
+ }
+
+ TAILQ_INIT(&tqpair->send_queue);
+ TAILQ_INIT(&tqpair->free_reqs);
+ TAILQ_INIT(&tqpair->outstanding_reqs);
+ for (i = 0; i < tqpair->num_entries; i++) {
+ tcp_req = &tqpair->tcp_reqs[i];
+ tcp_req->cid = i;
+ tcp_req->tqpair = tqpair;
+ tcp_req->send_pdu = &tqpair->send_pdus[i];
+ TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link);
+ }
+
+ return 0;
+fail:
+ nvme_tcp_free_reqs(tqpair);
+ return -ENOMEM;
+}
+
+static void
+nvme_tcp_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+ struct nvme_tcp_pdu *pdu;
+
+ spdk_sock_close(&tqpair->sock);
+
+ /* clear the send_queue */
+ while (!TAILQ_EMPTY(&tqpair->send_queue)) {
+ pdu = TAILQ_FIRST(&tqpair->send_queue);
+		/* Remove the pdu from the send_queue so that it cannot be sent out
+		 * by mistake if the connection is re-established later.
+		 */
+ TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
+ }
+}
+
+static void nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr);
+
+static int
+nvme_tcp_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_tcp_qpair *tqpair;
+
+ if (!qpair) {
+ return -1;
+ }
+
+ nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
+ nvme_tcp_qpair_abort_reqs(qpair, 1);
+ nvme_qpair_deinit(qpair);
+ tqpair = nvme_tcp_qpair(qpair);
+ nvme_tcp_free_reqs(tqpair);
+ free(tqpair);
+
+ return 0;
+}
+
+static int
+nvme_tcp_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return 0;
+}
+
+static int
+nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_tcp_ctrlr *tctrlr = nvme_tcp_ctrlr(ctrlr);
+
+ if (ctrlr->adminq) {
+ nvme_tcp_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq);
+ }
+
+ nvme_ctrlr_destruct_finish(ctrlr);
+
+ free(tctrlr);
+
+ return 0;
+}
+
+static void
+_pdu_write_done(void *cb_arg, int err)
+{
+ struct nvme_tcp_pdu *pdu = cb_arg;
+ struct nvme_tcp_qpair *tqpair = pdu->qpair;
+
+ TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
+
+ if (err != 0) {
+ nvme_transport_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair);
+ return;
+ }
+
+ assert(pdu->cb_fn != NULL);
+ pdu->cb_fn(pdu->cb_arg);
+}
+
+static int
+nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu,
+ nvme_tcp_qpair_xfer_complete_cb cb_fn,
+ void *cb_arg)
+{
+ int hlen;
+ uint32_t crc32c;
+ uint32_t mapped_length = 0;
+
+ hlen = pdu->hdr.common.hlen;
+
+ /* Header Digest */
+ if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->host_hdgst_enable) {
+ crc32c = nvme_tcp_pdu_calc_header_digest(pdu);
+ MAKE_DIGEST_WORD((uint8_t *)pdu->hdr.raw + hlen, crc32c);
+ }
+
+ /* Data Digest */
+ if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && tqpair->host_ddgst_enable) {
+ crc32c = nvme_tcp_pdu_calc_data_digest(pdu);
+ MAKE_DIGEST_WORD(pdu->data_digest, crc32c);
+ }
+
+ pdu->cb_fn = cb_fn;
+ pdu->cb_arg = cb_arg;
+
+ pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu,
+ tqpair->host_hdgst_enable, tqpair->host_ddgst_enable,
+ &mapped_length);
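+	/*
+	 * nvme_tcp_build_iovs() builds an iovec covering the PDU as it is laid
+	 * out on the wire:
+	 *
+	 *     [ header (hlen) ][ header digest ][ padding ][ data ][ data digest ]
+	 *
+	 * with each digest (SPDK_NVME_TCP_DIGEST_LEN bytes) present only when
+	 * the corresponding host_hdgst_enable/host_ddgst_enable flag,
+	 * negotiated at IC time, is set.
+	 */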
+ pdu->qpair = tqpair;
+ pdu->sock_req.cb_fn = _pdu_write_done;
+ pdu->sock_req.cb_arg = pdu;
+ TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);
+ spdk_sock_writev_async(tqpair->sock, &pdu->sock_req);
+
+ return 0;
+}
+
+/*
+ * Build SGL describing contiguous payload buffer.
+ */
+static int
+nvme_tcp_build_contig_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req)
+{
+ struct nvme_request *req = tcp_req->req;
+
+ tcp_req->iov[0].iov_base = req->payload.contig_or_cb_arg + req->payload_offset;
+ tcp_req->iov[0].iov_len = req->payload_size;
+ tcp_req->iovcnt = 1;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
+
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
+
+ return 0;
+}
+
+/*
+ * Build SGL describing scattered payload buffer.
+ */
+static int
+nvme_tcp_build_sgl_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req)
+{
+ int rc;
+ uint32_t length, remaining_size, iovcnt = 0, max_num_sgl;
+ struct nvme_request *req = tcp_req->req;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
+
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ assert(req->payload.next_sge_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ max_num_sgl = spdk_min(req->qpair->ctrlr->max_sges, NVME_TCP_MAX_SGL_DESCRIPTORS);
+ remaining_size = req->payload_size;
+
+ do {
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &tcp_req->iov[iovcnt].iov_base,
+ &length);
+ if (rc) {
+ return -1;
+ }
+
+ length = spdk_min(length, remaining_size);
+ tcp_req->iov[iovcnt].iov_len = length;
+ remaining_size -= length;
+ iovcnt++;
+ } while (remaining_size > 0 && iovcnt < max_num_sgl);
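+	/*
+	 * Example: a 16 KiB payload that next_sge_fn returns in 4 KiB segments
+	 * leaves this loop with iovcnt == 4 and remaining_size == 0; a payload
+	 * needing more than max_num_sgl segments exits with remaining_size > 0
+	 * and is rejected by the check below.
+	 */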
+
+
+ /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */
+ if (remaining_size > 0) {
+ SPDK_ERRLOG("Failed to construct tcp_req=%p, and the iovcnt=%u, remaining_size=%u\n",
+ tcp_req, iovcnt, remaining_size);
+ return -1;
+ }
+
+ tcp_req->iovcnt = iovcnt;
+
+ return 0;
+}
+
+static int
+nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req,
+ struct nvme_tcp_req *tcp_req)
+{
+ struct spdk_nvme_ctrlr *ctrlr = tqpair->qpair.ctrlr;
+ int rc = 0;
+ enum spdk_nvme_data_transfer xfer;
+ uint32_t max_incapsule_data_size;
+
+ tcp_req->req = req;
+ req->cmd.cid = tcp_req->cid;
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK;
+ req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_TRANSPORT;
+ req->cmd.dptr.sgl1.unkeyed.length = req->payload_size;
+
+ if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
+ rc = nvme_tcp_build_contig_request(tqpair, tcp_req);
+ } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
+ rc = nvme_tcp_build_sgl_request(tqpair, tcp_req);
+ } else {
+ rc = -1;
+ }
+
+ if (rc) {
+ return rc;
+ }
+
+ if (req->cmd.opc == SPDK_NVME_OPC_FABRIC) {
+ struct spdk_nvmf_capsule_cmd *nvmf_cmd = (struct spdk_nvmf_capsule_cmd *)&req->cmd;
+
+ xfer = spdk_nvme_opc_get_data_transfer(nvmf_cmd->fctype);
+ } else {
+ xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc);
+ }
+ if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+ max_incapsule_data_size = ctrlr->ioccsz_bytes;
+ if ((req->cmd.opc == SPDK_NVME_OPC_FABRIC) || nvme_qpair_is_admin_queue(&tqpair->qpair)) {
+ max_incapsule_data_size = spdk_min(max_incapsule_data_size, NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE);
+ }
+
+ if (req->payload_size <= max_incapsule_data_size) {
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
+ req->cmd.dptr.sgl1.address = 0;
+ tcp_req->in_capsule_data = true;
+ }
+ }
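+	/*
+	 * Example: a 4 KiB host-to-controller transfer on an I/O queue is sent
+	 * in-capsule as long as the controller's ioccsz_bytes allows it, while
+	 * fabrics and admin commands are additionally capped at
+	 * NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE (8 KiB).  Anything larger keeps
+	 * the TRANSPORT_DATA_BLOCK SGL set above and is transferred via
+	 * R2T/H2C data PDUs instead.
+	 */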
+
+ return 0;
+}
+
+static inline void
+nvme_tcp_req_put_safe(struct nvme_tcp_req *tcp_req)
+{
+ if (tcp_req->ordering.send_ack && tcp_req->ordering.data_recv) {
+ assert(tcp_req->state == NVME_TCP_REQ_ACTIVE);
+ assert(tcp_req->tqpair != NULL);
+ nvme_tcp_req_put(tcp_req->tqpair, tcp_req);
+ }
+}
+
+static void
+nvme_tcp_qpair_cmd_send_complete(void *cb_arg)
+{
+ struct nvme_tcp_req *tcp_req = cb_arg;
+
+ tcp_req->ordering.send_ack = 1;
+ /* Handle the r2t case */
+ if (spdk_unlikely(tcp_req->ordering.r2t_recv)) {
+ nvme_tcp_send_h2c_data(tcp_req);
+ } else {
+ nvme_tcp_req_put_safe(tcp_req);
+ }
+}
+
+static int
+nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair,
+ struct nvme_tcp_req *tcp_req)
+{
+ struct nvme_tcp_pdu *pdu;
+ struct spdk_nvme_tcp_cmd *capsule_cmd;
+ uint32_t plen = 0, alignment;
+ uint8_t pdo;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
+ pdu = tcp_req->send_pdu;
+
+ capsule_cmd = &pdu->hdr.capsule_cmd;
+ capsule_cmd->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD;
+ plen = capsule_cmd->common.hlen = sizeof(*capsule_cmd);
+ capsule_cmd->ccsqe = tcp_req->req->cmd;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "capsule_cmd cid=%u on tqpair(%p)\n", tcp_req->req->cmd.cid, tqpair);
+
+ if (tqpair->host_hdgst_enable) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Header digest is enabled for capsule command on tcp_req=%p\n",
+ tcp_req);
+ capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF;
+ plen += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ if ((tcp_req->req->payload_size == 0) || !tcp_req->in_capsule_data) {
+ goto end;
+ }
+
+ pdo = plen;
+ pdu->padding_len = 0;
+ if (tqpair->cpda) {
+ alignment = (tqpair->cpda + 1) << 2;
+ if (alignment > plen) {
+ pdu->padding_len = alignment - plen;
+ pdo = alignment;
+ plen = alignment;
+ }
+ }
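+	/*
+	 * Example: with cpda = 3 the required alignment is ((3 + 1) << 2) = 16
+	 * bytes; plen at this point (capsule header plus optional header
+	 * digest) already exceeds that, so no padding is added.  With a larger
+	 * value such as cpda = 31 the alignment is 128 bytes, so pdo becomes
+	 * 128 and padding_len = 128 - plen.
+	 */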
+
+ capsule_cmd->common.pdo = pdo;
+ plen += tcp_req->req->payload_size;
+ if (tqpair->host_ddgst_enable) {
+ capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF;
+ plen += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ tcp_req->datao = 0;
+ nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt,
+ 0, tcp_req->req->payload_size);
+end:
+ capsule_cmd->common.plen = plen;
+ return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, tcp_req);
+
+}
+
+static int
+nvme_tcp_qpair_submit_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req)
+{
+ struct nvme_tcp_qpair *tqpair;
+ struct nvme_tcp_req *tcp_req;
+
+ tqpair = nvme_tcp_qpair(qpair);
+ assert(tqpair != NULL);
+ assert(req != NULL);
+
+ tcp_req = nvme_tcp_req_get(tqpair);
+ if (!tcp_req) {
+ /* Inform the upper layer to try again later. */
+ return -EAGAIN;
+ }
+
+ if (nvme_tcp_req_init(tqpair, req, tcp_req)) {
+ SPDK_ERRLOG("nvme_tcp_req_init() failed\n");
+ TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link);
+ nvme_tcp_req_put(tqpair, tcp_req);
+ return -1;
+ }
+
+ return nvme_tcp_qpair_capsule_cmd_send(tqpair, tcp_req);
+}
+
+static int
+nvme_tcp_qpair_reset(struct spdk_nvme_qpair *qpair)
+{
+ return 0;
+}
+
+static void
+nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req,
+ struct spdk_nvme_cpl *rsp)
+{
+ struct nvme_request *req;
+
+ assert(tcp_req->req != NULL);
+ req = tcp_req->req;
+
+ TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link);
+ nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp);
+ nvme_free_request(req);
+}
+
+static void
+nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
+{
+ struct nvme_tcp_req *tcp_req, *tmp;
+ struct spdk_nvme_cpl cpl;
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+
+ cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
+ cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ cpl.status.dnr = dnr;
+
+ TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) {
+ nvme_tcp_req_complete(tcp_req, &cpl);
+ nvme_tcp_req_put(tqpair, tcp_req);
+ }
+}
+
+static void
+nvme_tcp_qpair_set_recv_state(struct nvme_tcp_qpair *tqpair,
+ enum nvme_tcp_pdu_recv_state state)
+{
+ if (tqpair->recv_state == state) {
+		SPDK_ERRLOG("The recv state of tqpair=%p is already set to the requested state(%d)\n",
+			    tqpair, state);
+ return;
+ }
+
+ tqpair->recv_state = state;
+ switch (state) {
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
+ case NVME_TCP_PDU_RECV_STATE_ERROR:
+ memset(&tqpair->recv_pdu, 0, sizeof(struct nvme_tcp_pdu));
+ break;
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
+ default:
+ break;
+ }
+}
+
+static void
+nvme_tcp_qpair_send_h2c_term_req_complete(void *cb_arg)
+{
+ struct nvme_tcp_qpair *tqpair = cb_arg;
+
+ tqpair->state = NVME_TCP_QPAIR_STATE_EXITING;
+}
+
+static void
+nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu,
+ enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset)
+{
+ struct nvme_tcp_pdu *rsp_pdu;
+ struct spdk_nvme_tcp_term_req_hdr *h2c_term_req;
+ uint32_t h2c_term_req_hdr_len = sizeof(*h2c_term_req);
+ uint8_t copy_len;
+
+ rsp_pdu = &tqpair->send_pdu;
+ memset(rsp_pdu, 0, sizeof(*rsp_pdu));
+ h2c_term_req = &rsp_pdu->hdr.term_req;
+ h2c_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
+ h2c_term_req->common.hlen = h2c_term_req_hdr_len;
+
+ if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) ||
+ (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) {
+ DSET32(&h2c_term_req->fei, error_offset);
+ }
+
+ copy_len = pdu->hdr.common.hlen;
+ if (copy_len > SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) {
+ copy_len = SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE;
+ }
+
+ /* Copy the error info into the buffer */
+ memcpy((uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, pdu->hdr.raw, copy_len);
+ nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, copy_len);
+
+	/* The plen also accounts for the copied header of the offending PDU */
+ h2c_term_req->common.plen = h2c_term_req->common.hlen + copy_len;
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+ nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_send_h2c_term_req_complete, NULL);
+
+}
+
+static void
+nvme_tcp_pdu_ch_handle(struct nvme_tcp_qpair *tqpair)
+{
+ struct nvme_tcp_pdu *pdu;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+ uint32_t expected_hlen, hd_len = 0;
+ bool plen_error = false;
+
+ pdu = &tqpair->recv_pdu;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "pdu type = %d\n", pdu->hdr.common.pdu_type);
+ if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP) {
+ if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) {
+			SPDK_ERRLOG("Already received an IC_RESP PDU; rejecting this pdu=%p\n", pdu);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR;
+ goto err;
+ }
+ expected_hlen = sizeof(struct spdk_nvme_tcp_ic_resp);
+ if (pdu->hdr.common.plen != expected_hlen) {
+ plen_error = true;
+ }
+ } else {
+ if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) {
+			SPDK_ERRLOG("The TCP/IP tqpair connection is not negotiated\n");
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR;
+ goto err;
+ }
+
+ switch (pdu->hdr.common.pdu_type) {
+ case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP:
+ expected_hlen = sizeof(struct spdk_nvme_tcp_rsp);
+ if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) {
+ hd_len = SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ if (pdu->hdr.common.plen != (expected_hlen + hd_len)) {
+ plen_error = true;
+ }
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA:
+ expected_hlen = sizeof(struct spdk_nvme_tcp_c2h_data_hdr);
+ if (pdu->hdr.common.plen < pdu->hdr.common.pdo) {
+ plen_error = true;
+ }
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
+ expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr);
+ if ((pdu->hdr.common.plen <= expected_hlen) ||
+ (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) {
+ plen_error = true;
+ }
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_R2T:
+ expected_hlen = sizeof(struct spdk_nvme_tcp_r2t_hdr);
+ if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) {
+ hd_len = SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ if (pdu->hdr.common.plen != (expected_hlen + hd_len)) {
+ plen_error = true;
+ }
+ break;
+
+ default:
+ SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type);
+ goto err;
+ }
+ }
+
+ if (pdu->hdr.common.hlen != expected_hlen) {
+ SPDK_ERRLOG("Expected PDU header length %u, got %u\n",
+ expected_hlen, pdu->hdr.common.hlen);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen);
+ goto err;
+
+ } else if (plen_error) {
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen);
+ goto err;
+ } else {
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH);
+ nvme_tcp_pdu_calc_psh_len(&tqpair->recv_pdu, tqpair->host_hdgst_enable);
+ return;
+ }
+err:
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+}
+
+static struct nvme_tcp_req *
+get_nvme_active_req_by_cid(struct nvme_tcp_qpair *tqpair, uint32_t cid)
+{
+ assert(tqpair != NULL);
+ if ((cid >= tqpair->num_entries) || (tqpair->tcp_reqs[cid].state == NVME_TCP_REQ_FREE)) {
+ return NULL;
+ }
+
+ return &tqpair->tcp_reqs[cid];
+}
+
+static void
+nvme_tcp_c2h_data_payload_handle(struct nvme_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu, uint32_t *reaped)
+{
+ struct nvme_tcp_req *tcp_req;
+ struct spdk_nvme_tcp_c2h_data_hdr *c2h_data;
+ struct spdk_nvme_cpl cpl = {};
+ uint8_t flags;
+
+ tcp_req = pdu->req;
+ assert(tcp_req != NULL);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
+ c2h_data = &pdu->hdr.c2h_data;
+ tcp_req->datao += pdu->data_len;
+ flags = c2h_data->common.flags;
+
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+ if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) {
+ if (tcp_req->datao == tcp_req->req->payload_size) {
+ cpl.status.p = 0;
+ } else {
+ cpl.status.p = 1;
+ }
+
+ cpl.cid = tcp_req->cid;
+ cpl.sqid = tqpair->qpair.id;
+ nvme_tcp_req_complete(tcp_req, &cpl);
+ if (tcp_req->ordering.send_ack) {
+ (*reaped)++;
+ }
+
+ tcp_req->ordering.data_recv = 1;
+ nvme_tcp_req_put_safe(tcp_req);
+ }
+}
+
+static const char *spdk_nvme_tcp_term_req_fes_str[] = {
+ "Invalid PDU Header Field",
+ "PDU Sequence Error",
+ "Header Digest Error",
+ "Data Transfer Out of Range",
+ "Data Transfer Limit Exceeded",
+ "Unsupported parameter",
+};
+
+static void
+nvme_tcp_c2h_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *c2h_term_req)
+{
+ SPDK_ERRLOG("Error info of pdu(%p): %s\n", c2h_term_req,
+ spdk_nvme_tcp_term_req_fes_str[c2h_term_req->fes]);
+ if ((c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) ||
+ (c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "The offset from the start of the PDU header is %u\n",
+ DGET32(c2h_term_req->fei));
+ }
+ /* we may also need to dump some other info here */
+}
+
+static void
+nvme_tcp_c2h_term_req_payload_handle(struct nvme_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ nvme_tcp_c2h_term_req_dump(&pdu->hdr.term_req);
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+}
+
+static void
+nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair,
+ uint32_t *reaped)
+{
+ int rc = 0;
+ struct nvme_tcp_pdu *pdu;
+ uint32_t crc32c, error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
+ pdu = &tqpair->recv_pdu;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
+
+	/* check data digest if needed */
+ if (pdu->ddgst_enable) {
+ crc32c = nvme_tcp_pdu_calc_data_digest(pdu);
+ rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c);
+ if (rc == 0) {
+ SPDK_ERRLOG("data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR;
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+ return;
+ }
+ }
+
+ switch (pdu->hdr.common.pdu_type) {
+ case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA:
+ nvme_tcp_c2h_data_payload_handle(tqpair, pdu, reaped);
+ break;
+
+ case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
+ nvme_tcp_c2h_term_req_payload_handle(tqpair, pdu);
+ break;
+
+ default:
+		/* This should be unreachable; only data-bearing PDU types reach the payload state */
+		SPDK_ERRLOG("Unexpected PDU type 0x%02x in the payload handling state\n", pdu->hdr.common.pdu_type);
+ break;
+ }
+}
+
+static void
+nvme_tcp_send_icreq_complete(void *cb_arg)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Complete the icreq send for tqpair=%p\n",
+ (struct nvme_tcp_qpair *)cb_arg);
+}
+
+static void
+nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvme_tcp_ic_resp *ic_resp = &pdu->hdr.ic_resp;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+ int recv_buf_size;
+
+ /* Only PFV 0 is defined currently */
+ if (ic_resp->pfv != 0) {
+ SPDK_ERRLOG("Expected ICResp PFV %u, got %u\n", 0u, ic_resp->pfv);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, pfv);
+ goto end;
+ }
+
+ if (ic_resp->maxh2cdata < NVME_TCP_PDU_H2C_MIN_DATA_SIZE) {
+ SPDK_ERRLOG("Expected ICResp maxh2cdata >=%u, got %u\n", NVME_TCP_PDU_H2C_MIN_DATA_SIZE,
+ ic_resp->maxh2cdata);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, maxh2cdata);
+ goto end;
+ }
+ tqpair->maxh2cdata = ic_resp->maxh2cdata;
+
+ if (ic_resp->cpda > SPDK_NVME_TCP_CPDA_MAX) {
+ SPDK_ERRLOG("Expected ICResp cpda <=%u, got %u\n", SPDK_NVME_TCP_CPDA_MAX, ic_resp->cpda);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, cpda);
+ goto end;
+ }
+ tqpair->cpda = ic_resp->cpda;
+
+ tqpair->host_hdgst_enable = ic_resp->dgst.bits.hdgst_enable ? true : false;
+ tqpair->host_ddgst_enable = ic_resp->dgst.bits.ddgst_enable ? true : false;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_hdgst_enable: %u\n", tqpair->host_hdgst_enable);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable);
+
+ /* Now that we know whether digests are enabled, properly size the receive buffer to
+ * handle several incoming 4K read commands according to SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR
+ * parameter. */
+ recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_c2h_data_hdr);
+
+ if (tqpair->host_hdgst_enable) {
+ recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ if (tqpair->host_ddgst_enable) {
+ recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR) < 0) {
+ SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n",
+ tqpair,
+ recv_buf_size);
+ /* Not fatal. */
+ }
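+	/*
+	 * Example: with both digests enabled the per-command receive footprint
+	 * is 0x1000 + sizeof(struct spdk_nvme_tcp_c2h_data_hdr) +
+	 * 2 * SPDK_NVME_TCP_DIGEST_LEN bytes, and the socket buffer is sized to
+	 * hold SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR of those back to back.
+	 */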
+
+ tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING;
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+ return;
+end:
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+ return;
+}
+
+static void
+nvme_tcp_capsule_resp_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu,
+ uint32_t *reaped)
+{
+ struct nvme_tcp_req *tcp_req;
+ struct spdk_nvme_tcp_rsp *capsule_resp = &pdu->hdr.capsule_resp;
+ uint32_t cid, error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+ struct spdk_nvme_cpl cpl;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
+ cpl = capsule_resp->rccqe;
+ cid = cpl.cid;
+
+ /* Recv the pdu again */
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+
+ tcp_req = get_nvme_active_req_by_cid(tqpair, cid);
+ if (!tcp_req) {
+ SPDK_ERRLOG("no tcp_req is found with cid=%u for tqpair=%p\n", cid, tqpair);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_rsp, rccqe);
+ goto end;
+
+ }
+
+ nvme_tcp_req_complete(tcp_req, &cpl);
+ if (tcp_req->ordering.send_ack) {
+ (*reaped)++;
+ }
+
+ tcp_req->ordering.data_recv = 1;
+ nvme_tcp_req_put_safe(tcp_req);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "complete tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair);
+
+ return;
+
+end:
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+ return;
+}
+
+static void
+nvme_tcp_c2h_term_req_hdr_handle(struct nvme_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvme_tcp_term_req_hdr *c2h_term_req = &pdu->hdr.term_req;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ if (c2h_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) {
+		SPDK_ERRLOG("Fatal Error Status (FES) is unknown for c2h_term_req pdu=%p\n", pdu);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes);
+ goto end;
+ }
+
+ /* set the data buffer */
+ nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + c2h_term_req->common.hlen,
+ c2h_term_req->common.plen - c2h_term_req->common.hlen);
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
+ return;
+end:
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+ return;
+}
+
+static void
+nvme_tcp_c2h_data_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu)
+{
+ struct nvme_tcp_req *tcp_req;
+ struct spdk_nvme_tcp_c2h_data_hdr *c2h_data = &pdu->hdr.c2h_data;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "c2h_data info on tqpair(%p): datao=%u, datal=%u, cccid=%d\n",
+ tqpair, c2h_data->datao, c2h_data->datal, c2h_data->cccid);
+ tcp_req = get_nvme_active_req_by_cid(tqpair, c2h_data->cccid);
+ if (!tcp_req) {
+ SPDK_ERRLOG("no tcp_req found for c2hdata cid=%d\n", c2h_data->cccid);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, cccid);
+ goto end;
+
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "tcp_req(%p) on tqpair(%p): datao=%u, payload_size=%u\n",
+ tcp_req, tqpair, tcp_req->datao, tcp_req->req->payload_size);
+
+ if (c2h_data->datal > tcp_req->req->payload_size) {
+ SPDK_ERRLOG("Invalid datal for tcp_req(%p), datal(%u) exceeds payload_size(%u)\n",
+ tcp_req, c2h_data->datal, tcp_req->req->payload_size);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
+ goto end;
+ }
+
+ if (tcp_req->datao != c2h_data->datao) {
+		SPDK_ERRLOG("Invalid datao for tcp_req(%p), received datao(%u) != expected datao(%u) in tcp_req\n",
+ tcp_req, c2h_data->datao, tcp_req->datao);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datao);
+ goto end;
+ }
+
+ if ((c2h_data->datao + c2h_data->datal) > tcp_req->req->payload_size) {
+		SPDK_ERRLOG("Invalid data range for tcp_req(%p), received (datao(%u) + datal(%u)) > payload_size(%u) in tcp_req\n",
+ tcp_req, c2h_data->datao, c2h_data->datal, tcp_req->req->payload_size);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
+ error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datal);
+ goto end;
+
+ }
+
+ nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt,
+ c2h_data->datao, c2h_data->datal);
+ pdu->req = tcp_req;
+
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
+ return;
+
+end:
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+ return;
+}
+
+static void
+nvme_tcp_qpair_h2c_data_send_complete(void *cb_arg)
+{
+ struct nvme_tcp_req *tcp_req = cb_arg;
+
+ assert(tcp_req != NULL);
+
+ tcp_req->ordering.send_ack = 1;
+ if (tcp_req->r2tl_remain) {
+ nvme_tcp_send_h2c_data(tcp_req);
+ } else {
+ assert(tcp_req->active_r2ts > 0);
+ tcp_req->active_r2ts--;
+ tcp_req->state = NVME_TCP_REQ_ACTIVE;
+		/* Also try to release the request here, in case its completion has already been received */
+ nvme_tcp_req_put_safe(tcp_req);
+ }
+}
+
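+/*
+ * Send (part of) the data requested by an R2T.  Each call emits at most
+ * maxh2cdata bytes; nvme_tcp_qpair_h2c_data_send_complete() above calls back
+ * into this function until r2tl_remain reaches zero.  For example, an R2T
+ * covering 64 KiB with maxh2cdata = 8192 results in eight H2C_DATA PDUs, with
+ * SPDK_NVME_TCP_H2C_DATA_FLAGS_LAST_PDU set only on the final one.
+ */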
+static void
+nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req)
+{
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(tcp_req->req->qpair);
+ struct nvme_tcp_pdu *rsp_pdu;
+ struct spdk_nvme_tcp_h2c_data_hdr *h2c_data;
+ uint32_t plen, pdo, alignment;
+
+ /* Reinit the send_ack and r2t_recv bits */
+ tcp_req->ordering.send_ack = 0;
+ tcp_req->ordering.r2t_recv = 0;
+ rsp_pdu = tcp_req->send_pdu;
+ memset(rsp_pdu, 0, sizeof(*rsp_pdu));
+ h2c_data = &rsp_pdu->hdr.h2c_data;
+
+ h2c_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_DATA;
+ plen = h2c_data->common.hlen = sizeof(*h2c_data);
+ h2c_data->cccid = tcp_req->cid;
+ h2c_data->ttag = tcp_req->ttag;
+ h2c_data->datao = tcp_req->datao;
+
+ h2c_data->datal = spdk_min(tcp_req->r2tl_remain, tqpair->maxh2cdata);
+ nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->iov, tcp_req->iovcnt,
+ h2c_data->datao, h2c_data->datal);
+ tcp_req->r2tl_remain -= h2c_data->datal;
+
+ if (tqpair->host_hdgst_enable) {
+ h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF;
+ plen += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ rsp_pdu->padding_len = 0;
+ pdo = plen;
+ if (tqpair->cpda) {
+ alignment = (tqpair->cpda + 1) << 2;
+ if (alignment > plen) {
+ rsp_pdu->padding_len = alignment - plen;
+ pdo = plen = alignment;
+ }
+ }
+
+ h2c_data->common.pdo = pdo;
+ plen += h2c_data->datal;
+ if (tqpair->host_ddgst_enable) {
+ h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF;
+ plen += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ h2c_data->common.plen = plen;
+ tcp_req->datao += h2c_data->datal;
+ if (!tcp_req->r2tl_remain) {
+ h2c_data->common.flags |= SPDK_NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "h2c_data info: datao=%u, datal=%u, pdu_len=%u for tqpair=%p\n",
+ h2c_data->datao, h2c_data->datal, h2c_data->common.plen, tqpair);
+
+ nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_h2c_data_send_complete, tcp_req);
+}
+
+static void
+nvme_tcp_r2t_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu)
+{
+ struct nvme_tcp_req *tcp_req;
+ struct spdk_nvme_tcp_r2t_hdr *r2t = &pdu->hdr.r2t;
+ uint32_t cid, error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n");
+ cid = r2t->cccid;
+ tcp_req = get_nvme_active_req_by_cid(tqpair, cid);
+ if (!tcp_req) {
+ SPDK_ERRLOG("Cannot find tcp_req for tqpair=%p\n", tqpair);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, cccid);
+ goto end;
+ }
+
+ tcp_req->ordering.r2t_recv = 1;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "r2t info: r2to=%u, r2tl=%u for tqpair=%p\n", r2t->r2to, r2t->r2tl,
+ tqpair);
+
+ if (tcp_req->state == NVME_TCP_REQ_ACTIVE) {
+ assert(tcp_req->active_r2ts == 0);
+ tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T;
+ }
+
+ tcp_req->active_r2ts++;
+ if (tcp_req->active_r2ts > tqpair->maxr2t) {
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_R2T_LIMIT_EXCEEDED;
+		SPDK_ERRLOG("Invalid R2T: it exceeds the maximum number of R2Ts=%u for tqpair=%p\n", tqpair->maxr2t, tqpair);
+ goto end;
+ }
+
+ if (tcp_req->datao != r2t->r2to) {
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2to);
+ goto end;
+
+ }
+
+ if ((r2t->r2tl + r2t->r2to) > tcp_req->req->payload_size) {
+ SPDK_ERRLOG("Invalid R2T info for tcp_req=%p: (r2to(%u) + r2tl(%u)) exceeds payload_size(%u)\n",
+			    tcp_req, r2t->r2to, r2t->r2tl, tcp_req->req->payload_size);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
+ error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2tl);
+ goto end;
+
+ }
+
+ tcp_req->ttag = r2t->ttag;
+ tcp_req->r2tl_remain = r2t->r2tl;
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+
+ if (spdk_likely(tcp_req->ordering.send_ack)) {
+ nvme_tcp_send_h2c_data(tcp_req);
+ }
+ return;
+
+end:
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+ return;
+
+}
+
+static void
+nvme_tcp_pdu_psh_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped)
+{
+ struct nvme_tcp_pdu *pdu;
+ int rc;
+ uint32_t crc32c, error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH);
+ pdu = &tqpair->recv_pdu;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter: pdu type =%u\n", pdu->hdr.common.pdu_type);
+ /* check header digest if needed */
+ if (pdu->has_hdgst) {
+ crc32c = nvme_tcp_pdu_calc_header_digest(pdu);
+ rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c);
+ if (rc == 0) {
+ SPDK_ERRLOG("header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR;
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+ return;
+
+ }
+ }
+
+ switch (pdu->hdr.common.pdu_type) {
+ case SPDK_NVME_TCP_PDU_TYPE_IC_RESP:
+ nvme_tcp_icresp_handle(tqpair, pdu);
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP:
+ nvme_tcp_capsule_resp_hdr_handle(tqpair, pdu, reaped);
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA:
+ nvme_tcp_c2h_data_hdr_handle(tqpair, pdu);
+ break;
+
+ case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
+ nvme_tcp_c2h_term_req_hdr_handle(tqpair, pdu);
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_R2T:
+ nvme_tcp_r2t_hdr_handle(tqpair, pdu);
+ break;
+
+ default:
+ SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = 1;
+ nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset);
+ break;
+ }
+
+}
+
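+/*
+ * Receive-side state machine.  A PDU normally progresses
+ *
+ *     AWAIT_PDU_READY -> AWAIT_PDU_CH -> AWAIT_PDU_PSH -> AWAIT_PDU_PAYLOAD
+ *
+ * and then back to AWAIT_PDU_READY once the payload handler has consumed it
+ * (or to ERROR for a C2H_TERM_REQ).  PDUs without a payload, such as
+ * CAPSULE_RESP, skip the payload state because their header handlers reset
+ * the state to AWAIT_PDU_READY directly.
+ */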
+static int
+nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped)
+{
+ int rc = 0;
+ struct nvme_tcp_pdu *pdu;
+ uint32_t data_len;
+ enum nvme_tcp_pdu_recv_state prev_state;
+
+ /* The loop here is to allow for several back-to-back state changes. */
+ do {
+ prev_state = tqpair->recv_state;
+ switch (tqpair->recv_state) {
+ /* If in a new state */
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
+ break;
+ /* common header */
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
+ pdu = &tqpair->recv_pdu;
+ if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) {
+ rc = nvme_tcp_read_data(tqpair->sock,
+ sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes,
+ (uint8_t *)&pdu->hdr.common + pdu->ch_valid_bytes);
+ if (rc < 0) {
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+ break;
+ }
+ pdu->ch_valid_bytes += rc;
+ if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) {
+ return NVME_TCP_PDU_IN_PROGRESS;
+ }
+ }
+
+ /* The command header of this PDU has now been read from the socket. */
+ nvme_tcp_pdu_ch_handle(tqpair);
+ break;
+ /* Wait for the pdu specific header */
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
+ pdu = &tqpair->recv_pdu;
+ rc = nvme_tcp_read_data(tqpair->sock,
+ pdu->psh_len - pdu->psh_valid_bytes,
+ (uint8_t *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
+ if (rc < 0) {
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+ break;
+ }
+
+ pdu->psh_valid_bytes += rc;
+ if (pdu->psh_valid_bytes < pdu->psh_len) {
+ return NVME_TCP_PDU_IN_PROGRESS;
+ }
+
+			/* The entire header (CH, PSH and header digest, if any) of this PDU has now been read from the socket. */
+ nvme_tcp_pdu_psh_handle(tqpair, reaped);
+ break;
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
+ pdu = &tqpair->recv_pdu;
+			/* If no payload length has been set up for this PDU yet, there is nothing to read; try again later. */
+ if (!pdu->data_len) {
+ return NVME_TCP_PDU_IN_PROGRESS;
+ }
+
+ data_len = pdu->data_len;
+ /* data digest */
+ if (spdk_unlikely((pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_DATA) &&
+ tqpair->host_ddgst_enable)) {
+ data_len += SPDK_NVME_TCP_DIGEST_LEN;
+ pdu->ddgst_enable = true;
+ }
+
+ rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
+ if (rc < 0) {
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+ break;
+ }
+
+ pdu->readv_offset += rc;
+ if (pdu->readv_offset < data_len) {
+ return NVME_TCP_PDU_IN_PROGRESS;
+ }
+
+ assert(pdu->readv_offset == data_len);
+ /* All of this PDU has now been read from the socket. */
+ nvme_tcp_pdu_payload_handle(tqpair, reaped);
+ break;
+ case NVME_TCP_PDU_RECV_STATE_ERROR:
+ rc = NVME_TCP_PDU_FATAL;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ } while (prev_state != tqpair->recv_state);
+
+ return rc;
+}
+
+static void
+nvme_tcp_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
+{
+ uint64_t t02;
+ struct nvme_tcp_req *tcp_req, *tmp;
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ /* Don't check timeouts during controller initialization. */
+ if (ctrlr->state != NVME_CTRLR_STATE_READY) {
+ return;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ active_proc = nvme_ctrlr_get_current_process(ctrlr);
+ } else {
+ active_proc = qpair->active_proc;
+ }
+
+ /* Only check timeouts if the current process has a timeout callback. */
+ if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
+ return;
+ }
+
+ t02 = spdk_get_ticks();
+ TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) {
+ assert(tcp_req->req != NULL);
+
+ if (nvme_request_check_timeout(tcp_req->req, tcp_req->cid, active_proc, t02)) {
+ /*
+ * The requests are in order, so as soon as one has not timed out,
+ * stop iterating.
+ */
+ break;
+ }
+ }
+}
+
+static int
+nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
+{
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+ uint32_t reaped;
+ int rc;
+
+ rc = spdk_sock_flush(tqpair->sock);
+ if (rc < 0) {
+ return rc;
+ }
+
+ if (max_completions == 0) {
+ max_completions = tqpair->num_entries;
+ } else {
+ max_completions = spdk_min(max_completions, tqpair->num_entries);
+ }
+
+ reaped = 0;
+ do {
+ rc = nvme_tcp_read_pdu(tqpair, &reaped);
+ if (rc < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Error polling CQ! (%d): %s\n",
+ errno, spdk_strerror(errno));
+ goto fail;
+ } else if (rc == 0) {
+ /* Partial PDU is read */
+ break;
+ }
+
+ } while (reaped < max_completions);
+
+ if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) {
+ nvme_tcp_qpair_check_timeout(qpair);
+ }
+
+ return reaped;
+fail:
+
+ /*
+ * Since admin queues take the ctrlr_lock before entering this function,
+ * we can call nvme_transport_ctrlr_disconnect_qpair. For other qpairs we need
+ * to call the generic function which will take the lock for us.
+ */
+ qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN;
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvme_transport_ctrlr_disconnect_qpair(qpair->ctrlr, qpair);
+ } else {
+ nvme_ctrlr_disconnect_qpair(qpair);
+ }
+ return -ENXIO;
+}
+
+static void
+nvme_tcp_qpair_sock_cb(void *ctx, struct spdk_sock_group *group, struct spdk_sock *sock)
+{
+ struct spdk_nvme_qpair *qpair = ctx;
+ struct nvme_tcp_poll_group *pgroup = nvme_tcp_poll_group(qpair->poll_group);
+ int32_t num_completions;
+
+ num_completions = spdk_nvme_qpair_process_completions(qpair, pgroup->completions_per_qpair);
+
+ if (pgroup->num_completions >= 0 && num_completions >= 0) {
+ pgroup->num_completions += num_completions;
+ } else {
+ pgroup->num_completions = -ENXIO;
+ }
+}
+
+static int
+nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair)
+{
+ struct spdk_nvme_tcp_ic_req *ic_req;
+ struct nvme_tcp_pdu *pdu;
+ uint64_t icreq_timeout_tsc;
+ int rc;
+
+ pdu = &tqpair->send_pdu;
+ memset(&tqpair->send_pdu, 0, sizeof(tqpair->send_pdu));
+ ic_req = &pdu->hdr.ic_req;
+
+ ic_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_REQ;
+ ic_req->common.hlen = ic_req->common.plen = sizeof(*ic_req);
+ ic_req->pfv = 0;
+ ic_req->maxr2t = NVME_TCP_MAX_R2T_DEFAULT - 1;
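+	/* MAXR2T is a 0's based field in the ICReq PDU, so NVME_TCP_MAX_R2T_DEFAULT (1)
+	 * is encoded as 0, i.e. one outstanding R2T per command is supported.
+	 */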
+ ic_req->hpda = NVME_TCP_HPDA_DEFAULT;
+
+ ic_req->dgst.bits.hdgst_enable = tqpair->qpair.ctrlr->opts.header_digest;
+ ic_req->dgst.bits.ddgst_enable = tqpair->qpair.ctrlr->opts.data_digest;
+
+ nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_send_icreq_complete, tqpair);
+
+ icreq_timeout_tsc = spdk_get_ticks() + (NVME_TCP_TIME_OUT_IN_SECONDS * spdk_get_ticks_hz());
+ do {
+ rc = nvme_tcp_qpair_process_completions(&tqpair->qpair, 0);
+ } while ((tqpair->state == NVME_TCP_QPAIR_STATE_INVALID) &&
+ (rc == 0) && (spdk_get_ticks() <= icreq_timeout_tsc));
+
+ if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) {
+		SPDK_ERRLOG("Failed to construct the tqpair=%p: no valid icresp was received\n", tqpair);
+ return -1;
+ }
+
+	SPDK_DEBUGLOG(SPDK_LOG_NVME, "Successfully constructed the tqpair=%p with a valid icresp\n", tqpair);
+
+ return 0;
+}
+
+static int
+nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ struct sockaddr_storage dst_addr;
+ struct sockaddr_storage src_addr;
+ int rc;
+ struct nvme_tcp_qpair *tqpair;
+ int family;
+ long int port;
+ struct spdk_sock_opts opts;
+
+ tqpair = nvme_tcp_qpair(qpair);
+
+ switch (ctrlr->trid.adrfam) {
+ case SPDK_NVMF_ADRFAM_IPV4:
+ family = AF_INET;
+ break;
+ case SPDK_NVMF_ADRFAM_IPV6:
+ family = AF_INET6;
+ break;
+ default:
+ SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam);
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family);
+
+ memset(&dst_addr, 0, sizeof(dst_addr));
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid);
+ rc = nvme_tcp_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid);
+ if (rc != 0) {
+ SPDK_ERRLOG("dst_addr nvme_tcp_parse_addr() failed\n");
+ return -1;
+ }
+
+ if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) {
+ memset(&src_addr, 0, sizeof(src_addr));
+ rc = nvme_tcp_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid);
+ if (rc != 0) {
+ SPDK_ERRLOG("src_addr nvme_tcp_parse_addr() failed\n");
+ return -1;
+ }
+ }
+
+ port = spdk_strtol(ctrlr->trid.trsvcid, 10);
+ if (port <= 0 || port >= INT_MAX) {
+ SPDK_ERRLOG("Invalid port: %s\n", ctrlr->trid.trsvcid);
+ return -1;
+ }
+
+ opts.opts_size = sizeof(opts);
+ spdk_sock_get_default_opts(&opts);
+ opts.priority = ctrlr->trid.priority;
+ tqpair->sock = spdk_sock_connect_ext(ctrlr->trid.traddr, port, NULL, &opts);
+ if (!tqpair->sock) {
+ SPDK_ERRLOG("sock connection error of tqpair=%p with addr=%s, port=%ld\n",
+ tqpair, ctrlr->trid.traddr, port);
+ return -1;
+ }
+
+ tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT;
+ /* Explicitly set the state and recv_state of tqpair */
+ tqpair->state = NVME_TCP_QPAIR_STATE_INVALID;
+ if (tqpair->recv_state != NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) {
+ nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+ }
+ rc = nvme_tcp_qpair_icreq_send(tqpair);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to connect the tqpair\n");
+ return -1;
+ }
+
+ rc = nvme_fabric_qpair_connect(&tqpair->qpair, tqpair->num_entries);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static struct spdk_nvme_qpair *
+nvme_tcp_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ uint16_t qid, uint32_t qsize,
+ enum spdk_nvme_qprio qprio,
+ uint32_t num_requests)
+{
+ struct nvme_tcp_qpair *tqpair;
+ struct spdk_nvme_qpair *qpair;
+ int rc;
+
+ tqpair = calloc(1, sizeof(struct nvme_tcp_qpair));
+ if (!tqpair) {
+		SPDK_ERRLOG("failed to allocate tqpair\n");
+ return NULL;
+ }
+
+ tqpair->num_entries = qsize;
+ qpair = &tqpair->qpair;
+ rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests);
+ if (rc != 0) {
+ free(tqpair);
+ return NULL;
+ }
+
+ rc = nvme_tcp_alloc_reqs(tqpair);
+ if (rc) {
+ nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair);
+ return NULL;
+ }
+
+ return qpair;
+}
+
+static struct spdk_nvme_qpair *
+nvme_tcp_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ return nvme_tcp_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio,
+ opts->io_queue_requests);
+}
+
+static struct spdk_nvme_ctrlr *nvme_tcp_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ void *devhandle)
+{
+ struct nvme_tcp_ctrlr *tctrlr;
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_vs_register vs;
+ int rc;
+
+ tctrlr = calloc(1, sizeof(*tctrlr));
+ if (tctrlr == NULL) {
+ SPDK_ERRLOG("could not allocate ctrlr\n");
+ return NULL;
+ }
+
+ tctrlr->ctrlr.opts = *opts;
+ tctrlr->ctrlr.trid = *trid;
+
+ rc = nvme_ctrlr_construct(&tctrlr->ctrlr);
+ if (rc != 0) {
+ free(tctrlr);
+ return NULL;
+ }
+
+ tctrlr->ctrlr.adminq = nvme_tcp_ctrlr_create_qpair(&tctrlr->ctrlr, 0,
+ tctrlr->ctrlr.opts.admin_queue_size, 0,
+ tctrlr->ctrlr.opts.admin_queue_size);
+ if (!tctrlr->ctrlr.adminq) {
+ SPDK_ERRLOG("failed to create admin qpair\n");
+ nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr);
+ return NULL;
+ }
+
+ rc = nvme_transport_ctrlr_connect_qpair(&tctrlr->ctrlr, tctrlr->ctrlr.adminq);
+ if (rc < 0) {
+ SPDK_ERRLOG("failed to connect admin qpair\n");
+ nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr);
+ return NULL;
+ }
+
+ if (nvme_ctrlr_get_cap(&tctrlr->ctrlr, &cap)) {
+ SPDK_ERRLOG("get_cap() failed\n");
+ nvme_ctrlr_destruct(&tctrlr->ctrlr);
+ return NULL;
+ }
+
+ if (nvme_ctrlr_get_vs(&tctrlr->ctrlr, &vs)) {
+ SPDK_ERRLOG("get_vs() failed\n");
+ nvme_ctrlr_destruct(&tctrlr->ctrlr);
+ return NULL;
+ }
+
+ if (nvme_ctrlr_add_process(&tctrlr->ctrlr, 0) != 0) {
+ SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n");
+ nvme_ctrlr_destruct(&tctrlr->ctrlr);
+ return NULL;
+ }
+
+ nvme_ctrlr_init_cap(&tctrlr->ctrlr, &cap, &vs);
+
+ return &tctrlr->ctrlr;
+}
+
+static uint32_t
+nvme_tcp_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
+{
+	/* The TCP transport doesn't limit the maximum IO transfer size. */
+ return UINT32_MAX;
+}
+
+static uint16_t
+nvme_tcp_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
+{
+ /*
+ * We do not support >1 SGE in the initiator currently,
+ * so we can only return 1 here. Once that support is
+ * added, this should return ctrlr->cdata.nvmf_specific.msdbd
+ * instead.
+ */
+ return 1;
+}
+
+static int
+nvme_tcp_qpair_iterate_requests(struct spdk_nvme_qpair *qpair,
+ int (*iter_fn)(struct nvme_request *req, void *arg),
+ void *arg)
+{
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+ struct nvme_tcp_req *tcp_req, *tmp;
+ int rc;
+
+ assert(iter_fn != NULL);
+
+ TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) {
+ assert(tcp_req->req != NULL);
+
+ rc = iter_fn(tcp_req->req, arg);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static void
+nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_tcp_req *tcp_req, *tmp;
+ struct spdk_nvme_cpl cpl;
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+
+ cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
+ cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+
+ TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) {
+ assert(tcp_req->req != NULL);
+ if (tcp_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
+ continue;
+ }
+
+ nvme_tcp_req_complete(tcp_req, &cpl);
+ nvme_tcp_req_put(tqpair, tcp_req);
+ }
+}
+
+static struct spdk_nvme_transport_poll_group *
+nvme_tcp_poll_group_create(void)
+{
+ struct nvme_tcp_poll_group *group = calloc(1, sizeof(*group));
+
+ if (group == NULL) {
+ SPDK_ERRLOG("Unable to allocate poll group.\n");
+ return NULL;
+ }
+
+ group->sock_group = spdk_sock_group_create(group);
+ if (group->sock_group == NULL) {
+ free(group);
+ SPDK_ERRLOG("Unable to allocate sock group.\n");
+ return NULL;
+ }
+
+ return &group->group;
+}
+
+static int
+nvme_tcp_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group);
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+
+ if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) {
+ return -EPROTO;
+ }
+ return 0;
+}
+
+static int
+nvme_tcp_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group);
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+
+ if (tqpair->sock && group->sock_group) {
+ if (spdk_sock_group_remove_sock(group->sock_group, tqpair->sock)) {
+ return -EPROTO;
+ }
+ }
+ return 0;
+}
+
+static int
+nvme_tcp_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair);
+ struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup);
+
+ /* disconnected qpairs won't have a sock to add. */
+ if (nvme_qpair_get_state(qpair) >= NVME_QPAIR_CONNECTED) {
+ if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) {
+ return -EPROTO;
+ }
+ }
+
+ return 0;
+}
+
+static int
+nvme_tcp_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair)
+{
+ if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) {
+ return nvme_poll_group_disconnect_qpair(qpair);
+ }
+
+ return 0;
+}
+
+static int64_t
+nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
+ uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
+{
+ struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup);
+ struct spdk_nvme_qpair *qpair, *tmp_qpair;
+
+ group->completions_per_qpair = completions_per_qpair;
+ group->num_completions = 0;
+
+ spdk_sock_group_poll(group->sock_group);
+
+ STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) {
+ disconnected_qpair_cb(qpair, tgroup->group->ctx);
+ }
+
+ return group->num_completions;
+}
+
+static int
+nvme_tcp_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
+{
+ int rc;
+ struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup);
+
+ if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) {
+ return -EBUSY;
+ }
+
+ rc = spdk_sock_group_close(&group->sock_group);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to close the sock group for a tcp poll group.\n");
+ assert(false);
+ }
+
+ free(tgroup);
+
+ return 0;
+}
+
+const struct spdk_nvme_transport_ops tcp_ops = {
+ .name = "TCP",
+ .type = SPDK_NVME_TRANSPORT_TCP,
+ .ctrlr_construct = nvme_tcp_ctrlr_construct,
+ .ctrlr_scan = nvme_fabric_ctrlr_scan,
+ .ctrlr_destruct = nvme_tcp_ctrlr_destruct,
+ .ctrlr_enable = nvme_tcp_ctrlr_enable,
+
+ .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4,
+ .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8,
+ .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4,
+ .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8,
+
+ .ctrlr_get_max_xfer_size = nvme_tcp_ctrlr_get_max_xfer_size,
+ .ctrlr_get_max_sges = nvme_tcp_ctrlr_get_max_sges,
+
+ .ctrlr_create_io_qpair = nvme_tcp_ctrlr_create_io_qpair,
+ .ctrlr_delete_io_qpair = nvme_tcp_ctrlr_delete_io_qpair,
+ .ctrlr_connect_qpair = nvme_tcp_ctrlr_connect_qpair,
+ .ctrlr_disconnect_qpair = nvme_tcp_ctrlr_disconnect_qpair,
+
+ .qpair_abort_reqs = nvme_tcp_qpair_abort_reqs,
+ .qpair_reset = nvme_tcp_qpair_reset,
+ .qpair_submit_request = nvme_tcp_qpair_submit_request,
+ .qpair_process_completions = nvme_tcp_qpair_process_completions,
+ .qpair_iterate_requests = nvme_tcp_qpair_iterate_requests,
+ .admin_qpair_abort_aers = nvme_tcp_admin_qpair_abort_aers,
+
+ .poll_group_create = nvme_tcp_poll_group_create,
+ .poll_group_connect_qpair = nvme_tcp_poll_group_connect_qpair,
+ .poll_group_disconnect_qpair = nvme_tcp_poll_group_disconnect_qpair,
+ .poll_group_add = nvme_tcp_poll_group_add,
+ .poll_group_remove = nvme_tcp_poll_group_remove,
+ .poll_group_process_completions = nvme_tcp_poll_group_process_completions,
+ .poll_group_destroy = nvme_tcp_poll_group_destroy,
+};
+
+SPDK_NVME_TRANSPORT_REGISTER(tcp, &tcp_ops);
diff --git a/src/spdk/lib/nvme/nvme_transport.c b/src/spdk/lib/nvme/nvme_transport.c
new file mode 100644
index 000000000..76efd5966
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_transport.c
@@ -0,0 +1,591 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe transport abstraction
+ */
+
+#include "nvme_internal.h"
+#include "spdk/queue.h"
+
+#define SPDK_MAX_NUM_OF_TRANSPORTS 16
+
+struct spdk_nvme_transport {
+ struct spdk_nvme_transport_ops ops;
+ TAILQ_ENTRY(spdk_nvme_transport) link;
+};
+
+TAILQ_HEAD(nvme_transport_list, spdk_nvme_transport) g_spdk_nvme_transports =
+ TAILQ_HEAD_INITIALIZER(g_spdk_nvme_transports);
+
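+/*
+ * Transports register themselves through spdk_nvme_transport_register()
+ * (see SPDK_NVME_TRANSPORT_REGISTER() in nvme_tcp.c above); registered entries
+ * come from this fixed, statically allocated pool rather than from the heap.
+ */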
+struct spdk_nvme_transport g_spdk_transports[SPDK_MAX_NUM_OF_TRANSPORTS] = {};
+int g_current_transport_index = 0;
+
+const struct spdk_nvme_transport *
+nvme_get_first_transport(void)
+{
+ return TAILQ_FIRST(&g_spdk_nvme_transports);
+}
+
+const struct spdk_nvme_transport *
+nvme_get_next_transport(const struct spdk_nvme_transport *transport)
+{
+ return TAILQ_NEXT(transport, link);
+}
+
+/*
+ * Unfortunately, due to NVMe PCIe multiprocess support, we cannot store the
+ * transport object in either the controller struct or the admin qpair. This means
+ * that many admin-related transport calls have to go through nvme_get_transport
+ * in order to know which functions to call.
+ * In the I/O path, we are able to store the transport struct in the I/O
+ * qpairs to avoid taking that performance hit.
+ */
+const struct spdk_nvme_transport *
+nvme_get_transport(const char *transport_name)
+{
+ struct spdk_nvme_transport *registered_transport;
+
+ TAILQ_FOREACH(registered_transport, &g_spdk_nvme_transports, link) {
+ if (strcasecmp(transport_name, registered_transport->ops.name) == 0) {
+ return registered_transport;
+ }
+ }
+
+ return NULL;
+}
+
+bool
+spdk_nvme_transport_available(enum spdk_nvme_transport_type trtype)
+{
+ return nvme_get_transport(spdk_nvme_transport_id_trtype_str(trtype)) != NULL;
+}
+
+bool
+spdk_nvme_transport_available_by_name(const char *transport_name)
+{
+ return nvme_get_transport(transport_name) != NULL;
+}
+
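+/*
+ * Called by each transport (e.g. via SPDK_NVME_TRANSPORT_REGISTER(tcp, &tcp_ops)
+ * above) to add its ops table to the global transport list. The ops structure is
+ * copied by value, so the caller's copy does not need to remain valid afterwards.
+ */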
+void
+spdk_nvme_transport_register(const struct spdk_nvme_transport_ops *ops)
+{
+ struct spdk_nvme_transport *new_transport;
+
+ if (nvme_get_transport(ops->name)) {
+ SPDK_ERRLOG("Double registering NVMe transport %s is prohibited.\n", ops->name);
+ assert(false);
+ return;
+ }
+
+ if (g_current_transport_index == SPDK_MAX_NUM_OF_TRANSPORTS) {
+ SPDK_ERRLOG("Unable to register new NVMe transport.\n");
+ assert(false);
+ return;
+ }
+ new_transport = &g_spdk_transports[g_current_transport_index++];
+
+ new_transport->ops = *ops;
+ TAILQ_INSERT_TAIL(&g_spdk_nvme_transports, new_transport, link);
+}
+
+struct spdk_nvme_ctrlr *
+nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ void *devhandle)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(trid->trstring);
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ if (transport == NULL) {
+ SPDK_ERRLOG("Transport %s doesn't exist.", trid->trstring);
+ return NULL;
+ }
+
+ ctrlr = transport->ops.ctrlr_construct(trid, opts, devhandle);
+
+ return ctrlr;
+}
+
+int
+nvme_transport_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
+ bool direct_connect)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(probe_ctx->trid.trstring);
+
+ if (transport == NULL) {
+ SPDK_ERRLOG("Transport %s doesn't exist.", probe_ctx->trid.trstring);
+ return -ENOENT;
+ }
+
+ return transport->ops.ctrlr_scan(probe_ctx, direct_connect);
+}
+
+int
+nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ return transport->ops.ctrlr_destruct(ctrlr);
+}
+
+int
+nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ return transport->ops.ctrlr_enable(ctrlr);
+}
+
+int
+nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ return transport->ops.ctrlr_set_reg_4(ctrlr, offset, value);
+}
+
+int
+nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ return transport->ops.ctrlr_set_reg_8(ctrlr, offset, value);
+}
+
+int
+nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ return transport->ops.ctrlr_get_reg_4(ctrlr, offset, value);
+}
+
+int
+nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ return transport->ops.ctrlr_get_reg_8(ctrlr, offset, value);
+}
+
+uint32_t
+nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ return transport->ops.ctrlr_get_max_xfer_size(ctrlr);
+}
+
+uint16_t
+nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ return transport->ops.ctrlr_get_max_sges(ctrlr);
+}
+
+int
+nvme_transport_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ if (transport->ops.ctrlr_reserve_cmb != NULL) {
+ return transport->ops.ctrlr_reserve_cmb(ctrlr);
+ }
+
+ return -ENOTSUP;
+}
+
+void *
+nvme_transport_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ if (transport->ops.ctrlr_map_cmb != NULL) {
+ return transport->ops.ctrlr_map_cmb(ctrlr, size);
+ }
+
+ return NULL;
+}
+
+int
+nvme_transport_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ if (transport->ops.ctrlr_unmap_cmb != NULL) {
+ return transport->ops.ctrlr_unmap_cmb(ctrlr);
+ }
+
+ return 0;
+}
+
+struct spdk_nvme_qpair *
+nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ struct spdk_nvme_qpair *qpair;
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ qpair = transport->ops.ctrlr_create_io_qpair(ctrlr, qid, opts);
+ if (qpair != NULL && !nvme_qpair_is_admin_queue(qpair)) {
+ qpair->transport = transport;
+ }
+
+ return qpair;
+}
+
+int
+nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+
+ /* Do not rely on qpair->transport. For multi-process cases, a foreign process may delete
+ * the I/O qpair, in which case the cached transport object would be invalid (each process
+ * has its own transport objects since they contain function pointers). So we look up the
+ * transport object here in the delete_io_qpair case.
+ */
+ return transport->ops.ctrlr_delete_io_qpair(ctrlr, qpair);
+}
+
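+/*
+ * Drives the qpair from CONNECTING to CONNECTED. On any failure the qpair is
+ * disconnected again, its original transport_failure_reason is restored and its
+ * state is set back to DISCONNECTED.
+ */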
+int
+nvme_transport_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+ uint8_t transport_failure_reason;
+ int rc;
+
+ assert(transport != NULL);
+ if (!nvme_qpair_is_admin_queue(qpair)) {
+ qpair->transport = transport;
+ }
+
+ transport_failure_reason = qpair->transport_failure_reason;
+ qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_NONE;
+
+ nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTING);
+ rc = transport->ops.ctrlr_connect_qpair(ctrlr, qpair);
+ if (rc != 0) {
+ goto err;
+ }
+
+ nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED);
+ if (qpair->poll_group) {
+ rc = nvme_poll_group_connect_qpair(qpair);
+ if (rc) {
+ goto err;
+ }
+ }
+
+ return rc;
+
+err:
+ /* If the qpair was unable to reconnect, restore the original failure reason. */
+ qpair->transport_failure_reason = transport_failure_reason;
+ nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair);
+ nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
+ return rc;
+}
+
+void
+nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring);
+
+ if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING ||
+ nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTED) {
+ return;
+ }
+
+ nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTING);
+ assert(transport != NULL);
+ if (qpair->poll_group) {
+ nvme_poll_group_disconnect_qpair(qpair);
+ }
+
+ transport->ops.ctrlr_disconnect_qpair(ctrlr, qpair);
+
+ nvme_qpair_abort_reqs(qpair, 0);
+ nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED);
+}
+
+void
+nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
+{
+ const struct spdk_nvme_transport *transport;
+
+ assert(dnr <= 1);
+ if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) {
+ qpair->transport->ops.qpair_abort_reqs(qpair, dnr);
+ } else {
+ transport = nvme_get_transport(qpair->ctrlr->trid.trstring);
+ assert(transport != NULL);
+ transport->ops.qpair_abort_reqs(qpair, dnr);
+ }
+}
+
+int
+nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair)
+{
+ const struct spdk_nvme_transport *transport;
+
+ if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) {
+ return qpair->transport->ops.qpair_reset(qpair);
+ }
+
+ transport = nvme_get_transport(qpair->ctrlr->trid.trstring);
+ assert(transport != NULL);
+ return transport->ops.qpair_reset(qpair);
+}
+
+int
+nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ const struct spdk_nvme_transport *transport;
+
+ if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) {
+ return qpair->transport->ops.qpair_submit_request(qpair, req);
+ }
+
+ transport = nvme_get_transport(qpair->ctrlr->trid.trstring);
+ assert(transport != NULL);
+ return transport->ops.qpair_submit_request(qpair, req);
+}
+
+int32_t
+nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
+{
+ const struct spdk_nvme_transport *transport;
+
+ if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) {
+ return qpair->transport->ops.qpair_process_completions(qpair, max_completions);
+ }
+
+ transport = nvme_get_transport(qpair->ctrlr->trid.trstring);
+ assert(transport != NULL);
+ return transport->ops.qpair_process_completions(qpair, max_completions);
+}
+
+int
+nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair,
+ int (*iter_fn)(struct nvme_request *req, void *arg),
+ void *arg)
+{
+ const struct spdk_nvme_transport *transport;
+
+ if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) {
+ return qpair->transport->ops.qpair_iterate_requests(qpair, iter_fn, arg);
+ }
+
+ transport = nvme_get_transport(qpair->ctrlr->trid.trstring);
+ assert(transport != NULL);
+ return transport->ops.qpair_iterate_requests(qpair, iter_fn, arg);
+}
+
+void
+nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
+{
+ const struct spdk_nvme_transport *transport = nvme_get_transport(qpair->ctrlr->trid.trstring);
+
+ assert(transport != NULL);
+ transport->ops.admin_qpair_abort_aers(qpair);
+}
+
+struct spdk_nvme_transport_poll_group *
+nvme_transport_poll_group_create(const struct spdk_nvme_transport *transport)
+{
+ struct spdk_nvme_transport_poll_group *group = NULL;
+
+ group = transport->ops.poll_group_create();
+ if (group) {
+ group->transport = transport;
+ STAILQ_INIT(&group->connected_qpairs);
+ STAILQ_INIT(&group->disconnected_qpairs);
+ }
+
+ return group;
+}
+
+int
+nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair)
+{
+ int rc;
+
+ rc = tgroup->transport->ops.poll_group_add(tgroup, qpair);
+ if (rc == 0) {
+ qpair->poll_group = tgroup;
+ assert(nvme_qpair_get_state(qpair) < NVME_QPAIR_CONNECTED);
+ qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs;
+ STAILQ_INSERT_TAIL(&tgroup->disconnected_qpairs, qpair, poll_group_stailq);
+ }
+
+ return rc;
+}
+
+int
+nvme_transport_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup,
+ struct spdk_nvme_qpair *qpair)
+{
+ int rc;
+
+ rc = tgroup->transport->ops.poll_group_remove(tgroup, qpair);
+ if (rc == 0) {
+ if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) {
+ STAILQ_REMOVE(&tgroup->connected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq);
+ } else if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) {
+ STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq);
+ } else {
+ return -ENOENT;
+ }
+
+ qpair->poll_group = NULL;
+ qpair->poll_group_tailq_head = NULL;
+ }
+
+ return rc;
+}
+
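+/*
+ * Polls the transport-specific poll group. While the transport callback runs,
+ * in_completion_context is set so that qpair deletions requested from within a
+ * completion callback are deferred; any qpairs flagged delete_after_completion_context
+ * are freed here once the transport poll returns.
+ */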
+int64_t
+nvme_transport_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup,
+ uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb)
+{
+ struct spdk_nvme_qpair *qpair;
+ int64_t rc;
+
+ tgroup->in_completion_context = true;
+ rc = tgroup->transport->ops.poll_group_process_completions(tgroup, completions_per_qpair,
+ disconnected_qpair_cb);
+ tgroup->in_completion_context = false;
+
+ if (spdk_unlikely(tgroup->num_qpairs_to_delete > 0)) {
+ /* Deleted qpairs are most likely to be on the disconnected qpairs list, so check it first. */
+ STAILQ_FOREACH(qpair, &tgroup->disconnected_qpairs, poll_group_stailq) {
+ if (spdk_unlikely(qpair->delete_after_completion_context)) {
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ if (--tgroup->num_qpairs_to_delete == 0) {
+ return rc;
+ }
+ }
+ }
+
+ STAILQ_FOREACH(qpair, &tgroup->connected_qpairs, poll_group_stailq) {
+ if (spdk_unlikely(qpair->delete_after_completion_context)) {
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ if (--tgroup->num_qpairs_to_delete == 0) {
+ return rc;
+ }
+ }
+ }
+ /* We did not find every qpair marked for deletion; reset the counter defensively. */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Mismatch between qpairs to delete and poll group number.\n");
+ tgroup->num_qpairs_to_delete = 0;
+ }
+
+ return rc;
+}
+
+int
+nvme_transport_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup)
+{
+ return tgroup->transport->ops.poll_group_destroy(tgroup);
+}
+
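+/*
+ * A qpair in a poll group lives on exactly one of two lists: connected_qpairs or
+ * disconnected_qpairs. The two helpers below ask the transport to disconnect or
+ * connect the qpair and then move it between those lists accordingly.
+ */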
+int
+nvme_transport_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_transport_poll_group *tgroup;
+ int rc;
+
+ tgroup = qpair->poll_group;
+
+ if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) {
+ return 0;
+ }
+
+ if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) {
+ rc = tgroup->transport->ops.poll_group_disconnect_qpair(qpair);
+ if (rc == 0) {
+ qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs;
+ STAILQ_REMOVE(&tgroup->connected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq);
+ STAILQ_INSERT_TAIL(&tgroup->disconnected_qpairs, qpair, poll_group_stailq);
+ /* EINPROGRESS indicates that a call has already been made to this function.
+ * It just keeps us from segfaulting on a double removal/insert.
+ */
+ } else if (rc == -EINPROGRESS) {
+ rc = 0;
+ }
+ return rc;
+ }
+
+ return -EINVAL;
+}
+
+int
+nvme_transport_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_transport_poll_group *tgroup;
+ int rc;
+
+ tgroup = qpair->poll_group;
+
+ if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) {
+ return 0;
+ }
+
+ if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) {
+ rc = tgroup->transport->ops.poll_group_connect_qpair(qpair);
+ if (rc == 0) {
+ qpair->poll_group_tailq_head = &tgroup->connected_qpairs;
+ STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq);
+ STAILQ_INSERT_TAIL(&tgroup->connected_qpairs, qpair, poll_group_stailq);
+ }
+
+ return rc == -EINPROGRESS ? 0 : rc;
+ }
+
+ return -EINVAL;
+}
diff --git a/src/spdk/lib/nvme/nvme_uevent.c b/src/spdk/lib/nvme/nvme_uevent.c
new file mode 100644
index 000000000..1bcfff1cb
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_uevent.c
@@ -0,0 +1,213 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+
+#include "spdk/log.h"
+
+#include "nvme_uevent.h"
+
+#ifdef __linux__
+
+#include <linux/netlink.h>
+
+#define SPDK_UEVENT_MSG_LEN 4096
+
+int
+nvme_uevent_connect(void)
+{
+ struct sockaddr_nl addr;
+ int netlink_fd;
+ int size = 64 * 1024;
+ int flag;
+
+ memset(&addr, 0, sizeof(addr));
+ addr.nl_family = AF_NETLINK;
+ addr.nl_pid = getpid();
+ addr.nl_groups = 0xffffffff;
+
+ netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
+ if (netlink_fd < 0) {
+ return -1;
+ }
+
+ setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size, sizeof(size));
+
+ flag = fcntl(netlink_fd, F_GETFL);
+ if (fcntl(netlink_fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", netlink_fd,
+ spdk_strerror(errno));
+ close(netlink_fd);
+ return -1;
+ }
+
+ if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
+ close(netlink_fd);
+ return -1;
+ }
+ return netlink_fd;
+}
+
+/* Note: only events from the uio subsystem and from devices bound to the
+ * vfio-pci driver are parsed; events from any other subsystem are ignored.
+ * A uio event looks like:
+ * action: "add" or "remove"
+ * subsystem: "uio"
+ * dev_path: "/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0"
+ */
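+/*
+ * The buffer handed to parse_event() is a sequence of NUL-terminated strings;
+ * the parser only looks at the ACTION=, DEVPATH=, SUBSYSTEM=, DRIVER= and
+ * PCI_SLOT_NAME= entries and skips over everything else.
+ */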
+static int
+parse_event(const char *buf, struct spdk_uevent *event)
+{
+ char action[SPDK_UEVENT_MSG_LEN];
+ char subsystem[SPDK_UEVENT_MSG_LEN];
+ char dev_path[SPDK_UEVENT_MSG_LEN];
+ char driver[SPDK_UEVENT_MSG_LEN];
+ char vfio_pci_addr[SPDK_UEVENT_MSG_LEN];
+
+ memset(action, 0, SPDK_UEVENT_MSG_LEN);
+ memset(subsystem, 0, SPDK_UEVENT_MSG_LEN);
+ memset(dev_path, 0, SPDK_UEVENT_MSG_LEN);
+ memset(driver, 0, SPDK_UEVENT_MSG_LEN);
+ memset(vfio_pci_addr, 0, SPDK_UEVENT_MSG_LEN);
+
+ while (*buf) {
+ if (!strncmp(buf, "ACTION=", 7)) {
+ buf += 7;
+ snprintf(action, sizeof(action), "%s", buf);
+ } else if (!strncmp(buf, "DEVPATH=", 8)) {
+ buf += 8;
+ snprintf(dev_path, sizeof(dev_path), "%s", buf);
+ } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+ buf += 10;
+ snprintf(subsystem, sizeof(subsystem), "%s", buf);
+ } else if (!strncmp(buf, "DRIVER=", 7)) {
+ buf += 7;
+ snprintf(driver, sizeof(driver), "%s", buf);
+ } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
+ buf += 14;
+ snprintf(vfio_pci_addr, sizeof(vfio_pci_addr), "%s", buf);
+ }
+ while (*buf++)
+ ;
+ }
+
+ if (!strncmp(subsystem, "uio", 3)) {
+ char *pci_address, *tmp;
+ struct spdk_pci_addr pci_addr;
+
+ event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UIO;
+ if (!strncmp(action, "add", 3)) {
+ event->action = SPDK_NVME_UEVENT_ADD;
+ }
+ if (!strncmp(action, "remove", 6)) {
+ event->action = SPDK_NVME_UEVENT_REMOVE;
+ }
+ tmp = strstr(dev_path, "/uio/");
+
+ memset(tmp, 0, SPDK_UEVENT_MSG_LEN - (tmp - dev_path));
+
+ pci_address = strrchr(dev_path, '/');
+ pci_address++;
+ if (spdk_pci_addr_parse(&pci_addr, pci_address) != 0) {
+ SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", pci_address);
+ return -1;
+ }
+ spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr);
+ return 1;
+ }
+ if (!strncmp(driver, "vfio-pci", 8)) {
+ struct spdk_pci_addr pci_addr;
+
+ event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_VFIO;
+ if (!strncmp(action, "bind", 4)) {
+ event->action = SPDK_NVME_UEVENT_ADD;
+ }
+ if (!strncmp(action, "remove", 6)) {
+ event->action = SPDK_NVME_UEVENT_REMOVE;
+ }
+ if (spdk_pci_addr_parse(&pci_addr, vfio_pci_addr) != 0) {
+ SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", vfio_pci_addr);
+ return -1;
+ }
+ spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr);
+ return 1;
+
+ }
+ return -1;
+}
+
+int
+nvme_get_uevent(int fd, struct spdk_uevent *uevent)
+{
+ int ret;
+ char buf[SPDK_UEVENT_MSG_LEN];
+
+ memset(uevent, 0, sizeof(struct spdk_uevent));
+ memset(buf, 0, SPDK_UEVENT_MSG_LEN);
+
+ ret = recv(fd, buf, SPDK_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
+ if (ret > 0) {
+ return parse_event(buf, uevent);
+ }
+
+ if (ret < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return 0;
+ } else {
+ SPDK_ERRLOG("Socket read error(%d): %s\n", errno, spdk_strerror(errno));
+ return -1;
+ }
+ }
+
+ /* connection closed */
+ if (ret == 0) {
+ return -1;
+ }
+ return 0;
+}
+
+#else /* Not Linux */
+
+int
+nvme_uevent_connect(void)
+{
+ return -1;
+}
+
+int
+nvme_get_uevent(int fd, struct spdk_uevent *uevent)
+{
+ return -1;
+}
+#endif
diff --git a/src/spdk/lib/nvme/nvme_uevent.h b/src/spdk/lib/nvme/nvme_uevent.h
new file mode 100644
index 000000000..778d73c2a
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_uevent.h
@@ -0,0 +1,61 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * SPDK uevent
+ */
+
+#include "spdk/env.h"
+#include "spdk/nvmf_spec.h"
+
+#ifndef SPDK_UEVENT_H_
+#define SPDK_UEVENT_H_
+
+#define SPDK_NVME_UEVENT_SUBSYSTEM_UIO 1
+#define SPDK_NVME_UEVENT_SUBSYSTEM_VFIO 2
+
+enum spdk_nvme_uevent_action {
+ SPDK_NVME_UEVENT_ADD = 0,
+ SPDK_NVME_UEVENT_REMOVE = 1,
+};
+
+struct spdk_uevent {
+ enum spdk_nvme_uevent_action action;
+ int subsystem;
+ char traddr[SPDK_NVMF_TRADDR_MAX_LEN + 1];
+};
+
+int nvme_uevent_connect(void);
+int nvme_get_uevent(int fd, struct spdk_uevent *uevent);
+
+#endif /* SPDK_UEVENT_H_ */
diff --git a/src/spdk/lib/nvme/spdk_nvme.map b/src/spdk/lib/nvme/spdk_nvme.map
new file mode 100644
index 000000000..63a04eeca
--- /dev/null
+++ b/src/spdk/lib/nvme/spdk_nvme.map
@@ -0,0 +1,185 @@
+{
+ global:
+
+ # public functions from nvme.h
+ spdk_nvme_transport_register;
+ spdk_nvme_transport_available;
+ spdk_nvme_transport_available_by_name;
+ spdk_nvme_transport_id_parse;
+ spdk_nvme_transport_id_populate_trstring;
+ spdk_nvme_transport_id_parse_trtype;
+ spdk_nvme_transport_id_trtype_str;
+ spdk_nvme_transport_id_adrfam_str;
+ spdk_nvme_transport_id_parse_adrfam;
+ spdk_nvme_transport_id_compare;
+ spdk_nvme_trid_populate_transport;
+ spdk_nvme_host_id_parse;
+
+ spdk_nvme_prchk_flags_parse;
+ spdk_nvme_prchk_flags_str;
+
+ spdk_nvme_probe;
+ spdk_nvme_connect;
+ spdk_nvme_connect_async;
+ spdk_nvme_probe_async;
+ spdk_nvme_probe_poll_async;
+ spdk_nvme_detach;
+
+ spdk_nvme_ctrlr_is_discovery;
+ spdk_nvme_ctrlr_get_default_ctrlr_opts;
+ spdk_nvme_ctrlr_set_trid;
+ spdk_nvme_ctrlr_reset;
+ spdk_nvme_ctrlr_fail;
+ spdk_nvme_ctrlr_is_failed;
+ spdk_nvme_ctrlr_get_data;
+ spdk_nvme_ctrlr_get_regs_csts;
+ spdk_nvme_ctrlr_get_regs_cap;
+ spdk_nvme_ctrlr_get_regs_vs;
+ spdk_nvme_ctrlr_get_regs_cmbsz;
+ spdk_nvme_ctrlr_get_num_ns;
+ spdk_nvme_ctrlr_get_pci_device;
+ spdk_nvme_ctrlr_get_max_xfer_size;
+ spdk_nvme_ctrlr_is_active_ns;
+ spdk_nvme_ctrlr_get_first_active_ns;
+ spdk_nvme_ctrlr_get_next_active_ns;
+ spdk_nvme_ctrlr_is_log_page_supported;
+ spdk_nvme_ctrlr_is_feature_supported;
+ spdk_nvme_ctrlr_register_aer_callback;
+ spdk_nvme_ctrlr_register_timeout_callback;
+ spdk_nvme_ctrlr_get_default_io_qpair_opts;
+ spdk_nvme_ctrlr_alloc_io_qpair;
+ spdk_nvme_ctrlr_connect_io_qpair;
+ spdk_nvme_ctrlr_disconnect_io_qpair;
+ spdk_nvme_ctrlr_reconnect_io_qpair;
+ spdk_nvme_ctrlr_get_admin_qp_failure_reason;
+ spdk_nvme_ctrlr_free_io_qpair;
+ spdk_nvme_ctrlr_io_cmd_raw_no_payload_build;
+ spdk_nvme_ctrlr_cmd_io_raw;
+ spdk_nvme_ctrlr_cmd_io_raw_with_md;
+ spdk_nvme_ctrlr_cmd_admin_raw;
+ spdk_nvme_ctrlr_process_admin_completions;
+ spdk_nvme_ctrlr_get_ns;
+ spdk_nvme_ctrlr_cmd_get_log_page;
+ spdk_nvme_ctrlr_cmd_get_log_page_ext;
+ spdk_nvme_ctrlr_cmd_abort;
+ spdk_nvme_ctrlr_cmd_abort_ext;
+ spdk_nvme_ctrlr_cmd_set_feature;
+ spdk_nvme_ctrlr_cmd_get_feature;
+ spdk_nvme_ctrlr_cmd_get_feature_ns;
+ spdk_nvme_ctrlr_cmd_set_feature_ns;
+ spdk_nvme_ctrlr_cmd_security_receive;
+ spdk_nvme_ctrlr_cmd_security_send;
+ spdk_nvme_ctrlr_security_receive;
+ spdk_nvme_ctrlr_security_send;
+ spdk_nvme_ctrlr_get_flags;
+ spdk_nvme_ctrlr_attach_ns;
+ spdk_nvme_ctrlr_detach_ns;
+ spdk_nvme_ctrlr_create_ns;
+ spdk_nvme_ctrlr_delete_ns;
+ spdk_nvme_ctrlr_format;
+ spdk_nvme_ctrlr_update_firmware;
+ spdk_nvme_ctrlr_get_registers;
+ spdk_nvme_ctrlr_reserve_cmb;
+ spdk_nvme_ctrlr_map_cmb;
+ spdk_nvme_ctrlr_unmap_cmb;
+ spdk_nvme_ctrlr_get_transport_id;
+
+ spdk_nvme_poll_group_create;
+ spdk_nvme_poll_group_add;
+ spdk_nvme_poll_group_remove;
+ spdk_nvme_poll_group_destroy;
+ spdk_nvme_poll_group_process_completions;
+ spdk_nvme_poll_group_get_ctx;
+
+ spdk_nvme_ns_get_data;
+ spdk_nvme_ns_get_id;
+ spdk_nvme_ns_get_ctrlr;
+ spdk_nvme_ns_is_active;
+ spdk_nvme_ns_get_max_io_xfer_size;
+ spdk_nvme_ns_get_sector_size;
+ spdk_nvme_ns_get_extended_sector_size;
+ spdk_nvme_ns_get_num_sectors;
+ spdk_nvme_ns_get_size;
+ spdk_nvme_ns_get_pi_type;
+ spdk_nvme_ns_get_md_size;
+ spdk_nvme_ns_supports_extended_lba;
+ spdk_nvme_ns_supports_compare;
+ spdk_nvme_ns_get_dealloc_logical_block_read_value;
+ spdk_nvme_ns_get_optimal_io_boundary;
+ spdk_nvme_ns_get_uuid;
+ spdk_nvme_ns_get_flags;
+
+ spdk_nvme_ns_cmd_write;
+ spdk_nvme_ns_cmd_writev;
+ spdk_nvme_ns_cmd_writev_with_md;
+ spdk_nvme_ns_cmd_write_with_md;
+ spdk_nvme_ns_cmd_write_zeroes;
+ spdk_nvme_ns_cmd_write_uncorrectable;
+ spdk_nvme_ns_cmd_read;
+ spdk_nvme_ns_cmd_readv;
+ spdk_nvme_ns_cmd_readv_with_md;
+ spdk_nvme_ns_cmd_read_with_md;
+ spdk_nvme_ns_cmd_dataset_management;
+ spdk_nvme_ns_cmd_flush;
+ spdk_nvme_ns_cmd_reservation_register;
+ spdk_nvme_ns_cmd_reservation_release;
+ spdk_nvme_ns_cmd_reservation_acquire;
+ spdk_nvme_ns_cmd_reservation_report;
+ spdk_nvme_ns_cmd_compare;
+ spdk_nvme_ns_cmd_comparev;
+ spdk_nvme_ns_cmd_comparev_with_md;
+ spdk_nvme_ns_cmd_compare_with_md;
+
+ spdk_nvme_qpair_process_completions;
+ spdk_nvme_qpair_get_failure_reason;
+ spdk_nvme_qpair_add_cmd_error_injection;
+ spdk_nvme_qpair_remove_cmd_error_injection;
+ spdk_nvme_qpair_print_command;
+ spdk_nvme_qpair_print_completion;
+ spdk_nvme_print_command;
+ spdk_nvme_print_completion;
+
+ spdk_nvme_cpl_get_status_string;
+
+ spdk_nvme_rdma_init_hooks;
+
+ spdk_nvme_cuse_get_ctrlr_name;
+ spdk_nvme_cuse_get_ns_name;
+ spdk_nvme_cuse_register;
+ spdk_nvme_cuse_unregister;
+ spdk_nvme_cuse_update_namespaces;
+
+ spdk_nvme_map_prps;
+
+ # public functions from nvme_ocssd.h
+ spdk_nvme_ctrlr_is_ocssd_supported;
+ spdk_nvme_ocssd_ctrlr_cmd_geometry;
+ spdk_nvme_ocssd_ns_cmd_vector_reset;
+ spdk_nvme_ocssd_ns_cmd_vector_write;
+ spdk_nvme_ocssd_ns_cmd_vector_write_with_md;
+ spdk_nvme_ocssd_ns_cmd_vector_read;
+ spdk_nvme_ocssd_ns_cmd_vector_read_with_md;
+ spdk_nvme_ocssd_ns_cmd_vector_copy;
+
+ # public functions from opal.h
+ spdk_opal_dev_construct;
+ spdk_opal_dev_destruct;
+ spdk_opal_get_d0_features_info;
+ spdk_opal_supported;
+ spdk_opal_cmd_take_ownership;
+ spdk_opal_cmd_revert_tper;
+ spdk_opal_cmd_activate_locking_sp;
+ spdk_opal_cmd_lock_unlock;
+ spdk_opal_cmd_setup_locking_range;
+ spdk_opal_cmd_get_max_ranges;
+ spdk_opal_cmd_get_locking_range_info;
+ spdk_opal_cmd_enable_user;
+ spdk_opal_cmd_add_user_to_locking_range;
+ spdk_opal_cmd_set_new_passwd;
+ spdk_opal_cmd_erase_locking_range;
+ spdk_opal_cmd_secure_erase_locking_range;
+ spdk_opal_get_locking_range_info;
+ spdk_opal_free_locking_range_info;
+
+ local: *;
+};
diff --git a/src/spdk/lib/nvmf/Makefile b/src/spdk/lib/nvmf/Makefile
new file mode 100644
index 000000000..b4556564a
--- /dev/null
+++ b/src/spdk/lib/nvmf/Makefile
@@ -0,0 +1,75 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 5
+SO_MINOR := 0
+
+C_SRCS = ctrlr.c ctrlr_discovery.c ctrlr_bdev.c \
+ subsystem.c nvmf.c nvmf_rpc.c transport.c tcp.c
+
+C_SRCS-$(CONFIG_RDMA) += rdma.c
+LIBNAME = nvmf
+LOCAL_SYS_LIBS = -luuid
+ifeq ($(CONFIG_RDMA),y)
+LOCAL_SYS_LIBS += -libverbs -lrdmacm
+# Attach the provider libraries only on FreeBSD when RDMA is enabled with configure
+ifeq ($(OS),FreeBSD)
+# Mellanox - MLX4 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx4.*)","")
+LOCAL_SYS_LIBS += -lmlx4
+endif
+# Mellanox - MLX5 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx5.*)","")
+LOCAL_SYS_LIBS += -lmlx5
+endif
+# Chelsio HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libcxgb4.*)","")
+LOCAL_SYS_LIBS += -lcxgb4
+endif
+endif
+endif
+
+ifeq ($(CONFIG_FC),y)
+C_SRCS += fc.c fc_ls.c
+CFLAGS += -I$(CURDIR)
+ifneq ($(strip $(CONFIG_FC_PATH)),)
+CFLAGS += -I$(CONFIG_FC_PATH)
+endif
+endif
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nvmf.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/nvmf/ctrlr.c b/src/spdk/lib/nvmf/ctrlr.c
new file mode 100644
index 000000000..638cde9d2
--- /dev/null
+++ b/src/spdk/lib/nvmf/ctrlr.c
@@ -0,0 +1,3224 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "nvmf_internal.h"
+#include "transport.h"
+
+#include "spdk/bit_array.h"
+#include "spdk/endian.h"
+#include "spdk/thread.h"
+#include "spdk/trace.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/nvmf_cmd.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/version.h"
+
+#include "spdk_internal/log.h"
+
+#define MIN_KEEP_ALIVE_TIMEOUT_IN_MS 10000
+#define NVMF_DISC_KATO_IN_MS 120000
+#define KAS_TIME_UNIT_IN_MS 100
+#define KAS_DEFAULT_VALUE (MIN_KEEP_ALIVE_TIMEOUT_IN_MS / KAS_TIME_UNIT_IN_MS)
+
+/*
+ * Report the SPDK version as the firmware revision.
+ * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts.
+ */
+#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING
+
+/*
+ * Support for custom admin command handlers
+ */
+struct spdk_nvmf_custom_admin_cmd {
+ spdk_nvmf_custom_cmd_hdlr hdlr;
+ uint32_t nsid; /* nsid to forward */
+};
+
+static struct spdk_nvmf_custom_admin_cmd g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_MAX_OPC + 1];
+
+static void _nvmf_request_complete(void *ctx);
+
+static inline void
+nvmf_invalid_connect_response(struct spdk_nvmf_fabric_connect_rsp *rsp,
+ uint8_t iattr, uint16_t ipo)
+{
+ rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM;
+ rsp->status_code_specific.invalid.iattr = iattr;
+ rsp->status_code_specific.invalid.ipo = ipo;
+}
+
+#define SPDK_NVMF_INVALID_CONNECT_CMD(rsp, field) \
+ nvmf_invalid_connect_response(rsp, 0, offsetof(struct spdk_nvmf_fabric_connect_cmd, field))
+#define SPDK_NVMF_INVALID_CONNECT_DATA(rsp, field) \
+ nvmf_invalid_connect_response(rsp, 1, offsetof(struct spdk_nvmf_fabric_connect_data, field))
+
+static void
+nvmf_ctrlr_stop_keep_alive_timer(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ if (!ctrlr) {
+ SPDK_ERRLOG("Controller is NULL\n");
+ return;
+ }
+
+ if (ctrlr->keep_alive_poller == NULL) {
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Stop keep alive poller\n");
+ spdk_poller_unregister(&ctrlr->keep_alive_poller);
+}
+
+static void
+nvmf_ctrlr_disconnect_qpairs_done(struct spdk_io_channel_iter *i, int status)
+{
+ if (status == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr disconnect qpairs complete successfully\n");
+ } else {
+ SPDK_ERRLOG("Fail to disconnect ctrlr qpairs\n");
+ }
+}
+
+static int
+_nvmf_ctrlr_disconnect_qpairs_on_pg(struct spdk_io_channel_iter *i, bool include_admin)
+{
+ int rc = 0;
+ struct spdk_nvmf_ctrlr *ctrlr;
+ struct spdk_nvmf_qpair *qpair, *temp_qpair;
+ struct spdk_io_channel *ch;
+ struct spdk_nvmf_poll_group *group;
+
+ ctrlr = spdk_io_channel_iter_get_ctx(i);
+ ch = spdk_io_channel_iter_get_channel(i);
+ group = spdk_io_channel_get_ctx(ch);
+
+ TAILQ_FOREACH_SAFE(qpair, &group->qpairs, link, temp_qpair) {
+ if (qpair->ctrlr == ctrlr && (include_admin || !nvmf_qpair_is_admin_queue(qpair))) {
+ rc = spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
+ if (rc) {
+ SPDK_ERRLOG("Qpair disconnect failed\n");
+ return rc;
+ }
+ }
+ }
+
+ return rc;
+}
+
+static void
+nvmf_ctrlr_disconnect_qpairs_on_pg(struct spdk_io_channel_iter *i)
+{
+ spdk_for_each_channel_continue(i, _nvmf_ctrlr_disconnect_qpairs_on_pg(i, true));
+}
+
+static void
+nvmf_ctrlr_disconnect_io_qpairs_on_pg(struct spdk_io_channel_iter *i)
+{
+ spdk_for_each_channel_continue(i, _nvmf_ctrlr_disconnect_qpairs_on_pg(i, false));
+}
+
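+/*
+ * Runs once per KATO period. If the host has not sent a keep alive within the
+ * negotiated timeout, the Controller Fatal Status bit is set and every qpair
+ * belonging to this controller is disconnected.
+ */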
+static int
+nvmf_ctrlr_keep_alive_poll(void *ctx)
+{
+ uint64_t keep_alive_timeout_tick;
+ uint64_t now = spdk_get_ticks();
+ struct spdk_nvmf_ctrlr *ctrlr = ctx;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Polling ctrlr keep alive timeout\n");
+
+ /* If the Keep alive feature is in use and the timer expires */
+ keep_alive_timeout_tick = ctrlr->last_keep_alive_tick +
+ ctrlr->feat.keep_alive_timer.bits.kato * spdk_get_ticks_hz() / UINT64_C(1000);
+ if (now > keep_alive_timeout_tick) {
+ SPDK_NOTICELOG("Disconnecting host from subsystem %s due to keep alive timeout.\n",
+ ctrlr->subsys->subnqn);
+ /* set the Controller Fatal Status bit to '1' */
+ if (ctrlr->vcprop.csts.bits.cfs == 0) {
+ ctrlr->vcprop.csts.bits.cfs = 1;
+
+ /*
+ * disconnect qpairs, terminate Transport connection
+ * destroy ctrlr, break the host to controller association
+ * disconnect qpairs with qpair->ctrlr == ctrlr
+ */
+ spdk_for_each_channel(ctrlr->subsys->tgt,
+ nvmf_ctrlr_disconnect_qpairs_on_pg,
+ ctrlr,
+ nvmf_ctrlr_disconnect_qpairs_done);
+ }
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+nvmf_ctrlr_start_keep_alive_timer(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ if (!ctrlr) {
+ SPDK_ERRLOG("Controller is NULL\n");
+ return;
+ }
+
+ /* if cleared to 0 then the Keep Alive Timer is disabled */
+ if (ctrlr->feat.keep_alive_timer.bits.kato != 0) {
+
+ ctrlr->last_keep_alive_tick = spdk_get_ticks();
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Ctrlr add keep alive poller\n");
+ ctrlr->keep_alive_poller = SPDK_POLLER_REGISTER(nvmf_ctrlr_keep_alive_poll, ctrlr,
+ ctrlr->feat.keep_alive_timer.bits.kato * 1000);
+ }
+}
+
+static void
+ctrlr_add_qpair_and_update_rsp(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvmf_fabric_connect_rsp *rsp)
+{
+ assert(ctrlr->admin_qpair->group->thread == spdk_get_thread());
+
+ /* check if we would exceed ctrlr connection limit */
+ if (qpair->qid >= spdk_bit_array_capacity(ctrlr->qpair_mask)) {
+ SPDK_ERRLOG("Requested QID %u but Max QID is %u\n",
+ qpair->qid, spdk_bit_array_capacity(ctrlr->qpair_mask) - 1);
+ rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+ return;
+ }
+
+ if (spdk_bit_array_get(ctrlr->qpair_mask, qpair->qid)) {
+ SPDK_ERRLOG("Got I/O connect with duplicate QID %u\n", qpair->qid);
+ rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+ return;
+ }
+
+ qpair->ctrlr = ctrlr;
+ spdk_bit_array_set(ctrlr->qpair_mask, qpair->qid);
+
+ rsp->status.sc = SPDK_NVME_SC_SUCCESS;
+ rsp->status_code_specific.success.cntlid = ctrlr->cntlid;
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "connect capsule response: cntlid = 0x%04x\n",
+ rsp->status_code_specific.success.cntlid);
+}
+
+static void
+_nvmf_ctrlr_add_admin_qpair(void *ctx)
+{
+ struct spdk_nvmf_request *req = ctx;
+ struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp;
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
+
+ ctrlr->admin_qpair = qpair;
+ nvmf_ctrlr_start_keep_alive_timer(ctrlr);
+ ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp);
+ _nvmf_request_complete(req);
+}
+
+static void
+_nvmf_subsystem_add_ctrlr(void *ctx)
+{
+ struct spdk_nvmf_request *req = ctx;
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp;
+ struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
+
+ if (nvmf_subsystem_add_ctrlr(ctrlr->subsys, ctrlr)) {
+ SPDK_ERRLOG("Unable to add controller to subsystem\n");
+ spdk_bit_array_free(&ctrlr->qpair_mask);
+ free(ctrlr);
+ qpair->ctrlr = NULL;
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ spdk_nvmf_request_complete(req);
+ return;
+ }
+
+ spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_add_admin_qpair, req);
+}
+
+static void
+nvmf_ctrlr_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_ctrlr_data *cdata)
+{
+ cdata->kas = KAS_DEFAULT_VALUE;
+ cdata->sgls.supported = 1;
+ cdata->sgls.keyed_sgl = 1;
+ cdata->sgls.sgl_offset = 1;
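+ /* ioccsz and iorcsz are expressed in 16-byte units; ioccsz covers the 64-byte SQE
+ * plus any in-capsule data, iorcsz covers the 16-byte CQE.
+ */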
+ cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16;
+ cdata->nvmf_specific.ioccsz += transport->opts.in_capsule_data_size / 16;
+ cdata->nvmf_specific.iorcsz = sizeof(struct spdk_nvme_cpl) / 16;
+ cdata->nvmf_specific.icdoff = 0; /* offset starts directly after SQE */
+ cdata->nvmf_specific.ctrattr.ctrlr_model = SPDK_NVMF_CTRLR_MODEL_DYNAMIC;
+ cdata->nvmf_specific.msdbd = 1;
+
+ if (transport->ops->cdata_init) {
+ transport->ops->cdata_init(transport, subsystem, cdata);
+ }
+}
+
+static struct spdk_nvmf_ctrlr *
+nvmf_ctrlr_create(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_request *req,
+ struct spdk_nvmf_fabric_connect_cmd *connect_cmd,
+ struct spdk_nvmf_fabric_connect_data *connect_data)
+{
+ struct spdk_nvmf_ctrlr *ctrlr;
+ struct spdk_nvmf_transport *transport;
+
+ ctrlr = calloc(1, sizeof(*ctrlr));
+ if (ctrlr == NULL) {
+ SPDK_ERRLOG("Memory allocation failed\n");
+ return NULL;
+ }
+
+ TAILQ_INIT(&ctrlr->log_head);
+ ctrlr->subsys = subsystem;
+ ctrlr->thread = req->qpair->group->thread;
+
+ transport = req->qpair->transport;
+ ctrlr->qpair_mask = spdk_bit_array_create(transport->opts.max_qpairs_per_ctrlr);
+ if (!ctrlr->qpair_mask) {
+ SPDK_ERRLOG("Failed to allocate controller qpair mask\n");
+ free(ctrlr);
+ return NULL;
+ }
+
+ nvmf_ctrlr_cdata_init(transport, subsystem, &ctrlr->cdata);
+
+ /*
+ * KAS: This field indicates the granularity of the Keep Alive Timer in 100ms units.
+ * If this field is cleared to 0h, then Keep Alive is not supported.
+ */
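+ /* Round the requested KATO up to the KAS granularity (10 seconds with the
+ * defaults above); e.g. a requested KATO of 12345 ms becomes 20000 ms.
+ */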
+ if (ctrlr->cdata.kas) {
+ ctrlr->feat.keep_alive_timer.bits.kato = spdk_divide_round_up(connect_cmd->kato,
+ KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS) *
+ KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS;
+ }
+
+ ctrlr->feat.async_event_configuration.bits.ns_attr_notice = 1;
+ ctrlr->feat.volatile_write_cache.bits.wce = 1;
+
+ if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
+ /*
+ * If the keep-alive timeout is not set, discovery controllers use an
+ * arbitrarily high value in order to clean up stale discovery sessions.
+ *
+ * From the 1.0a nvme-of spec:
+ * "The Keep Alive command is reserved for
+ * Discovery controllers. A transport may specify a
+ * fixed Discovery controller activity timeout value
+ * (e.g., 2 minutes). If no commands are received
+ * by a Discovery controller within that time
+ * period, the controller may perform the
+ * actions for Keep Alive Timer expiration".
+ * KATO is in milliseconds.
+ */
+ if (ctrlr->feat.keep_alive_timer.bits.kato == 0) {
+ ctrlr->feat.keep_alive_timer.bits.kato = NVMF_DISC_KATO_IN_MS;
+ }
+ }
+
+ /* Subtract 1 for the admin queue and 1 because the field is 0's based. */
+ ctrlr->feat.number_of_queues.bits.ncqr = transport->opts.max_qpairs_per_ctrlr - 1 - 1;
+ ctrlr->feat.number_of_queues.bits.nsqr = transport->opts.max_qpairs_per_ctrlr - 1 - 1;
+
+ spdk_uuid_copy(&ctrlr->hostid, (struct spdk_uuid *)connect_data->hostid);
+ memcpy(ctrlr->hostnqn, connect_data->hostnqn, sizeof(ctrlr->hostnqn));
+
+ ctrlr->vcprop.cap.raw = 0;
+ ctrlr->vcprop.cap.bits.cqr = 1; /* NVMe-oF specification required */
+ ctrlr->vcprop.cap.bits.mqes = transport->opts.max_queue_depth -
+ 1; /* max queue depth */
+ ctrlr->vcprop.cap.bits.ams = 0; /* optional arb mechanisms */
+ ctrlr->vcprop.cap.bits.to = 1; /* ready timeout - 500 msec units */
+ ctrlr->vcprop.cap.bits.dstrd = 0; /* fixed to 0 for NVMe-oF */
+ ctrlr->vcprop.cap.bits.css = SPDK_NVME_CAP_CSS_NVM; /* NVM command set */
+ ctrlr->vcprop.cap.bits.mpsmin = 0; /* 2 ^ (12 + mpsmin) == 4k */
+ ctrlr->vcprop.cap.bits.mpsmax = 0; /* 2 ^ (12 + mpsmax) == 4k */
+
+ /* Version Supported: 1.3 */
+ ctrlr->vcprop.vs.bits.mjr = 1;
+ ctrlr->vcprop.vs.bits.mnr = 3;
+ ctrlr->vcprop.vs.bits.ter = 0;
+
+ ctrlr->vcprop.cc.raw = 0;
+ ctrlr->vcprop.cc.bits.en = 0; /* Init controller disabled */
+
+ ctrlr->vcprop.csts.raw = 0;
+ ctrlr->vcprop.csts.bits.rdy = 0; /* Init controller as not ready */
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cap 0x%" PRIx64 "\n", ctrlr->vcprop.cap.raw);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "vs 0x%x\n", ctrlr->vcprop.vs.raw);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cc 0x%x\n", ctrlr->vcprop.cc.raw);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "csts 0x%x\n", ctrlr->vcprop.csts.raw);
+
+ ctrlr->dif_insert_or_strip = transport->opts.dif_insert_or_strip;
+
+ req->qpair->ctrlr = ctrlr;
+ spdk_thread_send_msg(subsystem->thread, _nvmf_subsystem_add_ctrlr, req);
+
+ return ctrlr;
+}
+
+static void
+_nvmf_ctrlr_destruct(void *ctx)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = ctx;
+ struct spdk_nvmf_reservation_log *log, *log_tmp;
+
+ nvmf_ctrlr_stop_keep_alive_timer(ctrlr);
+
+ TAILQ_FOREACH_SAFE(log, &ctrlr->log_head, link, log_tmp) {
+ TAILQ_REMOVE(&ctrlr->log_head, log, link);
+ free(log);
+ }
+ free(ctrlr);
+}
+
+void
+nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ nvmf_subsystem_remove_ctrlr(ctrlr->subsys, ctrlr);
+
+ spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_destruct, ctrlr);
+}
+
+static void
+nvmf_ctrlr_add_io_qpair(void *ctx)
+{
+ struct spdk_nvmf_request *req = ctx;
+ struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp;
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
+
+ /* The unit tests check qpair->ctrlr after calling spdk_nvmf_ctrlr_connect.
+ * In the error case it must be NULL, so clear it up front.
+ */
+ qpair->ctrlr = NULL;
+
+ if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
+ SPDK_ERRLOG("I/O connect not allowed on discovery controller\n");
+ SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid);
+ goto end;
+ }
+
+ if (!ctrlr->vcprop.cc.bits.en) {
+ SPDK_ERRLOG("Got I/O connect before ctrlr was enabled\n");
+ SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid);
+ goto end;
+ }
+
+ if (1u << ctrlr->vcprop.cc.bits.iosqes != sizeof(struct spdk_nvme_cmd)) {
+ SPDK_ERRLOG("Got I/O connect with invalid IOSQES %u\n",
+ ctrlr->vcprop.cc.bits.iosqes);
+ SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid);
+ goto end;
+ }
+
+ if (1u << ctrlr->vcprop.cc.bits.iocqes != sizeof(struct spdk_nvme_cpl)) {
+ SPDK_ERRLOG("Got I/O connect with invalid IOCQES %u\n",
+ ctrlr->vcprop.cc.bits.iocqes);
+ SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid);
+ goto end;
+ }
+
+ ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp);
+end:
+ spdk_nvmf_request_complete(req);
+}
+
+static void
+_nvmf_ctrlr_add_io_qpair(void *ctx)
+{
+ struct spdk_nvmf_request *req = ctx;
+ struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp;
+ struct spdk_nvmf_fabric_connect_data *data = req->data;
+ struct spdk_nvmf_ctrlr *ctrlr;
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_qpair *admin_qpair;
+ struct spdk_nvmf_tgt *tgt = qpair->transport->tgt;
+ struct spdk_nvmf_subsystem *subsystem;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect I/O Queue for controller id 0x%x\n", data->cntlid);
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, data->subnqn);
+ /* We already checked this in spdk_nvmf_ctrlr_connect */
+ assert(subsystem != NULL);
+
+ ctrlr = nvmf_subsystem_get_ctrlr(subsystem, data->cntlid);
+ if (ctrlr == NULL) {
+ SPDK_ERRLOG("Unknown controller ID 0x%x\n", data->cntlid);
+ SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid);
+ spdk_nvmf_request_complete(req);
+ return;
+ }
+
+ admin_qpair = ctrlr->admin_qpair;
+ qpair->ctrlr = ctrlr;
+ spdk_thread_send_msg(admin_qpair->group->thread, nvmf_ctrlr_add_io_qpair, req);
+}
+
+static bool
+nvmf_qpair_access_allowed(struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_subsystem *subsystem,
+ const char *hostnqn)
+{
+ struct spdk_nvme_transport_id listen_trid = {};
+
+ if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) {
+ SPDK_ERRLOG("Subsystem '%s' does not allow host '%s'\n", subsystem->subnqn, hostnqn);
+ return false;
+ }
+
+ if (spdk_nvmf_qpair_get_listen_trid(qpair, &listen_trid)) {
+ SPDK_ERRLOG("Subsystem '%s' is unable to enforce access control due to an internal error.\n",
+ subsystem->subnqn);
+ return false;
+ }
+
+ if (!spdk_nvmf_subsystem_listener_allowed(subsystem, &listen_trid)) {
+ SPDK_ERRLOG("Subsystem '%s' does not allow host '%s' to connect at this address.\n",
+ subsystem->subnqn, hostnqn);
+ return false;
+ }
+
+ return true;
+}
+
+static int
+_nvmf_ctrlr_connect(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_fabric_connect_data *data = req->data;
+ struct spdk_nvmf_fabric_connect_cmd *cmd = &req->cmd->connect_cmd;
+ struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp;
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_transport *transport = qpair->transport;
+ struct spdk_nvmf_ctrlr *ctrlr;
+ struct spdk_nvmf_subsystem *subsystem;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "recfmt 0x%x qid %u sqsize %u\n",
+ cmd->recfmt, cmd->qid, cmd->sqsize);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect data:\n");
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, " cntlid: 0x%04x\n", data->cntlid);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostid: %08x-%04x-%04x-%02x%02x-%04x%08x ***\n",
+ ntohl(*(uint32_t *)&data->hostid[0]),
+ ntohs(*(uint16_t *)&data->hostid[4]),
+ ntohs(*(uint16_t *)&data->hostid[6]),
+ data->hostid[8],
+ data->hostid[9],
+ ntohs(*(uint16_t *)&data->hostid[10]),
+ ntohl(*(uint32_t *)&data->hostid[12]));
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, " subnqn: \"%s\"\n", data->subnqn);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostnqn: \"%s\"\n", data->hostnqn);
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(transport->tgt, data->subnqn);
+ if (!subsystem) {
+ SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (cmd->recfmt != 0) {
+ SPDK_ERRLOG("Connect command unsupported RECFMT %u\n", cmd->recfmt);
+ rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ rsp->status.sc = SPDK_NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ /*
+ * SQSIZE is a 0-based value, so it must be at least 1 (minimum queue depth is 2) and
+ * strictly less than max_aq_depth (admin queues) or max_queue_depth (io queues).
+ */
+ if (cmd->sqsize == 0) {
+ SPDK_ERRLOG("Invalid SQSIZE = 0\n");
+ SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (cmd->qid == 0) {
+ if (cmd->sqsize >= transport->opts.max_aq_depth) {
+ SPDK_ERRLOG("Invalid SQSIZE for admin queue %u (min 1, max %u)\n",
+ cmd->sqsize, transport->opts.max_aq_depth - 1);
+ SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+ } else if (cmd->sqsize >= transport->opts.max_queue_depth) {
+ SPDK_ERRLOG("Invalid SQSIZE %u (min 1, max %u)\n",
+ cmd->sqsize, transport->opts.max_queue_depth - 1);
+ SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ qpair->sq_head_max = cmd->sqsize;
+ qpair->qid = cmd->qid;
+
+ if (0 == qpair->qid) {
+ qpair->group->stat.admin_qpairs++;
+ } else {
+ qpair->group->stat.io_qpairs++;
+ }
+
+ if (cmd->qid == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect Admin Queue for controller ID 0x%x\n", data->cntlid);
+
+ if (data->cntlid != 0xFFFF) {
+ /* This NVMf target only supports dynamic mode. */
+ SPDK_ERRLOG("The NVMf target only supports dynamic mode (CNTLID = 0x%x).\n", data->cntlid);
+ SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ /* Establish a new ctrlr */
+ ctrlr = nvmf_ctrlr_create(subsystem, req, cmd, data);
+ if (!ctrlr) {
+ SPDK_ERRLOG("nvmf_ctrlr_create() failed\n");
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ } else {
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ } else {
+ spdk_thread_send_msg(subsystem->thread, _nvmf_ctrlr_add_io_qpair, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+}
+
+static inline bool
+nvmf_request_is_fabric_connect(struct spdk_nvmf_request *req)
+{
+ return req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC &&
+ req->cmd->nvmf_cmd.fctype == SPDK_NVMF_FABRIC_COMMAND_CONNECT;
+}
+
+static struct spdk_nvmf_subsystem_poll_group *
+nvmf_subsystem_pg_from_connect_cmd(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_fabric_connect_data *data;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ assert(nvmf_request_is_fabric_connect(req));
+ assert(req->qpair->ctrlr == NULL);
+
+ data = req->data;
+ tgt = req->qpair->transport->tgt;
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, data->subnqn);
+ if (subsystem == NULL) {
+ return NULL;
+ }
+
+ return &req->qpair->group->sgroups[subsystem->id];
+}
+
+int
+spdk_nvmf_ctrlr_connect(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp;
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+ enum spdk_nvmf_request_exec_status status;
+
+ sgroup = nvmf_subsystem_pg_from_connect_cmd(req);
+ if (!sgroup) {
+ SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn);
+ status = SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ goto out;
+ }
+
+ sgroup->io_outstanding++;
+ TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
+
+ status = _nvmf_ctrlr_connect(req);
+
+out:
+ if (status == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) {
+ _nvmf_request_complete(req);
+ }
+
+ return status;
+}
+
+static int
+nvmf_ctrlr_cmd_connect(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_fabric_connect_data *data = req->data;
+ struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp;
+ struct spdk_nvmf_transport *transport = req->qpair->transport;
+ struct spdk_nvmf_subsystem *subsystem;
+
+ if (req->length < sizeof(struct spdk_nvmf_fabric_connect_data)) {
+ SPDK_ERRLOG("Connect command data length 0x%x too small\n", req->length);
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(transport->tgt, data->subnqn);
+ if (!subsystem) {
+ SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if ((subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE) ||
+ (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSING) ||
+ (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) ||
+ (subsystem->state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING)) {
+ SPDK_ERRLOG("Subsystem '%s' is not ready\n", subsystem->subnqn);
+ rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ rsp->status.sc = SPDK_NVMF_FABRIC_SC_CONTROLLER_BUSY;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ /* Ensure that hostnqn is null terminated */
+ if (!memchr(data->hostnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1)) {
+ SPDK_ERRLOG("Connect HOSTNQN is not null terminated\n");
+ SPDK_NVMF_INVALID_CONNECT_DATA(rsp, hostnqn);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (!nvmf_qpair_access_allowed(req->qpair, subsystem, data->hostnqn)) {
+ rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return _nvmf_ctrlr_connect(req);
+}
+
+static void
+nvmf_ctrlr_cc_reset_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = spdk_io_channel_iter_get_ctx(i);
+
+ if (status < 0) {
+ SPDK_ERRLOG("Fail to disconnect io ctrlr qpairs\n");
+ assert(false);
+ }
+
+ /* Only a subset of the registers is cleared on a reset */
+ ctrlr->vcprop.cc.raw = 0;
+ ctrlr->vcprop.csts.raw = 0;
+}
+
+const struct spdk_nvmf_registers *
+spdk_nvmf_ctrlr_get_regs(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return &ctrlr->vcprop;
+}
+
+static uint64_t
+nvmf_prop_get_cap(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->vcprop.cap.raw;
+}
+
+static uint64_t
+nvmf_prop_get_vs(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->vcprop.vs.raw;
+}
+
+static uint64_t
+nvmf_prop_get_cc(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->vcprop.cc.raw;
+}
+
+static bool
+nvmf_prop_set_cc(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value)
+{
+ union spdk_nvme_cc_register cc, diff;
+
+ cc.raw = value;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cur CC: 0x%08x\n", ctrlr->vcprop.cc.raw);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "new CC: 0x%08x\n", cc.raw);
+
+ /*
+ * Calculate which bits changed between the current and new CC.
+ * Mark each bit as 0 once it is handled to determine if any unhandled bits were changed.
+ */
+ diff.raw = cc.raw ^ ctrlr->vcprop.cc.raw;
+
+ if (diff.bits.en) {
+ if (cc.bits.en) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Enable!\n");
+ ctrlr->vcprop.cc.bits.en = 1;
+ ctrlr->vcprop.csts.bits.rdy = 1;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Disable!\n");
+ ctrlr->vcprop.cc.bits.en = 0;
+ spdk_for_each_channel(ctrlr->subsys->tgt,
+ nvmf_ctrlr_disconnect_io_qpairs_on_pg,
+ ctrlr,
+ nvmf_ctrlr_cc_reset_done);
+ }
+ diff.bits.en = 0;
+ }
+
+ if (diff.bits.shn) {
+ if (cc.bits.shn == SPDK_NVME_SHN_NORMAL ||
+ cc.bits.shn == SPDK_NVME_SHN_ABRUPT) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Shutdown %u%ub!\n",
+ cc.bits.shn >> 1, cc.bits.shn & 1);
+ ctrlr->vcprop.cc.bits.shn = cc.bits.shn;
+ ctrlr->vcprop.cc.bits.en = 0;
+ ctrlr->vcprop.csts.bits.rdy = 0;
+ ctrlr->vcprop.csts.bits.shst = SPDK_NVME_SHST_COMPLETE;
+ } else if (cc.bits.shn == 0) {
+ ctrlr->vcprop.cc.bits.shn = 0;
+ } else {
+ SPDK_ERRLOG("Prop Set CC: Invalid SHN value %u%ub\n",
+ cc.bits.shn >> 1, cc.bits.shn & 1);
+ return false;
+ }
+ diff.bits.shn = 0;
+ }
+
+ if (diff.bits.iosqes) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOSQES = %u (%u bytes)\n",
+ cc.bits.iosqes, 1u << cc.bits.iosqes);
+ ctrlr->vcprop.cc.bits.iosqes = cc.bits.iosqes;
+ diff.bits.iosqes = 0;
+ }
+
+ if (diff.bits.iocqes) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOCQES = %u (%u bytes)\n",
+ cc.bits.iocqes, 1u << cc.bits.iocqes);
+ ctrlr->vcprop.cc.bits.iocqes = cc.bits.iocqes;
+ diff.bits.iocqes = 0;
+ }
+
+ if (diff.bits.ams) {
+ SPDK_ERRLOG("Arbitration Mechanism Selected (AMS) 0x%x not supported!\n", cc.bits.ams);
+ return false;
+ }
+
+ if (diff.bits.mps) {
+ SPDK_ERRLOG("Memory Page Size (MPS) %u KiB not supported!\n", (1 << (2 + cc.bits.mps)));
+ return false;
+ }
+
+ if (diff.bits.css) {
+ SPDK_ERRLOG("I/O Command Set Selected (CSS) 0x%x not supported!\n", cc.bits.css);
+ return false;
+ }
+
+ if (diff.raw != 0) {
+ SPDK_ERRLOG("Prop Set CC toggled reserved bits 0x%x!\n", diff.raw);
+ return false;
+ }
+
+ return true;
+}
+
+static uint64_t
+nvmf_prop_get_csts(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->vcprop.csts.raw;
+}
+
+static uint64_t
+nvmf_prop_get_aqa(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->vcprop.aqa.raw;
+}
+
+static bool
+nvmf_prop_set_aqa(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value)
+{
+ union spdk_nvme_aqa_register aqa;
+
+ aqa.raw = value;
+
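+ /* ASQS and ACQS are 0-based queue sizes and must not exceed the 0-based MQES reported in CAP. */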
+ if (aqa.bits.asqs > ctrlr->vcprop.cap.bits.mqes ||
+ aqa.bits.acqs > ctrlr->vcprop.cap.bits.mqes) {
+ return false;
+ }
+
+ ctrlr->vcprop.aqa.raw = value;
+
+ return true;
+}
+
+static uint64_t
+nvmf_prop_get_asq(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->vcprop.asq;
+}
+
+static bool
+nvmf_prop_set_asq_lower(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value)
+{
+ ctrlr->vcprop.asq = (ctrlr->vcprop.asq & (0xFFFFFFFFULL << 32ULL)) | value;
+
+ return true;
+}
+
+static bool
+nvmf_prop_set_asq_upper(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value)
+{
+ ctrlr->vcprop.asq = (ctrlr->vcprop.asq & 0xFFFFFFFFULL) | ((uint64_t)value << 32ULL);
+
+ return true;
+}
+
+static uint64_t
+nvmf_prop_get_acq(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->vcprop.acq;
+}
+
+static bool
+nvmf_prop_set_acq_lower(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value)
+{
+ ctrlr->vcprop.acq = (ctrlr->vcprop.acq & (0xFFFFFFFFULL << 32ULL)) | value;
+
+ return true;
+}
+
+static bool
+nvmf_prop_set_acq_upper(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value)
+{
+ ctrlr->vcprop.acq = (ctrlr->vcprop.acq & 0xFFFFFFFFULL) | ((uint64_t)value << 32ULL);
+
+ return true;
+}
+
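+/*
+ * Table-driven access to the controller's virtual register space: each entry
+ * maps a register offset and size to get/set callbacks. 64-bit properties
+ * provide a separate callback for writing the upper 32 bits.
+ */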
+struct nvmf_prop {
+ uint32_t ofst;
+ uint8_t size;
+ char name[11];
+ uint64_t (*get_cb)(struct spdk_nvmf_ctrlr *ctrlr);
+ bool (*set_cb)(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value);
+ bool (*set_upper_cb)(struct spdk_nvmf_ctrlr *ctrlr, uint32_t value);
+};
+
+#define PROP(field, size, get_cb, set_cb, set_upper_cb) \
+ { \
+ offsetof(struct spdk_nvme_registers, field), \
+ size, \
+ #field, \
+ get_cb, set_cb, set_upper_cb \
+ }
+
+static const struct nvmf_prop nvmf_props[] = {
+ PROP(cap, 8, nvmf_prop_get_cap, NULL, NULL),
+ PROP(vs, 4, nvmf_prop_get_vs, NULL, NULL),
+ PROP(cc, 4, nvmf_prop_get_cc, nvmf_prop_set_cc, NULL),
+ PROP(csts, 4, nvmf_prop_get_csts, NULL, NULL),
+ PROP(aqa, 4, nvmf_prop_get_aqa, nvmf_prop_set_aqa, NULL),
+ PROP(asq, 8, nvmf_prop_get_asq, nvmf_prop_set_asq_lower, nvmf_prop_set_asq_upper),
+ PROP(acq, 8, nvmf_prop_get_acq, nvmf_prop_set_acq_lower, nvmf_prop_set_acq_upper),
+};
+
+static const struct nvmf_prop *
+find_prop(uint32_t ofst, uint8_t size)
+{
+ size_t i;
+
+ for (i = 0; i < SPDK_COUNTOF(nvmf_props); i++) {
+ const struct nvmf_prop *prop = &nvmf_props[i];
+
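+ /* The requested access must fall entirely within a single property's range. */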
+ if ((ofst >= prop->ofst) && (ofst + size <= prop->ofst + prop->size)) {
+ return prop;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+nvmf_property_get(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvmf_fabric_prop_get_cmd *cmd = &req->cmd->prop_get_cmd;
+ struct spdk_nvmf_fabric_prop_get_rsp *response = &req->rsp->prop_get_rsp;
+ const struct nvmf_prop *prop;
+ uint8_t size;
+
+ response->status.sc = 0;
+ response->value.u64 = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x\n",
+ cmd->attrib.size, cmd->ofst);
+
+ switch (cmd->attrib.size) {
+ case SPDK_NVMF_PROP_SIZE_4:
+ size = 4;
+ break;
+ case SPDK_NVMF_PROP_SIZE_8:
+ size = 8;
+ break;
+ default:
+ SPDK_ERRLOG("Invalid size value %d\n", cmd->attrib.size);
+ response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ prop = find_prop(cmd->ofst, size);
+ if (prop == NULL || prop->get_cb == NULL) {
+ response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name);
+
+ response->value.u64 = prop->get_cb(ctrlr);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "response value: 0x%" PRIx64 "\n", response->value.u64);
+
+ if (size != prop->size) {
+ /* The size must be 4 and the prop->size is 8. Figure out which part of the property to read. */
+ assert(size == 4);
+ assert(prop->size == 8);
+
+ if (cmd->ofst == prop->ofst) {
+ /* Keep bottom 4 bytes only */
+ response->value.u64 &= 0xFFFFFFFF;
+ } else {
+ /* Keep top 4 bytes only */
+ response->value.u64 >>= 32;
+ }
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_property_set(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvmf_fabric_prop_set_cmd *cmd = &req->cmd->prop_set_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ const struct nvmf_prop *prop;
+ uint64_t value;
+ uint8_t size;
+ bool ret;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x, value 0x%" PRIx64 "\n",
+ cmd->attrib.size, cmd->ofst, cmd->value.u64);
+
+ switch (cmd->attrib.size) {
+ case SPDK_NVMF_PROP_SIZE_4:
+ size = 4;
+ break;
+ case SPDK_NVMF_PROP_SIZE_8:
+ size = 8;
+ break;
+ default:
+ SPDK_ERRLOG("Invalid size value %d\n", cmd->attrib.size);
+ response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ prop = find_prop(cmd->ofst, size);
+ if (prop == NULL || prop->set_cb == NULL) {
+ SPDK_ERRLOG("Invalid offset 0x%x\n", cmd->ofst);
+ response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name);
+
+ value = cmd->value.u64;
+
+ if (prop->size == 4) {
+ ret = prop->set_cb(ctrlr, (uint32_t)value);
+ } else if (size != prop->size) {
+ /* The size must be 4 and the prop->size is 8. Figure out which part of the property to write. */
+ assert(size == 4);
+ assert(prop->size == 8);
+
+ if (cmd->ofst == prop->ofst) {
+ ret = prop->set_cb(ctrlr, (uint32_t)value);
+ } else {
+ ret = prop->set_upper_cb(ctrlr, (uint32_t)value);
+ }
+ } else {
+ ret = prop->set_cb(ctrlr, (uint32_t)value);
+ if (ret) {
+ ret = prop->set_upper_cb(ctrlr, (uint32_t)(value >> 32));
+ }
+ }
+
+ if (!ret) {
+ SPDK_ERRLOG("prop set_cb failed\n");
+ response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_arbitration(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Arbitration (cdw11 = 0x%0x)\n", cmd->cdw11);
+
+ ctrlr->feat.arbitration.raw = cmd->cdw11;
+ ctrlr->feat.arbitration.bits.reserved = 0;
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_power_management(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Power Management (cdw11 = 0x%0x)\n", cmd->cdw11);
+
+ /* Only PS = 0 is allowed, since we report NPSS = 0 */
+ if (cmd->cdw11_bits.feat_power_management.bits.ps != 0) {
+ SPDK_ERRLOG("Invalid power state %u\n", cmd->cdw11_bits.feat_power_management.bits.ps);
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ ctrlr->feat.power_management.raw = cmd->cdw11;
+ ctrlr->feat.power_management.bits.reserved = 0;
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static bool
+temp_threshold_opts_valid(const union spdk_nvme_feat_temperature_threshold *opts)
+{
+ /*
+ * Valid TMPSEL values:
+ * 0000b - 1000b: temperature sensors
+ * 1111b: set all implemented temperature sensors
+ */
+ if (opts->bits.tmpsel >= 9 && opts->bits.tmpsel != 15) {
+ /* 1001b - 1110b: reserved */
+ SPDK_ERRLOG("Invalid TMPSEL %u\n", opts->bits.tmpsel);
+ return false;
+ }
+
+ /*
+ * Valid THSEL values:
+ * 00b: over temperature threshold
+ * 01b: under temperature threshold
+ */
+ if (opts->bits.thsel > 1) {
+ /* 10b - 11b: reserved */
+ SPDK_ERRLOG("Invalid THSEL %u\n", opts->bits.thsel);
+ return false;
+ }
+
+ return true;
+}
+
+static int
+nvmf_ctrlr_set_features_temperature_threshold(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11);
+
+ if (!temp_threshold_opts_valid(&cmd->cdw11_bits.feat_temp_threshold)) {
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ /* TODO: no sensors implemented - ignore new values */
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_get_features_temperature_threshold(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11);
+
+ if (!temp_threshold_opts_valid(&cmd->cdw11_bits.feat_temp_threshold)) {
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ /* TODO: no sensors implemented - return 0 for all thresholds */
+ rsp->cdw0 = 0;
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_error_recovery(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Error Recovery (cdw11 = 0x%0x)\n", cmd->cdw11);
+
+ if (cmd->cdw11_bits.feat_error_recovery.bits.dulbe) {
+ /*
+ * Host is not allowed to set this bit, since we don't advertise it in
+ * Identify Namespace.
+ */
+ SPDK_ERRLOG("Host set unsupported DULBE bit\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ ctrlr->feat.error_recovery.raw = cmd->cdw11;
+ ctrlr->feat.error_recovery.bits.reserved = 0;
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_volatile_write_cache(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache (cdw11 = 0x%0x)\n", cmd->cdw11);
+
+ ctrlr->feat.volatile_write_cache.raw = cmd->cdw11;
+ ctrlr->feat.volatile_write_cache.bits.reserved = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache %s\n",
+ ctrlr->feat.volatile_write_cache.bits.wce ? "Enabled" : "Disabled");
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_write_atomicity(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Write Atomicity (cdw11 = 0x%0x)\n", cmd->cdw11);
+
+ ctrlr->feat.write_atomicity.raw = cmd->cdw11;
+ ctrlr->feat.write_atomicity.bits.reserved = 0;
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_host_identifier(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+
+ SPDK_ERRLOG("Set Features - Host Identifier not allowed\n");
+ response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_get_features_host_identifier(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Host Identifier\n");
+
+ if (!cmd->cdw11_bits.feat_host_identifier.bits.exhid) {
+ /* NVMe over Fabrics requires EXHID=1 (128-bit/16-byte host ID) */
+ SPDK_ERRLOG("Get Features - Host Identifier with EXHID=0 not allowed\n");
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (req->data == NULL || req->length < sizeof(ctrlr->hostid)) {
+ SPDK_ERRLOG("Invalid data buffer for Get Features - Host Identifier\n");
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ spdk_uuid_copy((struct spdk_uuid *)req->data, &ctrlr->hostid);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_get_features_reservation_notification_mask(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_ns *ns;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Reservation Notification Mask\n");
+
+ if (cmd->nsid == 0xffffffffu) {
+ SPDK_ERRLOG("get Features - Invalid Namespace ID\n");
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid);
+ if (ns == NULL) {
+ SPDK_ERRLOG("Set Features - Invalid Namespace ID\n");
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+ rsp->cdw0 = ns->mask;
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_reservation_notification_mask(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_ns *ns;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Reservation Notification Mask\n");
+
+ if (cmd->nsid == 0xffffffffu) {
+ for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL;
+ ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) {
+ ns->mask = cmd->cdw11;
+ }
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid);
+ if (ns == NULL) {
+ SPDK_ERRLOG("Set Features - Invalid Namespace ID\n");
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+ ns->mask = cmd->cdw11;
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_get_features_reservation_persistence(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_ns *ns;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Reservation Persistence\n");
+
+ ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid);
+ /* An NSID of 0xffffffffu is also rejected here, since _nvmf_subsystem_get_ns() returns NULL for it. */
+ if (ns == NULL) {
+ SPDK_ERRLOG("Get Features - Invalid Namespace ID\n");
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ response->cdw0 = ns->ptpl_activated;
+
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_SUCCESS;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_reservation_persistence(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_ns *ns;
+ bool ptpl;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Reservation Persistence\n");
+
+ ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid);
+ ptpl = cmd->cdw11_bits.feat_rsv_persistence.bits.ptpl;
+
+ if (cmd->nsid != 0xffffffffu && ns && ns->ptpl_file) {
+ ns->ptpl_activated = ptpl;
+ } else if (cmd->nsid == 0xffffffffu) {
+ for (ns = spdk_nvmf_subsystem_get_first_ns(ctrlr->subsys); ns && ns->ptpl_file;
+ ns = spdk_nvmf_subsystem_get_next_ns(ctrlr->subsys, ns)) {
+ ns->ptpl_activated = ptpl;
+ }
+ } else {
+ SPDK_ERRLOG("Set Features - Invalid Namespace ID or Reservation Configuration\n");
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ /* TODO: Feature not changeable for now */
+ response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ response->status.sc = SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_keep_alive_timer(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer (%u ms)\n", cmd->cdw11);
+
+ /*
+ * If the host attempts to disable keep alive by setting KATO to 0h,
+ * a status of Keep Alive Invalid shall be returned.
+ */
+ if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato == 0) {
+ rsp->status.sc = SPDK_NVME_SC_KEEP_ALIVE_INVALID;
+ } else if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato < MIN_KEEP_ALIVE_TIMEOUT_IN_MS) {
+ ctrlr->feat.keep_alive_timer.bits.kato = MIN_KEEP_ALIVE_TIMEOUT_IN_MS;
+ } else {
+ /* round up to a multiple of the KAS granularity (KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS milliseconds) */
+ ctrlr->feat.keep_alive_timer.bits.kato = spdk_divide_round_up(
+ cmd->cdw11_bits.feat_keep_alive_timer.bits.kato,
+ KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS) *
+ KAS_DEFAULT_VALUE * KAS_TIME_UNIT_IN_MS;
+ }
+
+ /*
+ * If the keep alive timeout value was changed successfully,
+ * update the keep alive poller.
+ */
+ if (cmd->cdw11_bits.feat_keep_alive_timer.bits.kato != 0) {
+ if (ctrlr->keep_alive_poller != NULL) {
+ spdk_poller_unregister(&ctrlr->keep_alive_poller);
+ }
+ ctrlr->keep_alive_poller = SPDK_POLLER_REGISTER(nvmf_ctrlr_keep_alive_poll, ctrlr,
+ ctrlr->feat.keep_alive_timer.bits.kato * 1000);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer set to %u ms\n",
+ ctrlr->feat.keep_alive_timer.bits.kato);
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_number_of_queues(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ uint32_t count;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Number of Queues, cdw11 0x%x\n",
+ req->cmd->nvme_cmd.cdw11);
+
+ count = spdk_bit_array_count_set(ctrlr->qpair_mask);
+ /* The number of queues can only be changed before any I/O queue pairs are connected. */
+ if (count > 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Queue pairs already active!\n");
+ rsp->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
+ } else {
+ /*
+ * Ignore the value requested by the host -
+ * always return the pre-configured value based on max_qpairs_allowed.
+ */
+ rsp->cdw0 = ctrlr->feat.number_of_queues.raw;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_set_features_async_event_configuration(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Async Event Configuration, cdw11 0x%08x\n",
+ cmd->cdw11);
+ ctrlr->feat.async_event_configuration.raw = cmd->cdw11;
+ ctrlr->feat.async_event_configuration.bits.reserved = 0;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_async_event_request(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Async Event Request\n");
+
+ /* Four asynchronous events are supported for now */
+ if (ctrlr->nr_aer_reqs >= NVMF_MAX_ASYNC_EVENTS) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "AERL exceeded\n");
+ rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ rsp->status.sc = SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (ctrlr->notice_event.bits.async_event_type ==
+ SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) {
+ rsp->cdw0 = ctrlr->notice_event.raw;
+ ctrlr->notice_event.raw = 0;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (ctrlr->reservation_event.bits.async_event_type ==
+ SPDK_NVME_ASYNC_EVENT_TYPE_IO) {
+ rsp->cdw0 = ctrlr->reservation_event.raw;
+ ctrlr->reservation_event.raw = 0;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ /* AER commands are an exception: they can remain outstanding indefinitely, so they are not counted as outstanding I/O for the poll group. */
+ sgroup = &req->qpair->group->sgroups[ctrlr->subsys->id];
+ assert(sgroup != NULL);
+ sgroup->io_outstanding--;
+
+ ctrlr->aer_req[ctrlr->nr_aer_reqs++] = req;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+static void
+nvmf_get_firmware_slot_log_page(void *buffer, uint64_t offset, uint32_t length)
+{
+ struct spdk_nvme_firmware_page fw_page;
+ size_t copy_len;
+
+ memset(&fw_page, 0, sizeof(fw_page));
+ fw_page.afi.active_slot = 1;
+ fw_page.afi.next_reset_slot = 0;
+ spdk_strcpy_pad(fw_page.revision[0], FW_VERSION, sizeof(fw_page.revision[0]), ' ');
+
+ if (offset < sizeof(fw_page)) {
+ copy_len = spdk_min(sizeof(fw_page) - offset, length);
+ if (copy_len > 0) {
+ memcpy(buffer, (const char *)&fw_page + offset, copy_len);
+ }
+ }
+}
+
+void
+nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid)
+{
+ uint16_t max_changes = SPDK_COUNTOF(ctrlr->changed_ns_list.ns_list);
+ uint16_t i;
+ bool found = false;
+
+ for (i = 0; i < ctrlr->changed_ns_list_count; i++) {
+ if (ctrlr->changed_ns_list.ns_list[i] == nsid) {
+ /* nsid is already in the list */
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (ctrlr->changed_ns_list_count == max_changes) {
+ /* Out of space - set first entry to FFFFFFFFh and zero-fill the rest. */
+ ctrlr->changed_ns_list.ns_list[0] = 0xFFFFFFFFu;
+ for (i = 1; i < max_changes; i++) {
+ ctrlr->changed_ns_list.ns_list[i] = 0;
+ }
+ } else {
+ ctrlr->changed_ns_list.ns_list[ctrlr->changed_ns_list_count++] = nsid;
+ }
+ }
+}
+
+static void
+nvmf_get_changed_ns_list_log_page(struct spdk_nvmf_ctrlr *ctrlr,
+ void *buffer, uint64_t offset, uint32_t length)
+{
+ size_t copy_length;
+
+ if (offset < sizeof(ctrlr->changed_ns_list)) {
+ copy_length = spdk_min(length, sizeof(ctrlr->changed_ns_list) - offset);
+ if (copy_length) {
+ memcpy(buffer, (char *)&ctrlr->changed_ns_list + offset, copy_length);
+ }
+ }
+
+ /* Clear log page each time it is read */
+ ctrlr->changed_ns_list_count = 0;
+ memset(&ctrlr->changed_ns_list, 0, sizeof(ctrlr->changed_ns_list));
+}
+
+/* This table can be extended if support for other commands is added in the future */
+static const struct spdk_nvme_cmds_and_effect_log_page g_cmds_and_effect_log_page = {
+ .admin_cmds_supported = {
+ /* CSUPP, LBCC, NCC, NIC, CCC, CSE */
+ /* Get Log Page */
+ [SPDK_NVME_OPC_GET_LOG_PAGE] = {1, 0, 0, 0, 0, 0, 0, 0},
+ /* Identify */
+ [SPDK_NVME_OPC_IDENTIFY] = {1, 0, 0, 0, 0, 0, 0, 0},
+ /* Abort */
+ [SPDK_NVME_OPC_ABORT] = {1, 0, 0, 0, 0, 0, 0, 0},
+ /* Set Features */
+ [SPDK_NVME_OPC_SET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0},
+ /* Get Features */
+ [SPDK_NVME_OPC_GET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0},
+ /* Async Event Request */
+ [SPDK_NVME_OPC_ASYNC_EVENT_REQUEST] = {1, 0, 0, 0, 0, 0, 0, 0},
+ /* Keep Alive */
+ [SPDK_NVME_OPC_KEEP_ALIVE] = {1, 0, 0, 0, 0, 0, 0, 0},
+ },
+ .io_cmds_supported = {
+ /* FLUSH */
+ [SPDK_NVME_OPC_FLUSH] = {1, 1, 0, 0, 0, 0, 0, 0},
+ /* WRITE */
+ [SPDK_NVME_OPC_WRITE] = {1, 1, 0, 0, 0, 0, 0, 0},
+ /* READ */
+ [SPDK_NVME_OPC_READ] = {1, 0, 0, 0, 0, 0, 0, 0},
+ /* WRITE ZEROES */
+ [SPDK_NVME_OPC_WRITE_ZEROES] = {1, 1, 0, 0, 0, 0, 0, 0},
+ /* DATASET MANAGEMENT */
+ [SPDK_NVME_OPC_DATASET_MANAGEMENT] = {1, 1, 0, 0, 0, 0, 0, 0},
+ /* COMPARE */
+ [SPDK_NVME_OPC_COMPARE] = {1, 0, 0, 0, 0, 0, 0, 0},
+ },
+};
+
+static void
+nvmf_get_cmds_and_effects_log_page(void *buffer,
+ uint64_t offset, uint32_t length)
+{
+ uint32_t page_size = sizeof(struct spdk_nvme_cmds_and_effect_log_page);
+ size_t copy_len = 0;
+ size_t zero_len = length;
+
+ if (offset < page_size) {
+ copy_len = spdk_min(page_size - offset, length);
+ zero_len -= copy_len;
+ memcpy(buffer, (char *)(&g_cmds_and_effect_log_page) + offset, copy_len);
+ }
+
+ if (zero_len) {
+ memset((char *)buffer + copy_len, 0, zero_len);
+ }
+}
+
+static void
+nvmf_get_reservation_notification_log_page(struct spdk_nvmf_ctrlr *ctrlr,
+ void *data, uint64_t offset, uint32_t length)
+{
+ uint32_t unit_log_len, avail_log_len, next_pos, copy_len;
+ struct spdk_nvmf_reservation_log *log, *log_tmp;
+ uint8_t *buf = data;
+
+ unit_log_len = sizeof(struct spdk_nvme_reservation_notification_log);
+ /* No available log, return 1 zeroed log page */
+ if (!ctrlr->num_avail_log_pages) {
+ memset(buf, 0, spdk_min(length, unit_log_len));
+ return;
+ }
+
+ avail_log_len = ctrlr->num_avail_log_pages * unit_log_len;
+ if (offset >= avail_log_len) {
+ return;
+ }
+
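+ /* Log entries are consumed as they are read: each entry is removed from the list and freed once it has been copied out (or skipped). */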
+ next_pos = copy_len = 0;
+ TAILQ_FOREACH_SAFE(log, &ctrlr->log_head, link, log_tmp) {
+ TAILQ_REMOVE(&ctrlr->log_head, log, link);
+ ctrlr->num_avail_log_pages--;
+
+ next_pos += unit_log_len;
+ if (next_pos > offset) {
+ copy_len = spdk_min(next_pos - offset, length);
+ memcpy(buf, &log->log, copy_len);
+ length -= copy_len;
+ offset += copy_len;
+ buf += copy_len;
+ }
+ free(log);
+
+ if (length == 0) {
+ break;
+ }
+ }
+ return;
+}
+
+static int
+nvmf_ctrlr_get_log_page(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ uint64_t offset, len;
+ uint32_t numdl, numdu;
+ uint8_t lid;
+
+ if (req->data == NULL) {
+ SPDK_ERRLOG("get log command with no buffer\n");
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
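+ /* The log page offset is split across CDW12 (lower 32 bits) and CDW13 (upper 32 bits) and must be dword aligned. */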
+ offset = (uint64_t)cmd->cdw12 | ((uint64_t)cmd->cdw13 << 32);
+ if (offset & 3) {
+ SPDK_ERRLOG("Invalid log page offset 0x%" PRIx64 "\n", offset);
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
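+ /* NUMD is a 0-based dword count split across NUMDL (CDW10) and NUMDU (CDW11). */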
+ numdl = cmd->cdw10_bits.get_log_page.numdl;
+ numdu = cmd->cdw11_bits.get_log_page.numdu;
+ len = ((numdu << 16) + numdl + (uint64_t)1) * 4;
+ if (len > req->length) {
+ SPDK_ERRLOG("Get log page: len (%" PRIu64 ") > buf size (%u)\n",
+ len, req->length);
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ lid = cmd->cdw10_bits.get_log_page.lid;
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get log page: LID=0x%02X offset=0x%" PRIx64 " len=0x%" PRIx64 "\n",
+ lid, offset, len);
+
+ if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
+ switch (lid) {
+ case SPDK_NVME_LOG_DISCOVERY:
+ nvmf_get_discovery_log_page(subsystem->tgt, ctrlr->hostnqn, req->iov, req->iovcnt, offset,
+ len);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ default:
+ goto invalid_log_page;
+ }
+ } else {
+ switch (lid) {
+ case SPDK_NVME_LOG_ERROR:
+ case SPDK_NVME_LOG_HEALTH_INFORMATION:
+ /* TODO: actually fill out log page data */
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ case SPDK_NVME_LOG_FIRMWARE_SLOT:
+ nvmf_get_firmware_slot_log_page(req->data, offset, len);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ case SPDK_NVME_LOG_COMMAND_EFFECTS_LOG:
+ nvmf_get_cmds_and_effects_log_page(req->data, offset, len);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ case SPDK_NVME_LOG_CHANGED_NS_LIST:
+ nvmf_get_changed_ns_list_log_page(ctrlr, req->data, offset, len);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ case SPDK_NVME_LOG_RESERVATION_NOTIFICATION:
+ nvmf_get_reservation_notification_log_page(ctrlr, req->data, offset, len);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ default:
+ goto invalid_log_page;
+ }
+ }
+
+invalid_log_page:
+ SPDK_ERRLOG("Unsupported Get Log Page 0x%02X\n", lid);
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+int
+spdk_nvmf_ctrlr_identify_ns(struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvme_cmd *cmd,
+ struct spdk_nvme_cpl *rsp,
+ struct spdk_nvme_ns_data *nsdata)
+{
+ struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys;
+ struct spdk_nvmf_ns *ns;
+ uint32_t max_num_blocks;
+
+ if (cmd->nsid == 0 || cmd->nsid > subsystem->max_nsid) {
+ SPDK_ERRLOG("Identify Namespace for invalid NSID %u\n", cmd->nsid);
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ ns = _nvmf_subsystem_get_ns(subsystem, cmd->nsid);
+ if (ns == NULL || ns->bdev == NULL) {
+ /*
+ * Inactive namespaces should return a zero filled data structure.
+ * The data buffer is already zeroed by nvmf_ctrlr_process_admin_cmd(),
+ * so we can just return early here.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Identify Namespace for inactive NSID %u\n", cmd->nsid);
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_SUCCESS;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ nvmf_bdev_ctrlr_identify_ns(ns, nsdata, ctrlr->dif_insert_or_strip);
+
+ /* Due to a bug in the Linux kernel NVMe driver, we have to set noiob no larger than mdts */
+ max_num_blocks = ctrlr->admin_qpair->transport->opts.max_io_size /
+ (1U << nsdata->lbaf[nsdata->flbas.format].lbads);
+ if (nsdata->noiob > max_num_blocks) {
+ nsdata->noiob = max_num_blocks;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static void
+nvmf_ctrlr_populate_oacs(struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvme_ctrlr_data *cdata)
+{
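+ /* Advertise optional admin command support only when custom handlers are registered for the corresponding opcodes. */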
+ cdata->oacs.virtualization_management =
+ g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT].hdlr != NULL;
+ cdata->oacs.nvme_mi = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NVME_MI_SEND].hdlr != NULL
+ && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NVME_MI_RECEIVE].hdlr != NULL;
+ cdata->oacs.directives = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DIRECTIVE_SEND].hdlr != NULL
+ && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DIRECTIVE_RECEIVE].hdlr != NULL;
+ cdata->oacs.device_self_test =
+ g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_DEVICE_SELF_TEST].hdlr != NULL;
+ cdata->oacs.ns_manage = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NS_MANAGEMENT].hdlr != NULL
+ && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_NS_ATTACHMENT].hdlr != NULL;
+ cdata->oacs.firmware = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD].hdlr !=
+ NULL
+ && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FIRMWARE_COMMIT].hdlr != NULL;
+ cdata->oacs.format =
+ g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_FORMAT_NVM].hdlr != NULL;
+ cdata->oacs.security = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_SECURITY_SEND].hdlr != NULL
+ && g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_SECURITY_RECEIVE].hdlr != NULL;
+ cdata->oacs.get_lba_status = g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_GET_LBA_STATUS].hdlr !=
+ NULL;
+}
+
+int
+spdk_nvmf_ctrlr_identify_ctrlr(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_ctrlr_data *cdata)
+{
+ struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys;
+ struct spdk_nvmf_transport *transport = ctrlr->admin_qpair->transport;
+
+ /*
+ * Common fields for discovery and NVM subsystems
+ */
+ spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' ');
+ assert((transport->opts.max_io_size % 4096) == 0);
+ cdata->mdts = spdk_u32log2(transport->opts.max_io_size / 4096);
+ cdata->cntlid = ctrlr->cntlid;
+ cdata->ver = ctrlr->vcprop.vs;
+ cdata->aerl = NVMF_MAX_ASYNC_EVENTS - 1;
+ cdata->lpa.edlp = 1;
+ cdata->elpe = 127;
+ cdata->maxcmd = transport->opts.max_queue_depth;
+ cdata->sgls = ctrlr->cdata.sgls;
+ cdata->fuses.compare_and_write = 1;
+ cdata->acwu = 1;
+ spdk_strcpy_pad(cdata->subnqn, subsystem->subnqn, sizeof(cdata->subnqn), '\0');
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr data: maxcmd 0x%x\n", cdata->maxcmd);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "sgls data: 0x%x\n", from_le32(&cdata->sgls));
+
+ /*
+ * NVM subsystem fields (reserved for discovery subsystems)
+ */
+ if (subsystem->subtype == SPDK_NVMF_SUBTYPE_NVME) {
+ spdk_strcpy_pad(cdata->mn, spdk_nvmf_subsystem_get_mn(subsystem), sizeof(cdata->mn), ' ');
+ spdk_strcpy_pad(cdata->sn, spdk_nvmf_subsystem_get_sn(subsystem), sizeof(cdata->sn), ' ');
+ cdata->kas = ctrlr->cdata.kas;
+
+ cdata->rab = 6;
+ cdata->cmic.multi_port = 1;
+ cdata->cmic.multi_host = 1;
+ cdata->oaes.ns_attribute_notices = 1;
+ cdata->ctratt.host_id_exhid_supported = 1;
+ /* TODO: Concurrent execution of multiple abort commands. */
+ cdata->acl = 0;
+ cdata->aerl = 0;
+ cdata->frmw.slot1_ro = 1;
+ cdata->frmw.num_slots = 1;
+
+ cdata->lpa.celp = 1; /* Command Effects log page supported */
+
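+ /* NVMe over Fabrics requires fixed 64-byte (2^6) SQ entries and 16-byte (2^4) CQ entries. */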
+ cdata->sqes.min = 6;
+ cdata->sqes.max = 6;
+ cdata->cqes.min = 4;
+ cdata->cqes.max = 4;
+ cdata->nn = subsystem->max_nsid;
+ cdata->vwc.present = 1;
+ cdata->vwc.flush_broadcast = SPDK_NVME_FLUSH_BROADCAST_NOT_SUPPORTED;
+
+ cdata->nvmf_specific = ctrlr->cdata.nvmf_specific;
+
+ cdata->oncs.dsm = nvmf_ctrlr_dsm_supported(ctrlr);
+ cdata->oncs.write_zeroes = nvmf_ctrlr_write_zeroes_supported(ctrlr);
+ cdata->oncs.reservations = 1;
+
+ nvmf_ctrlr_populate_oacs(ctrlr, cdata);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ioccsz 0x%x\n",
+ cdata->nvmf_specific.ioccsz);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: iorcsz 0x%x\n",
+ cdata->nvmf_specific.iorcsz);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: icdoff 0x%x\n",
+ cdata->nvmf_specific.icdoff);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ctrattr 0x%x\n",
+ *(uint8_t *)&cdata->nvmf_specific.ctrattr);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: msdbd 0x%x\n",
+ cdata->nvmf_specific.msdbd);
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_identify_active_ns_list(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvme_cmd *cmd,
+ struct spdk_nvme_cpl *rsp,
+ struct spdk_nvme_ns_list *ns_list)
+{
+ struct spdk_nvmf_ns *ns;
+ uint32_t count = 0;
+
+ if (cmd->nsid >= 0xfffffffeUL) {
+ SPDK_ERRLOG("Identify Active Namespace List with invalid NSID %u\n", cmd->nsid);
+ rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL;
+ ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) {
+ if (ns->opts.nsid <= cmd->nsid) {
+ continue;
+ }
+
+ ns_list->ns_list[count++] = ns->opts.nsid;
+ if (count == SPDK_COUNTOF(ns_list->ns_list)) {
+ break;
+ }
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static void
+_add_ns_id_desc(void **buf_ptr, size_t *buf_remain,
+ enum spdk_nvme_nidt type,
+ const void *data, size_t data_size)
+{
+ struct spdk_nvme_ns_id_desc *desc;
+ size_t desc_size = sizeof(*desc) + data_size;
+
+ /*
+ * These should never fail in practice, since all valid NS ID descriptors
+ * should be defined so that they fit in the available 4096-byte buffer.
+ */
+ assert(data_size > 0);
+ assert(data_size <= UINT8_MAX);
+ assert(desc_size < *buf_remain);
+ if (data_size == 0 || data_size > UINT8_MAX || desc_size > *buf_remain) {
+ return;
+ }
+
+ desc = *buf_ptr;
+ desc->nidt = type;
+ desc->nidl = data_size;
+ memcpy(desc->nid, data, data_size);
+
+ *buf_ptr += desc_size;
+ *buf_remain -= desc_size;
+}
+
+static int
+nvmf_ctrlr_identify_ns_id_descriptor_list(
+ struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvme_cmd *cmd,
+ struct spdk_nvme_cpl *rsp,
+ void *id_desc_list, size_t id_desc_list_size)
+{
+ struct spdk_nvmf_ns *ns;
+ size_t buf_remain = id_desc_list_size;
+ void *buf_ptr = id_desc_list;
+
+ ns = _nvmf_subsystem_get_ns(subsystem, cmd->nsid);
+ if (ns == NULL || ns->bdev == NULL) {
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+#define ADD_ID_DESC(type, data, size) \
+ do { \
+ if (!spdk_mem_all_zero(data, size)) { \
+ _add_ns_id_desc(&buf_ptr, &buf_remain, type, data, size); \
+ } \
+ } while (0)
+
+ ADD_ID_DESC(SPDK_NVME_NIDT_EUI64, ns->opts.eui64, sizeof(ns->opts.eui64));
+ ADD_ID_DESC(SPDK_NVME_NIDT_NGUID, ns->opts.nguid, sizeof(ns->opts.nguid));
+ ADD_ID_DESC(SPDK_NVME_NIDT_UUID, &ns->opts.uuid, sizeof(ns->opts.uuid));
+
+ /*
+ * The list is automatically 0-terminated because controller to host buffers in
+ * admin commands always get zeroed in nvmf_ctrlr_process_admin_cmd().
+ */
+
+#undef ADD_ID_DESC
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_identify(struct spdk_nvmf_request *req)
+{
+ uint8_t cns;
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys;
+
+ if (req->data == NULL || req->length < 4096) {
+ SPDK_ERRLOG("identify command with invalid buffer\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ cns = cmd->cdw10_bits.identify.cns;
+
+ if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY &&
+ cns != SPDK_NVME_IDENTIFY_CTRLR) {
+ /* Discovery controllers only support Identify Controller */
+ goto invalid_cns;
+ }
+
+ switch (cns) {
+ case SPDK_NVME_IDENTIFY_NS:
+ return spdk_nvmf_ctrlr_identify_ns(ctrlr, cmd, rsp, req->data);
+ case SPDK_NVME_IDENTIFY_CTRLR:
+ return spdk_nvmf_ctrlr_identify_ctrlr(ctrlr, req->data);
+ case SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST:
+ return nvmf_ctrlr_identify_active_ns_list(subsystem, cmd, rsp, req->data);
+ case SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST:
+ return nvmf_ctrlr_identify_ns_id_descriptor_list(subsystem, cmd, rsp, req->data, req->length);
+ default:
+ goto invalid_cns;
+ }
+
+invalid_cns:
+ SPDK_ERRLOG("Identify command with unsupported CNS 0x%02x\n", cns);
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static bool
+nvmf_qpair_abort_aer(struct spdk_nvmf_qpair *qpair, uint16_t cid)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvmf_request *req;
+ int i;
+
+ if (!nvmf_qpair_is_admin_queue(qpair)) {
+ return false;
+ }
+
+ for (i = 0; i < ctrlr->nr_aer_reqs; i++) {
+ if (ctrlr->aer_req[i]->cmd->nvme_cmd.cid == cid) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Aborting AER request\n");
+ req = ctrlr->aer_req[i];
+ ctrlr->aer_req[i] = NULL;
+ ctrlr->nr_aer_reqs--;
+
+ /* Move the last req into the freed slot so that aer_reqs
+ * stays contiguous.
+ */
+ if (i < ctrlr->nr_aer_reqs) {
+ ctrlr->aer_req[i] = ctrlr->aer_req[ctrlr->nr_aer_reqs];
+ ctrlr->aer_req[ctrlr->nr_aer_reqs] = NULL;
+ }
+
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
+ _nvmf_request_complete(req);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void
+nvmf_qpair_abort_request(struct spdk_nvmf_qpair *qpair, struct spdk_nvmf_request *req)
+{
+ uint16_t cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
+
+ if (nvmf_qpair_abort_aer(qpair, cid)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "abort ctrlr=%p sqid=%u cid=%u successful\n",
+ qpair->ctrlr, qpair->qid, cid);
+ req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command successfully aborted */
+
+ spdk_nvmf_request_complete(req);
+ return;
+ }
+
+ nvmf_transport_qpair_abort_request(qpair, req);
+}
+
+static void
+nvmf_ctrlr_abort_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i);
+
+ if (status == 0) {
+ /* There was no qpair whose ID matches SQID of the abort command.
+ * Hence call _nvmf_request_complete() here.
+ */
+ _nvmf_request_complete(req);
+ }
+}
+
+static void
+nvmf_ctrlr_abort_on_pg(struct spdk_io_channel_iter *i)
+{
+ struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch);
+ uint16_t sqid = req->cmd->nvme_cmd.cdw10_bits.abort.sqid;
+ struct spdk_nvmf_qpair *qpair;
+
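+ /* Search this poll group for a qpair owned by the same controller with a matching SQID. */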
+ TAILQ_FOREACH(qpair, &group->qpairs, link) {
+ if (qpair->ctrlr == req->qpair->ctrlr && qpair->qid == sqid) {
+ /* Found the qpair */
+
+ nvmf_qpair_abort_request(qpair, req);
+
+ /* Return -1 for the status so the iteration across threads stops. */
+ spdk_for_each_channel_continue(i, -1);
+ return;
+ }
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+static int
+nvmf_ctrlr_abort(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+
+ rsp->cdw0 = 1U; /* Command not aborted */
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_SUCCESS;
+
+ /* Send a message to each poll group, searching for this ctrlr, sqid, and command. */
+ spdk_for_each_channel(req->qpair->ctrlr->subsys->tgt,
+ nvmf_ctrlr_abort_on_pg,
+ req,
+ nvmf_ctrlr_abort_done
+ );
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+int
+nvmf_ctrlr_abort_request(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_request *req_to_abort = req->req_to_abort;
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *desc;
+ struct spdk_io_channel *ch;
+ int rc;
+
+ assert(req_to_abort != NULL);
+
+ if (g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_ABORT].hdlr &&
+ nvmf_qpair_is_admin_queue(req_to_abort->qpair)) {
+ return g_nvmf_custom_admin_cmd_hdlrs[SPDK_NVME_OPC_ABORT].hdlr(req);
+ }
+
+ rc = spdk_nvmf_request_get_bdev(req_to_abort->cmd->nvme_cmd.nsid, req_to_abort,
+ &bdev, &desc, &ch);
+ if (rc != 0) {
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return spdk_nvmf_bdev_ctrlr_abort_cmd(bdev, desc, ch, req, req_to_abort);
+}
+
+static int
+get_features_generic(struct spdk_nvmf_request *req, uint32_t cdw0)
+{
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+
+ rsp->cdw0 = cdw0;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+static int
+nvmf_ctrlr_get_features(struct spdk_nvmf_request *req)
+{
+ uint8_t feature;
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+
+ feature = cmd->cdw10_bits.get_features.fid;
+ switch (feature) {
+ case SPDK_NVME_FEAT_ARBITRATION:
+ return get_features_generic(req, ctrlr->feat.arbitration.raw);
+ case SPDK_NVME_FEAT_POWER_MANAGEMENT:
+ return get_features_generic(req, ctrlr->feat.power_management.raw);
+ case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD:
+ return nvmf_ctrlr_get_features_temperature_threshold(req);
+ case SPDK_NVME_FEAT_ERROR_RECOVERY:
+ return get_features_generic(req, ctrlr->feat.error_recovery.raw);
+ case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE:
+ return get_features_generic(req, ctrlr->feat.volatile_write_cache.raw);
+ case SPDK_NVME_FEAT_NUMBER_OF_QUEUES:
+ return get_features_generic(req, ctrlr->feat.number_of_queues.raw);
+ case SPDK_NVME_FEAT_WRITE_ATOMICITY:
+ return get_features_generic(req, ctrlr->feat.write_atomicity.raw);
+ case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+ return get_features_generic(req, ctrlr->feat.async_event_configuration.raw);
+ case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER:
+ return get_features_generic(req, ctrlr->feat.keep_alive_timer.raw);
+ case SPDK_NVME_FEAT_HOST_IDENTIFIER:
+ return nvmf_ctrlr_get_features_host_identifier(req);
+ case SPDK_NVME_FEAT_HOST_RESERVE_MASK:
+ return nvmf_ctrlr_get_features_reservation_notification_mask(req);
+ case SPDK_NVME_FEAT_HOST_RESERVE_PERSIST:
+ return nvmf_ctrlr_get_features_reservation_persistence(req);
+ default:
+ SPDK_ERRLOG("Get Features command with unsupported feature ID 0x%02x\n", feature);
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+}
+
+static int
+nvmf_ctrlr_set_features(struct spdk_nvmf_request *req)
+{
+ uint8_t feature, save;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+
+ /*
+ * Features are not saveable by the controller, as indicated by the
+ * ONCS field of the Identify Controller data.
+ */
+ save = cmd->cdw10_bits.set_features.sv;
+ if (save) {
+ response->status.sc = SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE;
+ response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ feature = cmd->cdw10_bits.set_features.fid;
+ switch (feature) {
+ case SPDK_NVME_FEAT_ARBITRATION:
+ return nvmf_ctrlr_set_features_arbitration(req);
+ case SPDK_NVME_FEAT_POWER_MANAGEMENT:
+ return nvmf_ctrlr_set_features_power_management(req);
+ case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD:
+ return nvmf_ctrlr_set_features_temperature_threshold(req);
+ case SPDK_NVME_FEAT_ERROR_RECOVERY:
+ return nvmf_ctrlr_set_features_error_recovery(req);
+ case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE:
+ return nvmf_ctrlr_set_features_volatile_write_cache(req);
+ case SPDK_NVME_FEAT_NUMBER_OF_QUEUES:
+ return nvmf_ctrlr_set_features_number_of_queues(req);
+ case SPDK_NVME_FEAT_WRITE_ATOMICITY:
+ return nvmf_ctrlr_set_features_write_atomicity(req);
+ case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
+ return nvmf_ctrlr_set_features_async_event_configuration(req);
+ case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER:
+ return nvmf_ctrlr_set_features_keep_alive_timer(req);
+ case SPDK_NVME_FEAT_HOST_IDENTIFIER:
+ return nvmf_ctrlr_set_features_host_identifier(req);
+ case SPDK_NVME_FEAT_HOST_RESERVE_MASK:
+ return nvmf_ctrlr_set_features_reservation_notification_mask(req);
+ case SPDK_NVME_FEAT_HOST_RESERVE_PERSIST:
+ return nvmf_ctrlr_set_features_reservation_persistence(req);
+ default:
+ SPDK_ERRLOG("Set Features command with unsupported feature ID 0x%02x\n", feature);
+ response->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+}
+
+static int
+nvmf_ctrlr_keep_alive(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Keep Alive\n");
+ /*
+ * To handle keep alive, just clear or reset the
+ * ctrlr-based keep alive duration counter.
+ * A separate timer-based process monitors whether
+ * the time since the last recorded keep alive has
+ * exceeded the maximum duration and takes
+ * appropriate action.
+ */
+ ctrlr->last_keep_alive_tick = spdk_get_ticks();
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+int
+nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ int rc;
+
+ if (ctrlr == NULL) {
+ SPDK_ERRLOG("Admin command sent before CONNECT\n");
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (ctrlr->vcprop.cc.bits.en != 1) {
+ SPDK_ERRLOG("Admin command sent to disabled controller\n");
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (req->data && spdk_nvme_opc_get_data_transfer(cmd->opc) == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ memset(req->data, 0, req->length);
+ }
+
+ if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
+ /* Discovery controllers only support Get Log Page, Identify and Keep Alive. */
+ switch (cmd->opc) {
+ case SPDK_NVME_OPC_IDENTIFY:
+ case SPDK_NVME_OPC_GET_LOG_PAGE:
+ case SPDK_NVME_OPC_KEEP_ALIVE:
+ break;
+ default:
+ goto invalid_opcode;
+ }
+ }
+
+ /* Call a custom adm cmd handler if set. Aborts are handled in a different path (see nvmf_passthru_admin_cmd) */
+ if (g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr && cmd->opc != SPDK_NVME_OPC_ABORT) {
+ rc = g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].hdlr(req);
+ if (rc >= SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) {
+ /* The handler took care of this command */
+ return rc;
+ }
+ }
+
+ switch (cmd->opc) {
+ case SPDK_NVME_OPC_GET_LOG_PAGE:
+ return nvmf_ctrlr_get_log_page(req);
+ case SPDK_NVME_OPC_IDENTIFY:
+ return nvmf_ctrlr_identify(req);
+ case SPDK_NVME_OPC_ABORT:
+ return nvmf_ctrlr_abort(req);
+ case SPDK_NVME_OPC_GET_FEATURES:
+ return nvmf_ctrlr_get_features(req);
+ case SPDK_NVME_OPC_SET_FEATURES:
+ return nvmf_ctrlr_set_features(req);
+ case SPDK_NVME_OPC_ASYNC_EVENT_REQUEST:
+ return nvmf_ctrlr_async_event_request(req);
+ case SPDK_NVME_OPC_KEEP_ALIVE:
+ return nvmf_ctrlr_keep_alive(req);
+
+ case SPDK_NVME_OPC_CREATE_IO_SQ:
+ case SPDK_NVME_OPC_CREATE_IO_CQ:
+ case SPDK_NVME_OPC_DELETE_IO_SQ:
+ case SPDK_NVME_OPC_DELETE_IO_CQ:
+ /* Create and Delete I/O CQ/SQ not allowed in NVMe-oF */
+ goto invalid_opcode;
+
+ default:
+ goto invalid_opcode;
+ }
+
+invalid_opcode:
+ SPDK_ERRLOG("Unsupported admin opcode 0x%x\n", cmd->opc);
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_INVALID_OPCODE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+int
+nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_capsule_cmd *cap_hdr;
+
+ cap_hdr = &req->cmd->nvmf_cmd;
+
+ if (qpair->ctrlr == NULL) {
+ /* No ctrlr established yet; the only valid command is Connect */
+ if (cap_hdr->fctype == SPDK_NVMF_FABRIC_COMMAND_CONNECT) {
+ return nvmf_ctrlr_cmd_connect(req);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Got fctype 0x%x, expected Connect\n",
+ cap_hdr->fctype);
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+ } else if (nvmf_qpair_is_admin_queue(qpair)) {
+ /*
+ * Controller session is established, and this is an admin queue.
+ * Disallow Connect and allow other fabrics commands.
+ */
+ switch (cap_hdr->fctype) {
+ case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET:
+ return nvmf_property_set(req);
+ case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET:
+ return nvmf_property_get(req);
+ default:
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "unknown fctype 0x%02x\n",
+ cap_hdr->fctype);
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+ } else {
+ /* Controller session is established, and this is an I/O queue */
+ /* For now, no I/O-specific Fabrics commands are implemented (other than Connect) */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Unexpected I/O fctype 0x%x\n", cap_hdr->fctype);
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+}
+
+static inline int
+nvmf_ctrlr_async_event_notification(struct spdk_nvmf_ctrlr *ctrlr,
+ union spdk_nvme_async_event_completion *event)
+{
+ struct spdk_nvmf_request *req;
+ struct spdk_nvme_cpl *rsp;
+
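+ /* Consume the most recently queued AER request and complete it with the event encoded in CDW0. */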
+ assert(ctrlr->nr_aer_reqs > 0);
+
+ req = ctrlr->aer_req[--ctrlr->nr_aer_reqs];
+ rsp = &req->rsp->nvme_cpl;
+
+ rsp->cdw0 = event->raw;
+
+ _nvmf_request_complete(req);
+ ctrlr->aer_req[ctrlr->nr_aer_reqs] = NULL;
+
+ return 0;
+}
+
+int
+nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ union spdk_nvme_async_event_completion event = {0};
+
+ /* Users may disable the event notification */
+ if (!ctrlr->feat.async_event_configuration.bits.ns_attr_notice) {
+ return 0;
+ }
+
+ event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE;
+ event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED;
+ event.bits.log_page_identifier = SPDK_NVME_LOG_CHANGED_NS_LIST;
+
+ /* If there is no outstanding AER request, queue the event. Then
+ * if an AER is later submitted, this event can be sent as a
+ * response.
+ */
+ if (ctrlr->nr_aer_reqs == 0) {
+ if (ctrlr->notice_event.bits.async_event_type ==
+ SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) {
+ return 0;
+ }
+
+ ctrlr->notice_event.raw = event.raw;
+ return 0;
+ }
+
+ return nvmf_ctrlr_async_event_notification(ctrlr, &event);
+}
+
+void
+nvmf_ctrlr_async_event_reservation_notification(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ union spdk_nvme_async_event_completion event = {0};
+
+ if (!ctrlr->num_avail_log_pages) {
+ return;
+ }
+ event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_IO;
+ event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_RESERVATION_LOG_AVAIL;
+ event.bits.log_page_identifier = SPDK_NVME_LOG_RESERVATION_NOTIFICATION;
+
+ /* If there is no outstanding AER request, queue the event. Then
+ * if an AER is later submitted, this event can be sent as a
+ * response.
+ */
+ if (ctrlr->nr_aer_reqs == 0) {
+ if (ctrlr->reservation_event.bits.async_event_type ==
+ SPDK_NVME_ASYNC_EVENT_TYPE_IO) {
+ return;
+ }
+
+ ctrlr->reservation_event.raw = event.raw;
+ return;
+ }
+
+ nvmf_ctrlr_async_event_notification(ctrlr, &event);
+}
+
+void
+nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
+ int i;
+
+ if (!nvmf_qpair_is_admin_queue(qpair)) {
+ return;
+ }
+
+ for (i = 0; i < ctrlr->nr_aer_reqs; i++) {
+ spdk_nvmf_request_free(ctrlr->aer_req[i]);
+ ctrlr->aer_req[i] = NULL;
+ }
+
+ ctrlr->nr_aer_reqs = 0;
+}
+
+void
+nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ struct spdk_nvmf_request *req;
+ int i;
+
+ for (i = 0; i < ctrlr->nr_aer_reqs; i++) {
+ req = ctrlr->aer_req[i];
+
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
+ _nvmf_request_complete(req);
+
+ ctrlr->aer_req[i] = NULL;
+ }
+
+ ctrlr->nr_aer_reqs = 0;
+}
+
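+/*
+ * Runs on the controller's thread (see nvmf_ctrlr_reservation_notice_log).
+ * Appends the reservation notification log page to the controller's log list,
+ * capping the number of queued pages at 255, and then issues a reservation
+ * notification asynchronous event.
+ */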
+static void
+_nvmf_ctrlr_add_reservation_log(void *ctx)
+{
+ struct spdk_nvmf_reservation_log *log = (struct spdk_nvmf_reservation_log *)ctx;
+ struct spdk_nvmf_ctrlr *ctrlr = log->ctrlr;
+
+ ctrlr->log_page_count++;
+
+ /* Maximum number of queued log pages is 255 */
+ if (ctrlr->num_avail_log_pages == 0xff) {
+ struct spdk_nvmf_reservation_log *entry;
+ entry = TAILQ_LAST(&ctrlr->log_head, log_page_head);
+ entry->log.log_page_count = ctrlr->log_page_count;
+ free(log);
+ return;
+ }
+
+ log->log.log_page_count = ctrlr->log_page_count;
+ log->log.num_avail_log_pages = ctrlr->num_avail_log_pages++;
+ TAILQ_INSERT_TAIL(&ctrlr->log_head, log, link);
+
+ nvmf_ctrlr_async_event_reservation_notification(ctrlr);
+}
+
+void
+nvmf_ctrlr_reservation_notice_log(struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvmf_ns *ns,
+ enum spdk_nvme_reservation_notification_log_page_type type)
+{
+ struct spdk_nvmf_reservation_log *log;
+
+ switch (type) {
+ case SPDK_NVME_RESERVATION_LOG_PAGE_EMPTY:
+ return;
+ case SPDK_NVME_REGISTRATION_PREEMPTED:
+ if (ns->mask & SPDK_NVME_REGISTRATION_PREEMPTED_MASK) {
+ return;
+ }
+ break;
+ case SPDK_NVME_RESERVATION_RELEASED:
+ if (ns->mask & SPDK_NVME_RESERVATION_RELEASED_MASK) {
+ return;
+ }
+ break;
+ case SPDK_NVME_RESERVATION_PREEMPTED:
+ if (ns->mask & SPDK_NVME_RESERVATION_PREEMPTED_MASK) {
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+
+ log = calloc(1, sizeof(*log));
+ if (!log) {
+		SPDK_ERRLOG("Failed to allocate log page, dropping the log\n");
+ return;
+ }
+ log->ctrlr = ctrlr;
+ log->log.type = type;
+ log->log.nsid = ns->nsid;
+
+ spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_add_reservation_log, log);
+}
+
+/* Check the subsystem poll group's namespace information to see whether the controller (host) is a registrant */
+static bool
+nvmf_ns_info_ctrlr_is_registrant(struct spdk_nvmf_subsystem_pg_ns_info *ns_info,
+ struct spdk_nvmf_ctrlr *ctrlr)
+{
+ uint32_t i;
+
+ for (i = 0; i < SPDK_NVMF_MAX_NUM_REGISTRANTS; i++) {
+ if (!spdk_uuid_compare(&ns_info->reg_hostid[i], &ctrlr->hostid)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * Check whether the NVMe command is permitted for the current controller (host).
+ */
+static int
+nvmf_ns_reservation_request_check(struct spdk_nvmf_subsystem_pg_ns_info *ns_info,
+ struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ enum spdk_nvme_reservation_type rtype = ns_info->rtype;
+ uint8_t status = SPDK_NVME_SC_SUCCESS;
+ uint8_t racqa;
+ bool is_registrant;
+
+ /* No valid reservation */
+ if (!rtype) {
+ return 0;
+ }
+
+ is_registrant = nvmf_ns_info_ctrlr_is_registrant(ns_info, ctrlr);
+	/* All-registrants reservation type and the current controller is a registrant */
+ if ((rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_ALL_REGS ||
+ rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) && is_registrant) {
+ return 0;
+ } else if (!spdk_uuid_compare(&ns_info->holder_id, &ctrlr->hostid)) {
+ return 0;
+ }
+
+	/* The current controller is not the reservation holder */
+ switch (cmd->opc) {
+ case SPDK_NVME_OPC_READ:
+ case SPDK_NVME_OPC_COMPARE:
+ if (rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) {
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ goto exit;
+ }
+ if ((rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_REG_ONLY ||
+ rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) && !is_registrant) {
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ }
+ break;
+ case SPDK_NVME_OPC_FLUSH:
+ case SPDK_NVME_OPC_WRITE:
+ case SPDK_NVME_OPC_WRITE_UNCORRECTABLE:
+ case SPDK_NVME_OPC_WRITE_ZEROES:
+ case SPDK_NVME_OPC_DATASET_MANAGEMENT:
+ if (rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE ||
+ rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) {
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ goto exit;
+ }
+ if (!is_registrant) {
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ }
+ break;
+ case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
+ racqa = cmd->cdw10_bits.resv_acquire.racqa;
+ if (racqa == SPDK_NVME_RESERVE_ACQUIRE) {
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ goto exit;
+ }
+ if (!is_registrant) {
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ }
+ break;
+ case SPDK_NVME_OPC_RESERVATION_RELEASE:
+ if (!is_registrant) {
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ }
+ break;
+ default:
+ break;
+ }
+
+exit:
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = status;
+ if (status == SPDK_NVME_SC_RESERVATION_CONFLICT) {
+ return -EPERM;
+ }
+
+ return 0;
+}
+
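+/*
+ * Handle the two halves of a fused Compare and Write. The first (compare)
+ * command is parked on qpair->first_fused_req until the second (write)
+ * command arrives; both are then submitted to the bdev layer together via
+ * nvmf_bdev_ctrlr_compare_and_write_cmd().
+ */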
+static int
+nvmf_ctrlr_process_io_fused_cmd(struct spdk_nvmf_request *req, struct spdk_bdev *bdev,
+ struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_request *first_fused_req = req->qpair->first_fused_req;
+ int rc;
+
+ if (cmd->fuse == SPDK_NVME_CMD_FUSE_FIRST) {
+ /* first fused operation (should be compare) */
+ if (first_fused_req != NULL) {
+ struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl;
+
+ SPDK_ERRLOG("Wrong sequence of fused operations\n");
+
+			/* abort req->qpair->first_fused_req and continue with the new fused command */
+ fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED;
+ fused_response->status.sct = SPDK_NVME_SCT_GENERIC;
+ _nvmf_request_complete(first_fused_req);
+ } else if (cmd->opc != SPDK_NVME_OPC_COMPARE) {
+			SPDK_ERRLOG("Wrong opcode for fused operation\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ req->qpair->first_fused_req = req;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ } else if (cmd->fuse == SPDK_NVME_CMD_FUSE_SECOND) {
+ /* second fused operation (should be write) */
+ if (first_fused_req == NULL) {
+ SPDK_ERRLOG("Wrong sequence of fused operations\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ } else if (cmd->opc != SPDK_NVME_OPC_WRITE) {
+ struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl;
+
+			SPDK_ERRLOG("Wrong opcode for fused operation\n");
+
+			/* abort req->qpair->first_fused_req and fail the current command */
+ fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED;
+ fused_response->status.sct = SPDK_NVME_SCT_GENERIC;
+ _nvmf_request_complete(first_fused_req);
+
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_OPCODE;
+ req->qpair->first_fused_req = NULL;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+		/* save the first command's request so its response can be generated later */
+ req->first_fused_req = first_fused_req;
+ req->qpair->first_fused_req = NULL;
+ } else {
+ SPDK_ERRLOG("Invalid fused command fuse field.\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ rc = nvmf_bdev_ctrlr_compare_and_write_cmd(bdev, desc, ch, req->first_fused_req, req);
+
+ if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) {
+ if (spdk_nvme_cpl_is_error(rsp)) {
+ struct spdk_nvme_cpl *fused_response = &first_fused_req->rsp->nvme_cpl;
+
+ fused_response->status = rsp->status;
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED;
+ /* Complete first of fused commands. Second will be completed by upper layer */
+ _nvmf_request_complete(first_fused_req);
+ req->first_fused_req = NULL;
+ }
+ }
+
+ return rc;
+}
+
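+/*
+ * Dispatch an I/O command: validate the controller and namespace, enforce
+ * reservations, then route the command to the appropriate bdev handler.
+ * Reservation commands are forwarded to the subsystem's thread.
+ */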
+int
+nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req)
+{
+ uint32_t nsid;
+ struct spdk_nvmf_ns *ns;
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *desc;
+ struct spdk_io_channel *ch;
+ struct spdk_nvmf_poll_group *group = req->qpair->group;
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_subsystem_pg_ns_info *ns_info;
+
+ /* pre-set response details for this command */
+ response->status.sc = SPDK_NVME_SC_SUCCESS;
+ nsid = cmd->nsid;
+
+ if (spdk_unlikely(ctrlr == NULL)) {
+ SPDK_ERRLOG("I/O command sent before CONNECT\n");
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (spdk_unlikely(ctrlr->vcprop.cc.bits.en != 1)) {
+ SPDK_ERRLOG("I/O command sent to disabled controller\n");
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
+ if (ns == NULL || ns->bdev == NULL) {
+ SPDK_ERRLOG("Unsuccessful query for nsid %u\n", cmd->nsid);
+ response->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ response->status.dnr = 1;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+	/* scan-build falsely reports a possible NULL pointer dereference here */
+ assert(group != NULL && group->sgroups != NULL);
+ ns_info = &group->sgroups[ctrlr->subsys->id].ns_info[nsid - 1];
+ if (nvmf_ns_reservation_request_check(ns_info, ctrlr, req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Reservation Conflict for nsid %u, opcode %u\n",
+ cmd->nsid, cmd->opc);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ bdev = ns->bdev;
+ desc = ns->desc;
+ ch = ns_info->channel;
+
+ if (spdk_unlikely(cmd->fuse & SPDK_NVME_CMD_FUSE_MASK)) {
+ return nvmf_ctrlr_process_io_fused_cmd(req, bdev, desc, ch);
+ } else if (spdk_unlikely(req->qpair->first_fused_req != NULL)) {
+ struct spdk_nvme_cpl *fused_response = &req->qpair->first_fused_req->rsp->nvme_cpl;
+
+ SPDK_ERRLOG("Expected second of fused commands - failing first of fused commands\n");
+
+		/* abort req->qpair->first_fused_req and continue with the new command */
+ fused_response->status.sc = SPDK_NVME_SC_ABORTED_MISSING_FUSED;
+ fused_response->status.sct = SPDK_NVME_SCT_GENERIC;
+ _nvmf_request_complete(req->qpair->first_fused_req);
+ req->qpair->first_fused_req = NULL;
+ }
+
+ switch (cmd->opc) {
+ case SPDK_NVME_OPC_READ:
+ return nvmf_bdev_ctrlr_read_cmd(bdev, desc, ch, req);
+ case SPDK_NVME_OPC_WRITE:
+ return nvmf_bdev_ctrlr_write_cmd(bdev, desc, ch, req);
+ case SPDK_NVME_OPC_COMPARE:
+ return nvmf_bdev_ctrlr_compare_cmd(bdev, desc, ch, req);
+ case SPDK_NVME_OPC_WRITE_ZEROES:
+ return nvmf_bdev_ctrlr_write_zeroes_cmd(bdev, desc, ch, req);
+ case SPDK_NVME_OPC_FLUSH:
+ return nvmf_bdev_ctrlr_flush_cmd(bdev, desc, ch, req);
+ case SPDK_NVME_OPC_DATASET_MANAGEMENT:
+ return nvmf_bdev_ctrlr_dsm_cmd(bdev, desc, ch, req);
+ case SPDK_NVME_OPC_RESERVATION_REGISTER:
+ case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
+ case SPDK_NVME_OPC_RESERVATION_RELEASE:
+ case SPDK_NVME_OPC_RESERVATION_REPORT:
+ spdk_thread_send_msg(ctrlr->subsys->thread, nvmf_ns_reservation_request, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ default:
+ return nvmf_bdev_ctrlr_nvme_passthru_io(bdev, desc, ch, req);
+ }
+}
+
+static void
+nvmf_qpair_request_cleanup(struct spdk_nvmf_qpair *qpair)
+{
+ if (qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING) {
+ assert(qpair->state_cb != NULL);
+
+ if (TAILQ_EMPTY(&qpair->outstanding)) {
+ qpair->state_cb(qpair->state_cb_arg, 0);
+ }
+ }
+}
+
+int
+spdk_nvmf_request_free(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+
+ TAILQ_REMOVE(&qpair->outstanding, req, link);
+ if (nvmf_transport_req_free(req)) {
+ SPDK_ERRLOG("Unable to free transport level request resources.\n");
+ }
+
+ nvmf_qpair_request_cleanup(qpair);
+
+ return 0;
+}
+
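+/*
+ * Fill in the remaining completion fields (SQID, phase, CID), remove the
+ * request from the qpair's outstanding list, and hand it back to the
+ * transport. Non-AER completions also decrement the subsystem poll group's
+ * io_outstanding count, which may finish a pending subsystem pause.
+ */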
+static void
+_nvmf_request_complete(void *ctx)
+{
+ struct spdk_nvmf_request *req = ctx;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_subsystem_poll_group *sgroup = NULL;
+ bool is_aer = false;
+
+ rsp->sqid = 0;
+ rsp->status.p = 0;
+ rsp->cid = req->cmd->nvme_cmd.cid;
+
+ qpair = req->qpair;
+ if (qpair->ctrlr) {
+ sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id];
+ assert(sgroup != NULL);
+ is_aer = req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST;
+ } else if (spdk_unlikely(nvmf_request_is_fabric_connect(req))) {
+ sgroup = nvmf_subsystem_pg_from_connect_cmd(req);
+ }
+
+ if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) {
+ spdk_nvme_print_completion(qpair->qid, rsp);
+ }
+
+ TAILQ_REMOVE(&qpair->outstanding, req, link);
+ if (nvmf_transport_req_complete(req)) {
+ SPDK_ERRLOG("Transport request completion error!\n");
+ }
+
+	/* AER requests are an exception and do not count against io_outstanding */
+ if (sgroup && !is_aer) {
+ assert(sgroup->io_outstanding > 0);
+ sgroup->io_outstanding--;
+ if (sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSING &&
+ sgroup->io_outstanding == 0) {
+ sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSED;
+ sgroup->cb_fn(sgroup->cb_arg, 0);
+ }
+ }
+
+ nvmf_qpair_request_cleanup(qpair);
+}
+
+int
+spdk_nvmf_request_complete(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+
+ if (spdk_likely(qpair->group->thread == spdk_get_thread())) {
+ _nvmf_request_complete(req);
+ } else {
+ spdk_thread_send_msg(qpair->group->thread,
+ _nvmf_request_complete, req);
+ }
+
+ return 0;
+}
+
+static void
+_nvmf_request_exec(struct spdk_nvmf_request *req,
+ struct spdk_nvmf_subsystem_poll_group *sgroup)
+{
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ enum spdk_nvmf_request_exec_status status;
+
+ if (SPDK_DEBUGLOG_FLAG_ENABLED("nvmf")) {
+ spdk_nvme_print_command(qpair->qid, &req->cmd->nvme_cmd);
+ }
+
+ if (sgroup) {
+ sgroup->io_outstanding++;
+ }
+
+ /* Place the request on the outstanding list so we can keep track of it */
+ TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
+
+ if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) {
+ status = nvmf_ctrlr_process_fabrics_cmd(req);
+ } else if (spdk_unlikely(nvmf_qpair_is_admin_queue(qpair))) {
+ status = nvmf_ctrlr_process_admin_cmd(req);
+ } else {
+ status = nvmf_ctrlr_process_io_cmd(req);
+ }
+
+ if (status == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) {
+ _nvmf_request_complete(req);
+ }
+}
+
+void
+spdk_nvmf_request_exec_fabrics(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_subsystem_poll_group *sgroup = NULL;
+
+ assert(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC);
+
+ if (qpair->ctrlr) {
+ sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id];
+ assert(sgroup != NULL);
+ } else {
+ sgroup = nvmf_subsystem_pg_from_connect_cmd(req);
+ }
+
+ _nvmf_request_exec(req, sgroup);
+}
+
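+/*
+ * Entry point for executing a request. Requests arriving on an inactive
+ * qpair are failed with COMMAND SEQUENCE ERROR; requests targeting a paused
+ * subsystem are queued on the subsystem poll group until it resumes.
+ */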
+void
+spdk_nvmf_request_exec(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_subsystem_poll_group *sgroup = NULL;
+
+ if (qpair->ctrlr) {
+ sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id];
+ assert(sgroup != NULL);
+ } else if (spdk_unlikely(nvmf_request_is_fabric_connect(req))) {
+ sgroup = nvmf_subsystem_pg_from_connect_cmd(req);
+ }
+
+ if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) {
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR;
+ /* Place the request on the outstanding list so we can keep track of it */
+ TAILQ_INSERT_TAIL(&qpair->outstanding, req, link);
+ /* Still increment io_outstanding because request_complete decrements it */
+ if (sgroup != NULL) {
+ sgroup->io_outstanding++;
+ }
+ _nvmf_request_complete(req);
+ return;
+ }
+
+ /* Check if the subsystem is paused (if there is a subsystem) */
+ if (sgroup != NULL) {
+ if (sgroup->state != SPDK_NVMF_SUBSYSTEM_ACTIVE) {
+ /* The subsystem is not currently active. Queue this request. */
+ TAILQ_INSERT_TAIL(&sgroup->queued, req, link);
+ return;
+ }
+ }
+
+ _nvmf_request_exec(req, sgroup);
+}
+
+static bool
+nvmf_ctrlr_get_dif_ctx(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_cmd *cmd,
+ struct spdk_dif_ctx *dif_ctx)
+{
+ struct spdk_nvmf_ns *ns;
+ struct spdk_bdev *bdev;
+
+ if (ctrlr == NULL || cmd == NULL) {
+ return false;
+ }
+
+ ns = _nvmf_subsystem_get_ns(ctrlr->subsys, cmd->nsid);
+ if (ns == NULL || ns->bdev == NULL) {
+ return false;
+ }
+
+ bdev = ns->bdev;
+
+ switch (cmd->opc) {
+ case SPDK_NVME_OPC_READ:
+ case SPDK_NVME_OPC_WRITE:
+ case SPDK_NVME_OPC_COMPARE:
+ return nvmf_bdev_ctrlr_get_dif_ctx(bdev, cmd, dif_ctx);
+ default:
+ break;
+ }
+
+ return false;
+}
+
+bool
+spdk_nvmf_request_get_dif_ctx(struct spdk_nvmf_request *req, struct spdk_dif_ctx *dif_ctx)
+{
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
+
+ if (spdk_likely(ctrlr == NULL || !ctrlr->dif_insert_or_strip)) {
+ return false;
+ }
+
+ if (spdk_unlikely(qpair->state != SPDK_NVMF_QPAIR_ACTIVE)) {
+ return false;
+ }
+
+ if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) {
+ return false;
+ }
+
+ if (spdk_unlikely(nvmf_qpair_is_admin_queue(qpair))) {
+ return false;
+ }
+
+ return nvmf_ctrlr_get_dif_ctx(ctrlr, &req->cmd->nvme_cmd, dif_ctx);
+}
+
+void
+spdk_nvmf_set_custom_admin_cmd_hdlr(uint8_t opc, spdk_nvmf_custom_cmd_hdlr hdlr)
+{
+ g_nvmf_custom_admin_cmd_hdlrs[opc].hdlr = hdlr;
+}
+
+static int
+nvmf_passthru_admin_cmd(struct spdk_nvmf_request *req)
+{
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *desc;
+ struct spdk_io_channel *ch;
+ struct spdk_nvme_cmd *cmd = spdk_nvmf_request_get_cmd(req);
+ struct spdk_nvme_cpl *response = spdk_nvmf_request_get_response(req);
+ uint32_t bdev_nsid;
+ int rc;
+
+ if (g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].nsid == 0) {
+ bdev_nsid = cmd->nsid;
+ } else {
+ bdev_nsid = g_nvmf_custom_admin_cmd_hdlrs[cmd->opc].nsid;
+ }
+
+ rc = spdk_nvmf_request_get_bdev(bdev_nsid, req, &bdev, &desc, &ch);
+ if (rc) {
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+ return spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(bdev, desc, ch, req, NULL);
+}
+
+void
+spdk_nvmf_set_passthru_admin_cmd(uint8_t opc, uint32_t forward_nsid)
+{
+ g_nvmf_custom_admin_cmd_hdlrs[opc].hdlr = nvmf_passthru_admin_cmd;
+ g_nvmf_custom_admin_cmd_hdlrs[opc].nsid = forward_nsid;
+}
+
+int
+spdk_nvmf_request_get_bdev(uint32_t nsid, struct spdk_nvmf_request *req,
+ struct spdk_bdev **bdev, struct spdk_bdev_desc **desc, struct spdk_io_channel **ch)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct spdk_nvmf_ns *ns;
+ struct spdk_nvmf_poll_group *group = req->qpair->group;
+ struct spdk_nvmf_subsystem_pg_ns_info *ns_info;
+
+ *bdev = NULL;
+ *desc = NULL;
+ *ch = NULL;
+
+ ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
+ if (ns == NULL || ns->bdev == NULL) {
+ return -EINVAL;
+ }
+
+ assert(group != NULL && group->sgroups != NULL);
+ ns_info = &group->sgroups[ctrlr->subsys->id].ns_info[nsid - 1];
+ *bdev = ns->bdev;
+ *desc = ns->desc;
+ *ch = ns_info->channel;
+
+ return 0;
+}
+
+struct spdk_nvmf_ctrlr *spdk_nvmf_request_get_ctrlr(struct spdk_nvmf_request *req)
+{
+ return req->qpair->ctrlr;
+}
+
+struct spdk_nvme_cmd *spdk_nvmf_request_get_cmd(struct spdk_nvmf_request *req)
+{
+ return &req->cmd->nvme_cmd;
+}
+
+struct spdk_nvme_cpl *spdk_nvmf_request_get_response(struct spdk_nvmf_request *req)
+{
+ return &req->rsp->nvme_cpl;
+}
+
+struct spdk_nvmf_subsystem *spdk_nvmf_request_get_subsystem(struct spdk_nvmf_request *req)
+{
+ return req->qpair->ctrlr->subsys;
+}
+
+void spdk_nvmf_request_get_data(struct spdk_nvmf_request *req, void **data, uint32_t *length)
+{
+ *data = req->data;
+ *length = req->length;
+}
+
+struct spdk_nvmf_subsystem *spdk_nvmf_ctrlr_get_subsystem(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->subsys;
+}
+
+uint16_t spdk_nvmf_ctrlr_get_id(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return ctrlr->cntlid;
+}
+
+struct spdk_nvmf_request *spdk_nvmf_request_get_req_to_abort(struct spdk_nvmf_request *req)
+{
+ return req->req_to_abort;
+}
diff --git a/src/spdk/lib/nvmf/ctrlr_bdev.c b/src/spdk/lib/nvmf/ctrlr_bdev.c
new file mode 100644
index 000000000..13e0a4309
--- /dev/null
+++ b/src/spdk/lib/nvmf/ctrlr_bdev.c
@@ -0,0 +1,761 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "nvmf_internal.h"
+
+#include "spdk/bdev.h"
+#include "spdk/endian.h"
+#include "spdk/thread.h"
+#include "spdk/likely.h"
+#include "spdk/nvme.h"
+#include "spdk/nvmf_cmd.h"
+#include "spdk/nvmf_spec.h"
+#include "spdk/trace.h"
+#include "spdk/scsi_spec.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+
+static bool
+nvmf_subsystem_bdev_io_type_supported(struct spdk_nvmf_subsystem *subsystem,
+ enum spdk_bdev_io_type io_type)
+{
+ struct spdk_nvmf_ns *ns;
+
+ for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL;
+ ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) {
+ if (ns->bdev == NULL) {
+ continue;
+ }
+
+ if (!spdk_bdev_io_type_supported(ns->bdev, io_type)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF,
+ "Subsystem %s namespace %u (%s) does not support io_type %d\n",
+ spdk_nvmf_subsystem_get_nqn(subsystem),
+ ns->opts.nsid, spdk_bdev_get_name(ns->bdev), (int)io_type);
+ return false;
+ }
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "All devices in Subsystem %s support io_type %d\n",
+ spdk_nvmf_subsystem_get_nqn(subsystem), (int)io_type);
+ return true;
+}
+
+bool
+nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_UNMAP);
+}
+
+bool
+nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ return nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_WRITE_ZEROES);
+}
+
+static void
+nvmf_bdev_ctrlr_complete_cmd(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct spdk_nvmf_request *req = cb_arg;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ int first_sc = 0, first_sct = 0, second_sc = 0, second_sct = 0;
+ uint32_t cdw0 = 0;
+ struct spdk_nvmf_request *first_req = req->first_fused_req;
+
+ if (spdk_unlikely(first_req != NULL)) {
+ /* fused commands - get status for both operations */
+ struct spdk_nvme_cpl *fused_response = &first_req->rsp->nvme_cpl;
+
+ spdk_bdev_io_get_nvme_fused_status(bdev_io, &cdw0, &second_sct, &second_sc, &first_sct, &first_sc);
+ fused_response->cdw0 = cdw0;
+ fused_response->status.sc = second_sc;
+ fused_response->status.sct = second_sct;
+
+ /* first request should be completed */
+ spdk_nvmf_request_complete(first_req);
+ req->first_fused_req = NULL;
+ } else {
+ spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &first_sct, &first_sc);
+ }
+
+ response->cdw0 = cdw0;
+ response->status.sc = first_sc;
+ response->status.sct = first_sct;
+
+ spdk_nvmf_request_complete(req);
+ spdk_bdev_free_io(bdev_io);
+}
+
+static void
+nvmf_bdev_ctrlr_complete_admin_cmd(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct spdk_nvmf_request *req = cb_arg;
+
+ if (req->cmd_cb_fn) {
+ req->cmd_cb_fn(req);
+ }
+
+ nvmf_bdev_ctrlr_complete_cmd(bdev_io, success, req);
+}
+
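+/*
+ * Fill the Identify Namespace data from the backing bdev: capacity fields,
+ * LBA format (including metadata/DIF settings unless dif_insert_or_strip is
+ * enabled), reservation capabilities, NGUID and EUI64.
+ */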
+void
+nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata,
+ bool dif_insert_or_strip)
+{
+ struct spdk_bdev *bdev = ns->bdev;
+ uint64_t num_blocks;
+
+ num_blocks = spdk_bdev_get_num_blocks(bdev);
+
+ nsdata->nsze = num_blocks;
+ nsdata->ncap = num_blocks;
+ nsdata->nuse = num_blocks;
+ nsdata->nlbaf = 0;
+ nsdata->flbas.format = 0;
+ nsdata->nacwu = spdk_bdev_get_acwu(bdev);
+ if (!dif_insert_or_strip) {
+ nsdata->lbaf[0].ms = spdk_bdev_get_md_size(bdev);
+ nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(bdev));
+ if (nsdata->lbaf[0].ms != 0) {
+ nsdata->flbas.extended = 1;
+ nsdata->mc.extended = 1;
+ nsdata->mc.pointer = 0;
+ nsdata->dps.md_start = spdk_bdev_is_dif_head_of_md(bdev);
+
+ switch (spdk_bdev_get_dif_type(bdev)) {
+ case SPDK_DIF_TYPE1:
+ nsdata->dpc.pit1 = 1;
+ nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE1;
+ break;
+ case SPDK_DIF_TYPE2:
+ nsdata->dpc.pit2 = 1;
+ nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE2;
+ break;
+ case SPDK_DIF_TYPE3:
+ nsdata->dpc.pit3 = 1;
+ nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_TYPE3;
+ break;
+ default:
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Protection Disabled\n");
+ nsdata->dps.pit = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE;
+ break;
+ }
+ }
+ } else {
+ nsdata->lbaf[0].ms = 0;
+ nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_data_block_size(bdev));
+ }
+ nsdata->noiob = spdk_bdev_get_optimal_io_boundary(bdev);
+ nsdata->nmic.can_share = 1;
+ if (ns->ptpl_file != NULL) {
+ nsdata->nsrescap.rescap.persist = 1;
+ }
+ nsdata->nsrescap.rescap.write_exclusive = 1;
+ nsdata->nsrescap.rescap.exclusive_access = 1;
+ nsdata->nsrescap.rescap.write_exclusive_reg_only = 1;
+ nsdata->nsrescap.rescap.exclusive_access_reg_only = 1;
+ nsdata->nsrescap.rescap.write_exclusive_all_reg = 1;
+ nsdata->nsrescap.rescap.exclusive_access_all_reg = 1;
+ nsdata->nsrescap.rescap.ignore_existing_key = 1;
+
+ SPDK_STATIC_ASSERT(sizeof(nsdata->nguid) == sizeof(ns->opts.nguid), "size mismatch");
+ memcpy(nsdata->nguid, ns->opts.nguid, sizeof(nsdata->nguid));
+
+ SPDK_STATIC_ASSERT(sizeof(nsdata->eui64) == sizeof(ns->opts.eui64), "size mismatch");
+ memcpy(&nsdata->eui64, ns->opts.eui64, sizeof(nsdata->eui64));
+}
+
+static void
+nvmf_bdev_ctrlr_get_rw_params(const struct spdk_nvme_cmd *cmd, uint64_t *start_lba,
+ uint64_t *num_blocks)
+{
+ /* SLBA: CDW10 and CDW11 */
+ *start_lba = from_le64(&cmd->cdw10);
+
+ /* NLB: CDW12 bits 15:00, 0's based */
+ *num_blocks = (from_le32(&cmd->cdw12) & 0xFFFFu) + 1;
+}
+
+static bool
+nvmf_bdev_ctrlr_lba_in_range(uint64_t bdev_num_blocks, uint64_t io_start_lba,
+ uint64_t io_num_blocks)
+{
+ if (io_start_lba + io_num_blocks > bdev_num_blocks ||
+ io_start_lba + io_num_blocks < io_start_lba) {
+ return false;
+ }
+
+ return true;
+}
+
+static void
+nvmf_ctrlr_process_io_cmd_resubmit(void *arg)
+{
+ struct spdk_nvmf_request *req = arg;
+
+ nvmf_ctrlr_process_io_cmd(req);
+}
+
+static void
+nvmf_ctrlr_process_admin_cmd_resubmit(void *arg)
+{
+ struct spdk_nvmf_request *req = arg;
+
+ nvmf_ctrlr_process_admin_cmd(req);
+}
+
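+/*
+ * Queue the request until the bdev layer has resources again (used when a
+ * bdev submission returns -ENOMEM); cb_fn resubmits the original command.
+ */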
+static void
+nvmf_bdev_ctrl_queue_io(struct spdk_nvmf_request *req, struct spdk_bdev *bdev,
+ struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn, void *cb_arg)
+{
+ int rc;
+
+ req->bdev_io_wait.bdev = bdev;
+ req->bdev_io_wait.cb_fn = cb_fn;
+ req->bdev_io_wait.cb_arg = cb_arg;
+
+ rc = spdk_bdev_queue_io_wait(bdev, ch, &req->bdev_io_wait);
+ if (rc != 0) {
+ assert(false);
+ }
+ req->qpair->group->stat.pending_bdev_io++;
+}
+
+int
+nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req)
+{
+ uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev);
+ uint32_t block_size = spdk_bdev_get_block_size(bdev);
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ uint64_t start_lba;
+ uint64_t num_blocks;
+ int rc;
+
+ nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks);
+
+ if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) {
+ SPDK_ERRLOG("end of media\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (spdk_unlikely(num_blocks * block_size > req->length)) {
+ SPDK_ERRLOG("Read NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n",
+ num_blocks, block_size, req->length);
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ rc = spdk_bdev_readv_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
+ nvmf_bdev_ctrlr_complete_cmd, req);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+int
+nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req)
+{
+ uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev);
+ uint32_t block_size = spdk_bdev_get_block_size(bdev);
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ uint64_t start_lba;
+ uint64_t num_blocks;
+ int rc;
+
+ nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks);
+
+ if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) {
+ SPDK_ERRLOG("end of media\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (spdk_unlikely(num_blocks * block_size > req->length)) {
+ SPDK_ERRLOG("Write NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n",
+ num_blocks, block_size, req->length);
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ rc = spdk_bdev_writev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
+ nvmf_bdev_ctrlr_complete_cmd, req);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+int
+nvmf_bdev_ctrlr_compare_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req)
+{
+ uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev);
+ uint32_t block_size = spdk_bdev_get_block_size(bdev);
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ uint64_t start_lba;
+ uint64_t num_blocks;
+ int rc;
+
+ nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks);
+
+ if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) {
+ SPDK_ERRLOG("end of media\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (spdk_unlikely(num_blocks * block_size > req->length)) {
+ SPDK_ERRLOG("Compare NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n",
+ num_blocks, block_size, req->length);
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ rc = spdk_bdev_comparev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks,
+ nvmf_bdev_ctrlr_complete_cmd, req);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+int
+nvmf_bdev_ctrlr_compare_and_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *cmp_req, struct spdk_nvmf_request *write_req)
+{
+ uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev);
+ uint32_t block_size = spdk_bdev_get_block_size(bdev);
+ struct spdk_nvme_cmd *cmp_cmd = &cmp_req->cmd->nvme_cmd;
+ struct spdk_nvme_cmd *write_cmd = &write_req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &write_req->rsp->nvme_cpl;
+ uint64_t write_start_lba, cmp_start_lba;
+ uint64_t write_num_blocks, cmp_num_blocks;
+ int rc;
+
+ nvmf_bdev_ctrlr_get_rw_params(cmp_cmd, &cmp_start_lba, &cmp_num_blocks);
+ nvmf_bdev_ctrlr_get_rw_params(write_cmd, &write_start_lba, &write_num_blocks);
+
+ if (spdk_unlikely(write_start_lba != cmp_start_lba || write_num_blocks != cmp_num_blocks)) {
+ SPDK_ERRLOG("Fused command start lba / num blocks mismatch\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, write_start_lba,
+ write_num_blocks))) {
+ SPDK_ERRLOG("end of media\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (spdk_unlikely(write_num_blocks * block_size > write_req->length)) {
+ SPDK_ERRLOG("Write NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n",
+ write_num_blocks, block_size, write_req->length);
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ rc = spdk_bdev_comparev_and_writev_blocks(desc, ch, cmp_req->iov, cmp_req->iovcnt, write_req->iov,
+ write_req->iovcnt, write_start_lba, write_num_blocks, nvmf_bdev_ctrlr_complete_cmd, write_req);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(cmp_req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, cmp_req);
+ nvmf_bdev_ctrl_queue_io(write_req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, write_req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+int
+nvmf_bdev_ctrlr_write_zeroes_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req)
+{
+ uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev);
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ uint64_t start_lba;
+ uint64_t num_blocks;
+ int rc;
+
+ nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks);
+
+ if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) {
+ SPDK_ERRLOG("end of media\n");
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ rc = spdk_bdev_write_zeroes_blocks(desc, ch, start_lba, num_blocks,
+ nvmf_bdev_ctrlr_complete_cmd, req);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ rsp->status.sct = SPDK_NVME_SCT_GENERIC;
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+int
+nvmf_bdev_ctrlr_flush_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ int rc;
+
+	/* The NVMe-oF controller always reports the volatile write
+	 * cache bit as 1, so return success for block devices that
+	 * do not support the FLUSH command.
+	 */
+ if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_SUCCESS;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ rc = spdk_bdev_flush_blocks(desc, ch, 0, spdk_bdev_get_num_blocks(bdev),
+ nvmf_bdev_ctrlr_complete_cmd, req);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+struct nvmf_bdev_ctrlr_unmap {
+ struct spdk_nvmf_request *req;
+ uint32_t count;
+ struct spdk_bdev_desc *desc;
+ struct spdk_bdev *bdev;
+ struct spdk_io_channel *ch;
+ uint32_t range_index;
+};
+
+static void
+nvmf_bdev_ctrlr_unmap_cpl(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct nvmf_bdev_ctrlr_unmap *unmap_ctx = cb_arg;
+ struct spdk_nvmf_request *req = unmap_ctx->req;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ int sc, sct;
+ uint32_t cdw0;
+
+ unmap_ctx->count--;
+
+ if (response->status.sct == SPDK_NVME_SCT_GENERIC &&
+ response->status.sc == SPDK_NVME_SC_SUCCESS) {
+ spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
+ response->cdw0 = cdw0;
+ response->status.sc = sc;
+ response->status.sct = sct;
+ }
+
+ if (unmap_ctx->count == 0) {
+ spdk_nvmf_request_complete(req);
+ free(unmap_ctx);
+ }
+ spdk_bdev_free_io(bdev_io);
+}
+
+static int
+nvmf_bdev_ctrlr_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req,
+ struct nvmf_bdev_ctrlr_unmap *unmap_ctx);
+static void
+nvmf_bdev_ctrlr_unmap_resubmit(void *arg)
+{
+ struct nvmf_bdev_ctrlr_unmap *unmap_ctx = arg;
+ struct spdk_nvmf_request *req = unmap_ctx->req;
+ struct spdk_bdev_desc *desc = unmap_ctx->desc;
+ struct spdk_bdev *bdev = unmap_ctx->bdev;
+ struct spdk_io_channel *ch = unmap_ctx->ch;
+
+ nvmf_bdev_ctrlr_unmap(bdev, desc, ch, req, unmap_ctx);
+}
+
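+/*
+ * Issue one bdev unmap per Dataset Management range. unmap_ctx->count tracks
+ * unmaps in flight; the NVMe-oF request is completed once all of them have
+ * finished (see nvmf_bdev_ctrlr_unmap_cpl).
+ */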
+static int
+nvmf_bdev_ctrlr_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req,
+ struct nvmf_bdev_ctrlr_unmap *unmap_ctx)
+{
+ uint16_t nr, i;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+ struct spdk_nvme_dsm_range *dsm_range;
+ uint64_t lba;
+ uint32_t lba_count;
+ int rc;
+
+ nr = cmd->cdw10_bits.dsm.nr + 1;
+ if (nr * sizeof(struct spdk_nvme_dsm_range) > req->length) {
+ SPDK_ERRLOG("Dataset Management number of ranges > SGL length\n");
+ response->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ if (unmap_ctx == NULL) {
+ unmap_ctx = calloc(1, sizeof(*unmap_ctx));
+ if (!unmap_ctx) {
+ response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ unmap_ctx->req = req;
+ unmap_ctx->desc = desc;
+ unmap_ctx->ch = ch;
+ unmap_ctx->bdev = bdev;
+
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_SUCCESS;
+ } else {
+ unmap_ctx->count--; /* dequeued */
+ }
+
+ dsm_range = (struct spdk_nvme_dsm_range *)req->data;
+ for (i = unmap_ctx->range_index; i < nr; i++) {
+ lba = dsm_range[i].starting_lba;
+ lba_count = dsm_range[i].length;
+
+ unmap_ctx->count++;
+
+ rc = spdk_bdev_unmap_blocks(desc, ch, lba, lba_count,
+ nvmf_bdev_ctrlr_unmap_cpl, unmap_ctx);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_bdev_ctrlr_unmap_resubmit, unmap_ctx);
+ /* Unmap was not yet submitted to bdev */
+ /* unmap_ctx->count will be decremented when the request is dequeued */
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ unmap_ctx->count--;
+ /* We can't return here - we may have to wait for any other
+ * unmaps already sent to complete */
+ break;
+ }
+ unmap_ctx->range_index++;
+ }
+
+ if (unmap_ctx->count == 0) {
+ free(unmap_ctx);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+int
+nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl;
+
+ if (cmd->cdw11_bits.dsm.ad) {
+ return nvmf_bdev_ctrlr_unmap(bdev, desc, ch, req, NULL);
+ }
+
+ response->status.sct = SPDK_NVME_SCT_GENERIC;
+ response->status.sc = SPDK_NVME_SC_SUCCESS;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+}
+
+int
+nvmf_bdev_ctrlr_nvme_passthru_io(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req)
+{
+ int rc;
+
+ rc = spdk_bdev_nvme_io_passthru(desc, ch, &req->cmd->nvme_cmd, req->data, req->length,
+ nvmf_bdev_ctrlr_complete_cmd, req);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_io_cmd_resubmit, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+int
+spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req,
+ spdk_nvmf_nvme_passthru_cmd_cb cb_fn)
+{
+ int rc;
+
+ req->cmd_cb_fn = cb_fn;
+
+ rc = spdk_bdev_nvme_admin_passthru(desc, ch, &req->cmd->nvme_cmd, req->data, req->length,
+ nvmf_bdev_ctrlr_complete_admin_cmd, req);
+ if (spdk_unlikely(rc)) {
+ if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_admin_cmd_resubmit, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ }
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+}
+
+static void
+nvmf_bdev_ctrlr_complete_abort_cmd(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_nvmf_request *req = cb_arg;
+
+ if (success) {
+ req->rsp->nvme_cpl.cdw0 &= ~1U;
+ }
+
+ spdk_nvmf_request_complete(req);
+ spdk_bdev_free_io(bdev_io);
+}
+
+int
+spdk_nvmf_bdev_ctrlr_abort_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req,
+ struct spdk_nvmf_request *req_to_abort)
+{
+ int rc;
+
+ assert((req->rsp->nvme_cpl.cdw0 & 1U) != 0);
+
+ rc = spdk_bdev_abort(desc, ch, req_to_abort, nvmf_bdev_ctrlr_complete_abort_cmd, req);
+ if (spdk_likely(rc == 0)) {
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ } else if (rc == -ENOMEM) {
+ nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_ctrlr_process_admin_cmd_resubmit, req);
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS;
+ } else {
+ return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE;
+ }
+}
+
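+/*
+ * Initialize a DIF context from the bdev's metadata/DIF configuration. The
+ * initial reference tag comes from the command's starting LBA; returns false
+ * if the bdev carries no metadata.
+ */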
+bool
+nvmf_bdev_ctrlr_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd,
+ struct spdk_dif_ctx *dif_ctx)
+{
+ uint32_t init_ref_tag, dif_check_flags = 0;
+ int rc;
+
+ if (spdk_bdev_get_md_size(bdev) == 0) {
+ return false;
+ }
+
+ /* Initial Reference Tag is the lower 32 bits of the start LBA. */
+ init_ref_tag = (uint32_t)from_le64(&cmd->cdw10);
+
+ if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) {
+ dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK;
+ }
+
+ if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) {
+ dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK;
+ }
+
+ rc = spdk_dif_ctx_init(dif_ctx,
+ spdk_bdev_get_block_size(bdev),
+ spdk_bdev_get_md_size(bdev),
+ spdk_bdev_is_md_interleaved(bdev),
+ spdk_bdev_is_dif_head_of_md(bdev),
+ spdk_bdev_get_dif_type(bdev),
+ dif_check_flags,
+ init_ref_tag, 0, 0, 0, 0);
+
+ return (rc == 0) ? true : false;
+}
diff --git a/src/spdk/lib/nvmf/ctrlr_discovery.c b/src/spdk/lib/nvmf/ctrlr_discovery.c
new file mode 100644
index 000000000..ab1c46ba1
--- /dev/null
+++ b/src/spdk/lib/nvmf/ctrlr_discovery.c
@@ -0,0 +1,159 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe over Fabrics discovery service
+ */
+
+#include "spdk/stdinc.h"
+
+#include "nvmf_internal.h"
+#include "transport.h"
+
+#include "spdk/string.h"
+#include "spdk/trace.h"
+#include "spdk/nvmf_spec.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+
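+/*
+ * Build a discovery log page for the given host: walk every active,
+ * non-discovery subsystem that allows the host and emit one log page entry
+ * per listener, growing the buffer with realloc() as entries are added.
+ */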
+static struct spdk_nvmf_discovery_log_page *
+nvmf_generate_discovery_log(struct spdk_nvmf_tgt *tgt, const char *hostnqn, size_t *log_page_size)
+{
+ uint64_t numrec = 0;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_subsystem_listener *listener;
+ struct spdk_nvmf_discovery_log_page_entry *entry;
+ struct spdk_nvmf_discovery_log_page *disc_log;
+ size_t cur_size;
+ uint32_t sid;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Generating log page for genctr %" PRIu64 "\n",
+ tgt->discovery_genctr);
+
+ cur_size = sizeof(struct spdk_nvmf_discovery_log_page);
+ disc_log = calloc(1, cur_size);
+ if (disc_log == NULL) {
+ SPDK_ERRLOG("Discovery log page memory allocation error\n");
+ return NULL;
+ }
+
+ for (sid = 0; sid < tgt->max_subsystems; sid++) {
+ subsystem = tgt->subsystems[sid];
+ if ((subsystem == NULL) ||
+ (subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE) ||
+ (subsystem->state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING)) {
+ continue;
+ }
+
+ if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
+ continue;
+ }
+
+ if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) {
+ continue;
+ }
+
+ for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL;
+ listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) {
+ size_t new_size = cur_size + sizeof(*entry);
+ void *new_log_page = realloc(disc_log, new_size);
+
+ if (new_log_page == NULL) {
+ SPDK_ERRLOG("Discovery log page memory allocation error\n");
+ break;
+ }
+
+ disc_log = new_log_page;
+ cur_size = new_size;
+
+ entry = &disc_log->entries[numrec];
+ memset(entry, 0, sizeof(*entry));
+ entry->portid = numrec;
+ entry->cntlid = 0xffff;
+ entry->asqsz = listener->transport->opts.max_aq_depth;
+ entry->subtype = subsystem->subtype;
+ snprintf(entry->subnqn, sizeof(entry->subnqn), "%s", subsystem->subnqn);
+
+ nvmf_transport_listener_discover(listener->transport, listener->trid, entry);
+
+ numrec++;
+ }
+ }
+
+ disc_log->numrec = numrec;
+ disc_log->genctr = tgt->discovery_genctr;
+ *log_page_size = cur_size;
+
+ return disc_log;
+}
+
+void
+nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, const char *hostnqn, struct iovec *iov,
+ uint32_t iovcnt, uint64_t offset, uint32_t length)
+{
+ size_t copy_len = 0;
+ size_t zero_len = 0;
+ struct iovec *tmp;
+ size_t log_page_size = 0;
+ struct spdk_nvmf_discovery_log_page *discovery_log_page;
+
+ discovery_log_page = nvmf_generate_discovery_log(tgt, hostnqn, &log_page_size);
+
+ /* Copy the valid part of the discovery log page, if any */
+ if (discovery_log_page) {
+ for (tmp = iov; tmp < iov + iovcnt; tmp++) {
+ copy_len = spdk_min(tmp->iov_len, length);
+ copy_len = spdk_min(log_page_size - offset, copy_len);
+
+ memcpy(tmp->iov_base, (char *)discovery_log_page + offset, copy_len);
+
+ offset += copy_len;
+ length -= copy_len;
+ zero_len = tmp->iov_len - copy_len;
+ if (log_page_size <= offset || length == 0) {
+ break;
+ }
+ }
+ /* Zero out the rest of the payload */
+ if (zero_len) {
+ memset((char *)tmp->iov_base + copy_len, 0, zero_len);
+ }
+
+ for (++tmp; tmp < iov + iovcnt; tmp++) {
+ memset((char *)tmp->iov_base, 0, tmp->iov_len);
+ }
+
+ free(discovery_log_page);
+ }
+}
diff --git a/src/spdk/lib/nvmf/fc.c b/src/spdk/lib/nvmf/fc.c
new file mode 100644
index 000000000..678cfc681
--- /dev/null
+++ b/src/spdk/lib/nvmf/fc.c
@@ -0,0 +1,3957 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (c) 2018-2019 Broadcom. All Rights Reserved.
+ * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe_FC transport functions.
+ */
+
+#include "spdk/env.h"
+#include "spdk/assert.h"
+#include "spdk/nvmf_transport.h"
+#include "spdk/string.h"
+#include "spdk/trace.h"
+#include "spdk/util.h"
+#include "spdk/likely.h"
+#include "spdk/endian.h"
+#include "spdk/log.h"
+#include "spdk/thread.h"
+
+#include "spdk_internal/log.h"
+
+#include "nvmf_fc.h"
+#include "fc_lld.h"
+
+#ifndef DEV_VERIFY
+#define DEV_VERIFY assert
+#endif
+
+#ifndef ASSERT_SPDK_FC_MASTER_THREAD
+#define ASSERT_SPDK_FC_MASTER_THREAD() \
+ DEV_VERIFY(spdk_get_thread() == nvmf_fc_get_master_thread());
+#endif
+
+/*
+ * PRLI service parameters
+ */
+enum spdk_nvmf_fc_service_parameters {
+ SPDK_NVMF_FC_FIRST_BURST_SUPPORTED = 0x0001,
+ SPDK_NVMF_FC_DISCOVERY_SERVICE = 0x0008,
+ SPDK_NVMF_FC_TARGET_FUNCTION = 0x0010,
+ SPDK_NVMF_FC_INITIATOR_FUNCTION = 0x0020,
+ SPDK_NVMF_FC_CONFIRMED_COMPLETION_SUPPORTED = 0x0080,
+};
+
+static char *fc_req_state_strs[] = {
+ "SPDK_NVMF_FC_REQ_INIT",
+ "SPDK_NVMF_FC_REQ_READ_BDEV",
+ "SPDK_NVMF_FC_REQ_READ_XFER",
+ "SPDK_NVMF_FC_REQ_READ_RSP",
+ "SPDK_NVMF_FC_REQ_WRITE_BUFFS",
+ "SPDK_NVMF_FC_REQ_WRITE_XFER",
+ "SPDK_NVMF_FC_REQ_WRITE_BDEV",
+ "SPDK_NVMF_FC_REQ_WRITE_RSP",
+ "SPDK_NVMF_FC_REQ_NONE_BDEV",
+ "SPDK_NVMF_FC_REQ_NONE_RSP",
+ "SPDK_NVMF_FC_REQ_SUCCESS",
+ "SPDK_NVMF_FC_REQ_FAILED",
+ "SPDK_NVMF_FC_REQ_ABORTED",
+ "SPDK_NVMF_FC_REQ_BDEV_ABORTED",
+ "SPDK_NVMF_FC_REQ_PENDING"
+};
+
+#define OBJECT_NVMF_FC_IO 0xA0
+
+#define TRACE_GROUP_NVMF_FC 0x8
+#define TRACE_FC_REQ_INIT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x01)
+#define TRACE_FC_REQ_READ_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x02)
+#define TRACE_FC_REQ_READ_XFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x03)
+#define TRACE_FC_REQ_READ_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x04)
+#define TRACE_FC_REQ_WRITE_BUFFS SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x05)
+#define TRACE_FC_REQ_WRITE_XFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x06)
+#define TRACE_FC_REQ_WRITE_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x07)
+#define TRACE_FC_REQ_WRITE_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x08)
+#define TRACE_FC_REQ_NONE_BDEV SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x09)
+#define TRACE_FC_REQ_NONE_RSP SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0A)
+#define TRACE_FC_REQ_SUCCESS SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0B)
+#define TRACE_FC_REQ_FAILED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0C)
+#define TRACE_FC_REQ_ABORTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0D)
+#define TRACE_FC_REQ_BDEV_ABORTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0E)
+#define TRACE_FC_REQ_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_FC, 0x0F)
+
+SPDK_TRACE_REGISTER_FN(nvmf_fc_trace, "nvmf_fc", TRACE_GROUP_NVMF_FC)
+{
+ spdk_trace_register_object(OBJECT_NVMF_FC_IO, 'r');
+ spdk_trace_register_description("FC_REQ_NEW",
+ TRACE_FC_REQ_INIT,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 1, 1, "");
+ spdk_trace_register_description("FC_REQ_READ_SUBMIT_TO_BDEV",
+ TRACE_FC_REQ_READ_BDEV,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_READ_XFER_DATA",
+ TRACE_FC_REQ_READ_XFER,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_READ_RSP",
+ TRACE_FC_REQ_READ_RSP,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_WRITE_NEED_BUFFER",
+ TRACE_FC_REQ_WRITE_BUFFS,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_WRITE_XFER_DATA",
+ TRACE_FC_REQ_WRITE_XFER,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_WRITE_SUBMIT_TO_BDEV",
+ TRACE_FC_REQ_WRITE_BDEV,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_WRITE_RSP",
+ TRACE_FC_REQ_WRITE_RSP,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_NONE_SUBMIT_TO_BDEV",
+ TRACE_FC_REQ_NONE_BDEV,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_NONE_RSP",
+ TRACE_FC_REQ_NONE_RSP,
+ OWNER_NONE, OBJECT_NVMF_FC_IO, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_SUCCESS",
+ TRACE_FC_REQ_SUCCESS,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("FC_REQ_FAILED",
+ TRACE_FC_REQ_FAILED,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("FC_REQ_ABORTED",
+ TRACE_FC_REQ_ABORTED,
+ OWNER_NONE, OBJECT_NONE, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_ABORTED_SUBMIT_TO_BDEV",
+ TRACE_FC_REQ_BDEV_ABORTED,
+ OWNER_NONE, OBJECT_NONE, 0, 1, "");
+ spdk_trace_register_description("FC_REQ_PENDING",
+ TRACE_FC_REQ_PENDING,
+ OWNER_NONE, OBJECT_NONE, 0, 1, "");
+}
+
+/**
+ * The structure used by all fc adm functions
+ */
+struct spdk_nvmf_fc_adm_api_data {
+ void *api_args;
+ spdk_nvmf_fc_callback cb_func;
+};
+
+/**
+ * The callback structure for nport-delete
+ */
+struct spdk_nvmf_fc_adm_nport_del_cb_data {
+ struct spdk_nvmf_fc_nport *nport;
+ uint8_t port_handle;
+ spdk_nvmf_fc_callback fc_cb_func;
+ void *fc_cb_ctx;
+};
+
+/**
+ * The callback structure for it-delete
+ */
+struct spdk_nvmf_fc_adm_i_t_del_cb_data {
+ struct spdk_nvmf_fc_nport *nport;
+ struct spdk_nvmf_fc_remote_port_info *rport;
+ uint8_t port_handle;
+ spdk_nvmf_fc_callback fc_cb_func;
+ void *fc_cb_ctx;
+};
+
+
+typedef void (*spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn)(void *arg, uint32_t err);
+
+/**
+ * The callback structure for the it-delete-assoc callback
+ */
+struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data {
+ struct spdk_nvmf_fc_nport *nport;
+ struct spdk_nvmf_fc_remote_port_info *rport;
+ uint8_t port_handle;
+ spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func;
+ void *cb_ctx;
+};
+
+/*
+ * Callback function pointer for HW port quiesce.
+ */
+typedef void (*spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn)(void *ctx, int err);
+
+/**
+ * Context structure for quiescing a hardware port
+ */
+struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx {
+ int quiesce_count;
+ void *ctx;
+ spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func;
+};
+
+/**
+ * Context structure used to reset a hardware port
+ */
+struct spdk_nvmf_fc_adm_hw_port_reset_ctx {
+ void *reset_args;
+ spdk_nvmf_fc_callback reset_cb_func;
+};
+
+/**
+ * The callback structure for HW port link break event
+ */
+struct spdk_nvmf_fc_adm_port_link_break_cb_data {
+ struct spdk_nvmf_hw_port_link_break_args *args;
+ struct spdk_nvmf_fc_nport_delete_args nport_del_args;
+ spdk_nvmf_fc_callback cb_func;
+};
+
+struct spdk_nvmf_fc_transport {
+ struct spdk_nvmf_transport transport;
+ pthread_mutex_t lock;
+};
+
+static struct spdk_nvmf_fc_transport *g_nvmf_ftransport;
+
+static TAILQ_HEAD(, spdk_nvmf_fc_port) g_spdk_nvmf_fc_port_list =
+ TAILQ_HEAD_INITIALIZER(g_spdk_nvmf_fc_port_list);
+
+static struct spdk_thread *g_nvmf_fc_master_thread = NULL;
+
+static uint32_t g_nvmf_fgroup_count = 0;
+static TAILQ_HEAD(, spdk_nvmf_fc_poll_group) g_nvmf_fgroups =
+ TAILQ_HEAD_INITIALIZER(g_nvmf_fgroups);
+
+struct spdk_thread *
+nvmf_fc_get_master_thread(void)
+{
+ return g_nvmf_fc_master_thread;
+}
+
+static inline void
+nvmf_fc_record_req_trace_point(struct spdk_nvmf_fc_request *fc_req,
+ enum spdk_nvmf_fc_request_state state)
+{
+ uint16_t tpoint_id = SPDK_TRACE_MAX_TPOINT_ID;
+
+ switch (state) {
+ case SPDK_NVMF_FC_REQ_INIT:
+ /* Start IO tracing */
+ tpoint_id = TRACE_FC_REQ_INIT;
+ break;
+ case SPDK_NVMF_FC_REQ_READ_BDEV:
+ tpoint_id = TRACE_FC_REQ_READ_BDEV;
+ break;
+ case SPDK_NVMF_FC_REQ_READ_XFER:
+ tpoint_id = TRACE_FC_REQ_READ_XFER;
+ break;
+ case SPDK_NVMF_FC_REQ_READ_RSP:
+ tpoint_id = TRACE_FC_REQ_READ_RSP;
+ break;
+ case SPDK_NVMF_FC_REQ_WRITE_BUFFS:
+ tpoint_id = TRACE_FC_REQ_WRITE_BUFFS;
+ break;
+ case SPDK_NVMF_FC_REQ_WRITE_XFER:
+ tpoint_id = TRACE_FC_REQ_WRITE_XFER;
+ break;
+ case SPDK_NVMF_FC_REQ_WRITE_BDEV:
+ tpoint_id = TRACE_FC_REQ_WRITE_BDEV;
+ break;
+ case SPDK_NVMF_FC_REQ_WRITE_RSP:
+ tpoint_id = TRACE_FC_REQ_WRITE_RSP;
+ break;
+ case SPDK_NVMF_FC_REQ_NONE_BDEV:
+ tpoint_id = TRACE_FC_REQ_NONE_BDEV;
+ break;
+ case SPDK_NVMF_FC_REQ_NONE_RSP:
+ tpoint_id = TRACE_FC_REQ_NONE_RSP;
+ break;
+ case SPDK_NVMF_FC_REQ_SUCCESS:
+ tpoint_id = TRACE_FC_REQ_SUCCESS;
+ break;
+ case SPDK_NVMF_FC_REQ_FAILED:
+ tpoint_id = TRACE_FC_REQ_FAILED;
+ break;
+ case SPDK_NVMF_FC_REQ_ABORTED:
+ tpoint_id = TRACE_FC_REQ_ABORTED;
+ break;
+ case SPDK_NVMF_FC_REQ_BDEV_ABORTED:
+ tpoint_id = TRACE_FC_REQ_BDEV_ABORTED;
+ break;
+ case SPDK_NVMF_FC_REQ_PENDING:
+ tpoint_id = TRACE_FC_REQ_PENDING;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ if (tpoint_id != SPDK_TRACE_MAX_TPOINT_ID) {
+ spdk_trace_record(tpoint_id, fc_req->poller_lcore, 0,
+ (uint64_t)(&fc_req->req), 0);
+ }
+}
+
+static void
+nvmf_fc_handle_connection_failure(void *arg)
+{
+ struct spdk_nvmf_fc_conn *fc_conn = arg;
+ struct spdk_nvmf_fc_ls_add_conn_api_data *api_data = NULL;
+
+ if (!fc_conn->create_opd) {
+ return;
+ }
+ api_data = &fc_conn->create_opd->u.add_conn;
+
+ nvmf_fc_ls_add_conn_failure(api_data->assoc, api_data->ls_rqst,
+ api_data->args.fc_conn, api_data->aq_conn);
+}
+
+static void
+nvmf_fc_handle_assoc_deletion(void *arg)
+{
+ struct spdk_nvmf_fc_conn *fc_conn = arg;
+
+ nvmf_fc_delete_association(fc_conn->fc_assoc->tgtport,
+ fc_conn->fc_assoc->assoc_id, false, true, NULL, NULL);
+}
+
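+/*
+ * Pre-allocate one spdk_nvmf_fc_request per RQ entry for this HWQP so that
+ * request allocation on the I/O path is a simple free-list operation that
+ * cannot fail while RQ buffers are still available.
+ */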
+static int
+nvmf_fc_create_req_mempool(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ uint32_t i;
+ struct spdk_nvmf_fc_request *fc_req;
+
+ TAILQ_INIT(&hwqp->free_reqs);
+ TAILQ_INIT(&hwqp->in_use_reqs);
+
+ hwqp->fc_reqs_buf = calloc(hwqp->rq_size, sizeof(struct spdk_nvmf_fc_request));
+ if (hwqp->fc_reqs_buf == NULL) {
+ SPDK_ERRLOG("create fc request pool failed\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < hwqp->rq_size; i++) {
+ fc_req = hwqp->fc_reqs_buf + i;
+
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_INIT);
+ TAILQ_INSERT_TAIL(&hwqp->free_reqs, fc_req, link);
+ }
+
+ return 0;
+}
+
+static inline struct spdk_nvmf_fc_request *
+nvmf_fc_hwqp_alloc_fc_request(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ struct spdk_nvmf_fc_request *fc_req;
+
+ if (TAILQ_EMPTY(&hwqp->free_reqs)) {
+ SPDK_ERRLOG("Alloc request buffer failed\n");
+ return NULL;
+ }
+
+ fc_req = TAILQ_FIRST(&hwqp->free_reqs);
+ TAILQ_REMOVE(&hwqp->free_reqs, fc_req, link);
+
+ memset(fc_req, 0, sizeof(struct spdk_nvmf_fc_request));
+ TAILQ_INSERT_TAIL(&hwqp->in_use_reqs, fc_req, link);
+ TAILQ_INIT(&fc_req->abort_cbs);
+ return fc_req;
+}
+
+static inline void
+nvmf_fc_hwqp_free_fc_request(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_request *fc_req)
+{
+ if (fc_req->state != SPDK_NVMF_FC_REQ_SUCCESS) {
+ /* Mark the request as failed; the state change also records a trace point for debugging. */
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_FAILED);
+ }
+
+ /* set the magic to mark req as no longer valid. */
+ fc_req->magic = 0xDEADBEEF;
+
+ TAILQ_REMOVE(&hwqp->in_use_reqs, fc_req, link);
+ TAILQ_INSERT_HEAD(&hwqp->free_reqs, fc_req, link);
+}
+
+static inline bool
+nvmf_fc_req_in_get_buff(struct spdk_nvmf_fc_request *fc_req)
+{
+ switch (fc_req->state) {
+ case SPDK_NVMF_FC_REQ_WRITE_BUFFS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void
+nvmf_fc_init_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ nvmf_fc_init_rqpair_buffers(hwqp);
+}
+
+struct spdk_nvmf_fc_conn *
+nvmf_fc_hwqp_find_fc_conn(struct spdk_nvmf_fc_hwqp *hwqp, uint64_t conn_id)
+{
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ TAILQ_FOREACH(fc_conn, &hwqp->connection_list, link) {
+ if (fc_conn->conn_id == conn_id) {
+ return fc_conn;
+ }
+ }
+
+ return NULL;
+}
+
+void
+nvmf_fc_hwqp_reinit_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp, void *queues_curr)
+{
+ struct spdk_nvmf_fc_abts_ctx *ctx;
+ struct spdk_nvmf_fc_poller_api_queue_sync_args *args = NULL, *tmp = NULL;
+
+ /* Clean up any pending sync callbacks */
+ TAILQ_FOREACH_SAFE(args, &hwqp->sync_cbs, link, tmp) {
+ TAILQ_REMOVE(&hwqp->sync_cbs, args, link);
+ ctx = args->cb_info.cb_data;
+ if (ctx) {
+ if (++ctx->hwqps_responded == ctx->num_hwqps) {
+ free(ctx->sync_poller_args);
+ free(ctx->abts_poller_args);
+ free(ctx);
+ }
+ }
+ }
+
+ nvmf_fc_reinit_q(hwqp->queues, queues_curr);
+}
+
+void
+nvmf_fc_init_hwqp(struct spdk_nvmf_fc_port *fc_port, struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ hwqp->fc_port = fc_port;
+
+ /* clear counters */
+ memset(&hwqp->counters, 0, sizeof(struct spdk_nvmf_fc_errors));
+
+ nvmf_fc_init_poller_queues(hwqp);
+ if (&fc_port->ls_queue != hwqp) {
+ nvmf_fc_create_req_mempool(hwqp);
+ }
+
+ nvmf_fc_init_q(hwqp);
+ TAILQ_INIT(&hwqp->connection_list);
+ TAILQ_INIT(&hwqp->sync_cbs);
+ TAILQ_INIT(&hwqp->ls_pending_queue);
+}
+
+static struct spdk_nvmf_fc_poll_group *
+nvmf_fc_get_idlest_poll_group(void)
+{
+ uint32_t max_count = UINT32_MAX;
+ struct spdk_nvmf_fc_poll_group *fgroup;
+ struct spdk_nvmf_fc_poll_group *ret_fgroup = NULL;
+
+ /* Find the poll group with the fewest hwqps assigned to it. */
+ TAILQ_FOREACH(fgroup, &g_nvmf_fgroups, link) {
+ if (fgroup->hwqp_count < max_count) {
+ ret_fgroup = fgroup;
+ max_count = fgroup->hwqp_count;
+ }
+ }
+
+ return ret_fgroup;
+}
+
+void
+nvmf_fc_poll_group_add_hwqp(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ struct spdk_nvmf_fc_poll_group *fgroup = NULL;
+
+ assert(hwqp);
+ if (hwqp == NULL) {
+ SPDK_ERRLOG("Error: hwqp is NULL\n");
+ return;
+ }
+
+ assert(g_nvmf_fgroup_count);
+
+ fgroup = nvmf_fc_get_idlest_poll_group();
+ if (!fgroup) {
+ SPDK_ERRLOG("Could not assign poll group for hwqp (%d)\n", hwqp->hwqp_id);
+ return;
+ }
+
+ hwqp->thread = fgroup->group.group->thread;
+ hwqp->fgroup = fgroup;
+ fgroup->hwqp_count++;
+ nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_ADD_HWQP, NULL);
+}
+
+void
+nvmf_fc_poll_group_remove_hwqp(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ assert(hwqp);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC,
+ "Remove hwqp from poller: for port: %d, hwqp: %d\n",
+ hwqp->fc_port->port_hdl, hwqp->hwqp_id);
+
+ if (!hwqp->fgroup) {
+ SPDK_ERRLOG("HWQP (%d) not assigned to poll group\n", hwqp->hwqp_id);
+ } else {
+ hwqp->fgroup->hwqp_count--;
+ nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP, NULL);
+ }
+}
+
+/*
+ * Note: This must be called only from the master poller/thread, since it
+ * increments a non-atomic static counter.
+ */
+static uint64_t
+nvmf_fc_get_abts_unique_id(void)
+{
+ static uint32_t u_id = 0;
+
+ return (uint64_t)(++u_id);
+}
+
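+/*
+ * Per-HWQP completion callback for a queue-sync request. Once every HWQP has
+ * responded, the sync poller args are freed, the context is marked as
+ * queue-synced and the original ABTS is replayed to the pollers, so that a
+ * second OXID-not-found result is treated as final.
+ */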
+static void
+nvmf_fc_queue_synced_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret)
+{
+ struct spdk_nvmf_fc_abts_ctx *ctx = cb_data;
+ struct spdk_nvmf_fc_poller_api_abts_recvd_args *args, *poller_arg;
+
+ ctx->hwqps_responded++;
+
+ if (ctx->hwqps_responded < ctx->num_hwqps) {
+ /* Wait for all pollers to complete. */
+ return;
+ }
+
+ /* Free the queue sync poller args. */
+ free(ctx->sync_poller_args);
+
+ /* Mark as queue synced */
+ ctx->queue_synced = true;
+
+ /* Reset the ctx values */
+ ctx->hwqps_responded = 0;
+ ctx->handled = false;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC,
+ "QueueSync(0x%lx) completed for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid);
+
+ /* Resend ABTS to pollers */
+ args = ctx->abts_poller_args;
+ for (int i = 0; i < ctx->num_hwqps; i++) {
+ poller_arg = args + i;
+ nvmf_fc_poller_api_func(poller_arg->hwqp,
+ SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED,
+ poller_arg);
+ }
+}
+
+static int
+nvmf_fc_handle_abts_notfound(struct spdk_nvmf_fc_abts_ctx *ctx)
+{
+ struct spdk_nvmf_fc_poller_api_queue_sync_args *args, *poller_arg;
+ struct spdk_nvmf_fc_poller_api_abts_recvd_args *abts_args, *abts_poller_arg;
+
+ /* check if FC driver supports queue sync */
+ if (!nvmf_fc_q_sync_available()) {
+ return -EPERM;
+ }
+
+ assert(ctx);
+ if (!ctx) {
+ SPDK_ERRLOG("NULL ctx pointer");
+ return -EINVAL;
+ }
+
+ /* Reset the ctx values */
+ ctx->hwqps_responded = 0;
+
+ args = calloc(ctx->num_hwqps,
+ sizeof(struct spdk_nvmf_fc_poller_api_queue_sync_args));
+ if (!args) {
+ SPDK_ERRLOG("QueueSync(0x%lx) failed for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid);
+ return -ENOMEM;
+ }
+ ctx->sync_poller_args = args;
+
+ abts_args = ctx->abts_poller_args;
+ for (int i = 0; i < ctx->num_hwqps; i++) {
+ abts_poller_arg = abts_args + i;
+ poller_arg = args + i;
+ poller_arg->u_id = ctx->u_id;
+ poller_arg->hwqp = abts_poller_arg->hwqp;
+ poller_arg->cb_info.cb_func = nvmf_fc_queue_synced_cb;
+ poller_arg->cb_info.cb_data = ctx;
+ poller_arg->cb_info.cb_thread = spdk_get_thread();
+
+ /* Send a Queue sync message to interested pollers */
+ nvmf_fc_poller_api_func(poller_arg->hwqp,
+ SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC,
+ poller_arg);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC,
+ "QueueSync(0x%lx) Sent for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid);
+
+ /* Post Marker to queue to track aborted request */
+ nvmf_fc_issue_q_sync(ctx->ls_hwqp, ctx->u_id, ctx->fcp_rq_id);
+
+ return 0;
+}
+
+static void
+nvmf_fc_abts_handled_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret)
+{
+ struct spdk_nvmf_fc_abts_ctx *ctx = cb_data;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+
+ if (ret != SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND) {
+ ctx->handled = true;
+ }
+
+ ctx->hwqps_responded++;
+
+ if (ctx->hwqps_responded < ctx->num_hwqps) {
+ /* Wait for all pollers to complete. */
+ return;
+ }
+
+ nport = nvmf_fc_nport_find(ctx->port_hdl, ctx->nport_hdl);
+
+ if (ctx->nport != nport) {
+ /* Nport can be deleted while this abort is being
+ * processed by the pollers.
+ */
+ SPDK_NOTICELOG("nport_%d deleted while processing ABTS frame, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ ctx->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid);
+ } else {
+ if (!ctx->handled) {
+ /* Try syncing the queues and try one more time */
+ if (!ctx->queue_synced && (nvmf_fc_handle_abts_notfound(ctx) == 0)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC,
+ "QueueSync(0x%lx) for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ ctx->u_id, ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid);
+ return;
+ } else {
+ /* Send Reject */
+ nvmf_fc_xmt_bls_rsp(&ctx->nport->fc_port->ls_queue,
+ ctx->oxid, ctx->rxid, ctx->rpi, true,
+ FCNVME_BLS_REJECT_EXP_INVALID_OXID, NULL, NULL);
+ }
+ } else {
+ /* Send Accept */
+ nvmf_fc_xmt_bls_rsp(&ctx->nport->fc_port->ls_queue,
+ ctx->oxid, ctx->rxid, ctx->rpi, false,
+ 0, NULL, NULL);
+ }
+ }
+ SPDK_NOTICELOG("BLS_%s sent for ABTS frame nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ (ctx->handled) ? "ACC" : "REJ", ctx->nport->nport_hdl, ctx->rpi, ctx->oxid, ctx->rxid);
+
+ free(ctx->abts_poller_args);
+ free(ctx);
+}
+
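+/*
+ * Entry point for an ABTS frame. The ABTS is fanned out to every HWQP that
+ * hosts a connection for the given RPI; the per-HWQP results are aggregated
+ * in nvmf_fc_abts_handled_cb(), which finally transmits a BLS accept or
+ * reject on the port's LS queue.
+ */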
+void
+nvmf_fc_handle_abts_frame(struct spdk_nvmf_fc_nport *nport, uint16_t rpi,
+ uint16_t oxid, uint16_t rxid)
+{
+ struct spdk_nvmf_fc_abts_ctx *ctx = NULL;
+ struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = NULL, *poller_arg;
+ struct spdk_nvmf_fc_association *assoc = NULL;
+ struct spdk_nvmf_fc_conn *conn = NULL;
+ uint32_t hwqp_cnt = 0;
+ bool skip_hwqp_cnt;
+ struct spdk_nvmf_fc_hwqp **hwqps = NULL;
+ uint32_t i;
+
+ SPDK_NOTICELOG("Handle ABTS frame for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ nport->nport_hdl, rpi, oxid, rxid);
+
+ /* Allocate memory to track hwqps with at least one active connection. */
+ hwqps = calloc(nport->fc_port->num_io_queues, sizeof(struct spdk_nvmf_fc_hwqp *));
+ if (hwqps == NULL) {
+ SPDK_ERRLOG("Unable to allocate temp. hwqp array for abts processing!\n");
+ goto bls_rej;
+ }
+
+ TAILQ_FOREACH(assoc, &nport->fc_associations, link) {
+ TAILQ_FOREACH(conn, &assoc->fc_conns, assoc_link) {
+ if (conn->rpi != rpi) {
+ continue;
+ }
+
+ skip_hwqp_cnt = false;
+ for (i = 0; i < hwqp_cnt; i++) {
+ if (hwqps[i] == conn->hwqp) {
+ /* Skip. This is already present */
+ skip_hwqp_cnt = true;
+ break;
+ }
+ }
+ if (!skip_hwqp_cnt) {
+ assert(hwqp_cnt < nport->fc_port->num_io_queues);
+ hwqps[hwqp_cnt] = conn->hwqp;
+ hwqp_cnt++;
+ }
+ }
+ }
+
+ if (!hwqp_cnt) {
+ goto bls_rej;
+ }
+
+ args = calloc(hwqp_cnt,
+ sizeof(struct spdk_nvmf_fc_poller_api_abts_recvd_args));
+ if (!args) {
+ goto bls_rej;
+ }
+
+ ctx = calloc(1, sizeof(struct spdk_nvmf_fc_abts_ctx));
+ if (!ctx) {
+ goto bls_rej;
+ }
+ ctx->rpi = rpi;
+ ctx->oxid = oxid;
+ ctx->rxid = rxid;
+ ctx->nport = nport;
+ ctx->nport_hdl = nport->nport_hdl;
+ ctx->port_hdl = nport->fc_port->port_hdl;
+ ctx->num_hwqps = hwqp_cnt;
+ ctx->ls_hwqp = &nport->fc_port->ls_queue;
+ ctx->fcp_rq_id = nport->fc_port->fcp_rq_id;
+ ctx->abts_poller_args = args;
+
+ /* Get a unique context for this ABTS */
+ ctx->u_id = nvmf_fc_get_abts_unique_id();
+
+ for (i = 0; i < hwqp_cnt; i++) {
+ poller_arg = args + i;
+ poller_arg->hwqp = hwqps[i];
+ poller_arg->cb_info.cb_func = nvmf_fc_abts_handled_cb;
+ poller_arg->cb_info.cb_data = ctx;
+ poller_arg->cb_info.cb_thread = spdk_get_thread();
+ poller_arg->ctx = ctx;
+
+ nvmf_fc_poller_api_func(poller_arg->hwqp,
+ SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED,
+ poller_arg);
+ }
+
+ free(hwqps);
+
+ return;
+bls_rej:
+ free(args);
+ free(hwqps);
+
+ /* Send Reject */
+ nvmf_fc_xmt_bls_rsp(&nport->fc_port->ls_queue, oxid, rxid, rpi,
+ true, FCNVME_BLS_REJECT_EXP_NOINFO, NULL, NULL);
+ SPDK_NOTICELOG("BLS_RJT for ABTS frame for nport: %d, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ nport->nport_hdl, rpi, oxid, rxid);
+ return;
+}
+
+/*** Accessor functions for the FC structures - BEGIN */
+/*
+ * Returns true if the port is in offline state.
+ */
+bool
+nvmf_fc_port_is_offline(struct spdk_nvmf_fc_port *fc_port)
+{
+ if (fc_port && (fc_port->hw_port_status == SPDK_FC_PORT_OFFLINE)) {
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Returns true if the port is in online state.
+ */
+bool
+nvmf_fc_port_is_online(struct spdk_nvmf_fc_port *fc_port)
+{
+ if (fc_port && (fc_port->hw_port_status == SPDK_FC_PORT_ONLINE)) {
+ return true;
+ }
+
+ return false;
+}
+
+int
+nvmf_fc_port_set_online(struct spdk_nvmf_fc_port *fc_port)
+{
+ if (fc_port && (fc_port->hw_port_status != SPDK_FC_PORT_ONLINE)) {
+ fc_port->hw_port_status = SPDK_FC_PORT_ONLINE;
+ return 0;
+ }
+
+ return -EPERM;
+}
+
+int
+nvmf_fc_port_set_offline(struct spdk_nvmf_fc_port *fc_port)
+{
+ if (fc_port && (fc_port->hw_port_status != SPDK_FC_PORT_OFFLINE)) {
+ fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE;
+ return 0;
+ }
+
+ return -EPERM;
+}
+
+int
+nvmf_fc_hwqp_set_online(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ if (hwqp && (hwqp->state != SPDK_FC_HWQP_ONLINE)) {
+ hwqp->state = SPDK_FC_HWQP_ONLINE;
+ /* reset some queue counters */
+ hwqp->num_conns = 0;
+ return nvmf_fc_set_q_online_state(hwqp, true);
+ }
+
+ return -EPERM;
+}
+
+int
+nvmf_fc_hwqp_set_offline(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ if (hwqp && (hwqp->state != SPDK_FC_HWQP_OFFLINE)) {
+ hwqp->state = SPDK_FC_HWQP_OFFLINE;
+ return nvmf_fc_set_q_online_state(hwqp, false);
+ }
+
+ return -EPERM;
+}
+
+void
+nvmf_fc_port_add(struct spdk_nvmf_fc_port *fc_port)
+{
+ TAILQ_INSERT_TAIL(&g_spdk_nvmf_fc_port_list, fc_port, link);
+}
+
+struct spdk_nvmf_fc_port *
+nvmf_fc_port_lookup(uint8_t port_hdl)
+{
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+
+ TAILQ_FOREACH(fc_port, &g_spdk_nvmf_fc_port_list, link) {
+ if (fc_port->port_hdl == port_hdl) {
+ return fc_port;
+ }
+ }
+ return NULL;
+}
+
+static void
+nvmf_fc_port_cleanup(void)
+{
+ struct spdk_nvmf_fc_port *fc_port, *tmp;
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ uint32_t i;
+
+ TAILQ_FOREACH_SAFE(fc_port, &g_spdk_nvmf_fc_port_list, link, tmp) {
+ TAILQ_REMOVE(&g_spdk_nvmf_fc_port_list, fc_port, link);
+ for (i = 0; i < fc_port->num_io_queues; i++) {
+ hwqp = &fc_port->io_queues[i];
+ if (hwqp->fc_reqs_buf) {
+ free(hwqp->fc_reqs_buf);
+ }
+ }
+ free(fc_port);
+ }
+}
+
+uint32_t
+nvmf_fc_get_prli_service_params(void)
+{
+ return (SPDK_NVMF_FC_DISCOVERY_SERVICE | SPDK_NVMF_FC_TARGET_FUNCTION);
+}
+
+int
+nvmf_fc_port_add_nport(struct spdk_nvmf_fc_port *fc_port,
+ struct spdk_nvmf_fc_nport *nport)
+{
+ if (fc_port) {
+ TAILQ_INSERT_TAIL(&fc_port->nport_list, nport, link);
+ fc_port->num_nports++;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+int
+nvmf_fc_port_remove_nport(struct spdk_nvmf_fc_port *fc_port,
+ struct spdk_nvmf_fc_nport *nport)
+{
+ if (fc_port && nport) {
+ TAILQ_REMOVE(&fc_port->nport_list, nport, link);
+ fc_port->num_nports--;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static struct spdk_nvmf_fc_nport *
+nvmf_fc_nport_hdl_lookup(struct spdk_nvmf_fc_port *fc_port, uint16_t nport_hdl)
+{
+ struct spdk_nvmf_fc_nport *fc_nport = NULL;
+
+ TAILQ_FOREACH(fc_nport, &fc_port->nport_list, link) {
+ if (fc_nport->nport_hdl == nport_hdl) {
+ return fc_nport;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_nvmf_fc_nport *
+nvmf_fc_nport_find(uint8_t port_hdl, uint16_t nport_hdl)
+{
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+
+ fc_port = nvmf_fc_port_lookup(port_hdl);
+ if (fc_port) {
+ return nvmf_fc_nport_hdl_lookup(fc_port, nport_hdl);
+ }
+
+ return NULL;
+}
+
+static inline int
+nvmf_fc_hwqp_find_nport_and_rport(struct spdk_nvmf_fc_hwqp *hwqp,
+ uint32_t d_id, struct spdk_nvmf_fc_nport **nport,
+ uint32_t s_id, struct spdk_nvmf_fc_remote_port_info **rport)
+{
+ struct spdk_nvmf_fc_nport *n_port;
+ struct spdk_nvmf_fc_remote_port_info *r_port;
+
+ assert(hwqp);
+ if (hwqp == NULL) {
+ SPDK_ERRLOG("Error: hwqp is NULL\n");
+ return -EINVAL;
+ }
+ assert(nport);
+ if (nport == NULL) {
+ SPDK_ERRLOG("Error: nport is NULL\n");
+ return -EINVAL;
+ }
+ assert(rport);
+ if (rport == NULL) {
+ SPDK_ERRLOG("Error: rport is NULL\n");
+ return -EINVAL;
+ }
+
+ TAILQ_FOREACH(n_port, &hwqp->fc_port->nport_list, link) {
+ if (n_port->d_id == d_id) {
+ TAILQ_FOREACH(r_port, &n_port->rem_port_list, link) {
+ if (r_port->s_id == s_id) {
+ *nport = n_port;
+ *rport = r_port;
+ return 0;
+ }
+ }
+ break;
+ }
+ }
+
+ return -ENOENT;
+}
+
+/* Returns true if the nport has no remote ports attached. */
+bool
+nvmf_fc_nport_has_no_rport(struct spdk_nvmf_fc_nport *nport)
+{
+ if (nport && TAILQ_EMPTY(&nport->rem_port_list)) {
+ assert(nport->rport_count == 0);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+int
+nvmf_fc_nport_set_state(struct spdk_nvmf_fc_nport *nport,
+ enum spdk_nvmf_fc_object_state state)
+{
+ if (nport) {
+ nport->nport_state = state;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
+bool
+nvmf_fc_nport_add_rem_port(struct spdk_nvmf_fc_nport *nport,
+ struct spdk_nvmf_fc_remote_port_info *rem_port)
+{
+ if (nport && rem_port) {
+ TAILQ_INSERT_TAIL(&nport->rem_port_list, rem_port, link);
+ nport->rport_count++;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
+bool
+nvmf_fc_nport_remove_rem_port(struct spdk_nvmf_fc_nport *nport,
+ struct spdk_nvmf_fc_remote_port_info *rem_port)
+{
+ if (nport && rem_port) {
+ TAILQ_REMOVE(&nport->rem_port_list, rem_port, link);
+ nport->rport_count--;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
+int
+nvmf_fc_rport_set_state(struct spdk_nvmf_fc_remote_port_info *rport,
+ enum spdk_nvmf_fc_object_state state)
+{
+ if (rport) {
+ rport->rport_state = state;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
+int
+nvmf_fc_assoc_set_state(struct spdk_nvmf_fc_association *assoc,
+ enum spdk_nvmf_fc_object_state state)
+{
+ if (assoc) {
+ assoc->assoc_state = state;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+
+static struct spdk_nvmf_fc_association *
+nvmf_ctrlr_get_fc_assoc(struct spdk_nvmf_ctrlr *ctrlr)
+{
+ struct spdk_nvmf_qpair *qpair = ctrlr->admin_qpair;
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ if (!qpair) {
+ SPDK_ERRLOG("Controller %d has no associations\n", ctrlr->cntlid);
+ return NULL;
+ }
+
+ fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair);
+
+ return fc_conn->fc_assoc;
+}
+
+bool
+nvmf_ctrlr_is_on_nport(uint8_t port_hdl, uint16_t nport_hdl,
+ struct spdk_nvmf_ctrlr *ctrlr)
+{
+ struct spdk_nvmf_fc_nport *fc_nport = NULL;
+ struct spdk_nvmf_fc_association *assoc = NULL;
+
+ if (!ctrlr) {
+ return false;
+ }
+
+ fc_nport = nvmf_fc_nport_find(port_hdl, nport_hdl);
+ if (!fc_nport) {
+ return false;
+ }
+
+ assoc = nvmf_ctrlr_get_fc_assoc(ctrlr);
+ if (assoc && assoc->tgtport == fc_nport) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC,
+ "Controller: %d corresponding to association: %p(%lu:%d) is on port: %d nport: %d\n",
+ ctrlr->cntlid, assoc, assoc->assoc_id, assoc->assoc_state, port_hdl,
+ nport_hdl);
+ return true;
+ }
+ return false;
+}
+
+static inline bool
+nvmf_fc_req_in_bdev(struct spdk_nvmf_fc_request *fc_req)
+{
+ switch (fc_req->state) {
+ case SPDK_NVMF_FC_REQ_READ_BDEV:
+ case SPDK_NVMF_FC_REQ_WRITE_BDEV:
+ case SPDK_NVMF_FC_REQ_NONE_BDEV:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool
+nvmf_fc_req_in_pending(struct spdk_nvmf_fc_request *fc_req)
+{
+ struct spdk_nvmf_request *tmp = NULL;
+
+ STAILQ_FOREACH(tmp, &fc_req->hwqp->fgroup->group.pending_buf_queue, buf_link) {
+ if (tmp == &fc_req->req) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static void
+nvmf_fc_req_bdev_abort(void *arg1)
+{
+ struct spdk_nvmf_fc_request *fc_req = arg1;
+ struct spdk_nvmf_ctrlr *ctrlr = fc_req->req.qpair->ctrlr;
+ int i;
+
+ /* Initial release - we don't have to abort Admin Queue or
+ * Fabric commands. The AQ commands supported at this time are
+ * Get-Log-Page,
+ * Identify
+ * Set Features
+ * Get Features
+ * AER -> Special case and handled differently.
+ * Every one of the above Admin commands (except AER) runs
+ * to completion, so an Abort of such commands doesn't
+ * make sense.
+ */
+ /* The Fabric commands supported are
+ * Property Set
+ * Property Get
+ * Connect -> Special case (async. handling). Not sure how to
+ * handle at this point. Let it run to completion.
+ */
+ for (i = 0; i < NVMF_MAX_ASYNC_EVENTS; i++) {
+ if (ctrlr->aer_req[i] == &fc_req->req) {
+ SPDK_NOTICELOG("Abort AER request\n");
+ nvmf_qpair_free_aer(fc_req->req.qpair);
+ }
+ }
+}
+
+void
+nvmf_fc_request_abort_complete(void *arg1)
+{
+ struct spdk_nvmf_fc_request *fc_req =
+ (struct spdk_nvmf_fc_request *)arg1;
+ struct spdk_nvmf_fc_caller_ctx *ctx = NULL, *tmp = NULL;
+
+ /* Request abort completed. Notify all the callbacks */
+ TAILQ_FOREACH_SAFE(ctx, &fc_req->abort_cbs, link, tmp) {
+ /* Notify */
+ ctx->cb(fc_req->hwqp, 0, ctx->cb_args);
+ /* Remove */
+ TAILQ_REMOVE(&fc_req->abort_cbs, ctx, link);
+ /* free */
+ free(ctx);
+ }
+
+ SPDK_NOTICELOG("FC Request(%p) in state: %s aborted\n", fc_req,
+ fc_req_state_strs[fc_req->state]);
+
+ _nvmf_fc_request_free(fc_req);
+}
+
+void
+nvmf_fc_request_abort(struct spdk_nvmf_fc_request *fc_req, bool send_abts,
+ spdk_nvmf_fc_caller_cb cb, void *cb_args)
+{
+ struct spdk_nvmf_fc_caller_ctx *ctx = NULL;
+ bool kill_req = false;
+
+ /* Add the cb to list */
+ if (cb) {
+ ctx = calloc(1, sizeof(struct spdk_nvmf_fc_caller_ctx));
+ if (!ctx) {
+ SPDK_ERRLOG("ctx alloc failed.\n");
+ return;
+ }
+ ctx->cb = cb;
+ ctx->cb_args = cb_args;
+
+ TAILQ_INSERT_TAIL(&fc_req->abort_cbs, ctx, link);
+ }
+
+ if (!fc_req->is_aborted) {
+ /* Increment aborted command counter */
+ fc_req->hwqp->counters.num_aborted++;
+ }
+
+ /* If port is dead, skip abort wqe */
+ kill_req = nvmf_fc_is_port_dead(fc_req->hwqp);
+ if (kill_req && nvmf_fc_req_in_xfer(fc_req)) {
+ fc_req->is_aborted = true;
+ goto complete;
+ }
+
+ /* Check if the request is already marked for deletion */
+ if (fc_req->is_aborted) {
+ return;
+ }
+
+ /* Mark request as aborted */
+ fc_req->is_aborted = true;
+
+ /* If xchg is allocated, then save if we need to send abts or not. */
+ if (fc_req->xchg) {
+ fc_req->xchg->send_abts = send_abts;
+ fc_req->xchg->aborted = true;
+ }
+
+ if (fc_req->state == SPDK_NVMF_FC_REQ_BDEV_ABORTED) {
+ /* Aborted by backend */
+ goto complete;
+ } else if (nvmf_fc_req_in_bdev(fc_req)) {
+ /* Notify bdev */
+ spdk_thread_send_msg(fc_req->hwqp->thread,
+ nvmf_fc_req_bdev_abort, (void *)fc_req);
+ } else if (nvmf_fc_req_in_xfer(fc_req)) {
+ /* Notify HBA to abort this exchange */
+ nvmf_fc_issue_abort(fc_req->hwqp, fc_req->xchg, NULL, NULL);
+ } else if (nvmf_fc_req_in_get_buff(fc_req)) {
+ /* Will be completed by request_complete callback. */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Abort req when getting buffers.\n");
+ } else if (nvmf_fc_req_in_pending(fc_req)) {
+ /* Remove from pending */
+ STAILQ_REMOVE(&fc_req->hwqp->fgroup->group.pending_buf_queue, &fc_req->req,
+ spdk_nvmf_request, buf_link);
+ goto complete;
+ } else {
+ /* Should never happen */
+ SPDK_ERRLOG("Request in invalid state\n");
+ goto complete;
+ }
+
+ return;
+complete:
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_ABORTED);
+ nvmf_fc_poller_api_func(fc_req->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE,
+ (void *)fc_req);
+}
+
+static int
+nvmf_fc_request_alloc_buffers(struct spdk_nvmf_fc_request *fc_req)
+{
+ uint32_t length = fc_req->req.length;
+ struct spdk_nvmf_fc_poll_group *fgroup = fc_req->hwqp->fgroup;
+ struct spdk_nvmf_transport_poll_group *group = &fgroup->group;
+ struct spdk_nvmf_transport *transport = group->transport;
+
+ if (spdk_nvmf_request_get_buffers(&fc_req->req, group, transport, length)) {
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int
+nvmf_fc_request_execute(struct spdk_nvmf_fc_request *fc_req)
+{
+ /* Allocate an XCHG if we don't use send frame for this command. */
+ if (!nvmf_fc_use_send_frame(&fc_req->req)) {
+ fc_req->xchg = nvmf_fc_get_xri(fc_req->hwqp);
+ if (!fc_req->xchg) {
+ fc_req->hwqp->counters.no_xchg++;
+ SPDK_ERRLOG("No XCHGs available.\n");
+ goto pending;
+ }
+ }
+
+ if (fc_req->req.length) {
+ if (nvmf_fc_request_alloc_buffers(fc_req) < 0) {
+ fc_req->hwqp->counters.buf_alloc_err++;
+ goto pending;
+ }
+ fc_req->req.data = fc_req->req.iov[0].iov_base;
+ }
+
+ if (fc_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "WRITE CMD.\n");
+
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_WRITE_XFER);
+
+ if (nvmf_fc_recv_data(fc_req)) {
+ /* Dropped. Return success to the caller. */
+ fc_req->hwqp->counters.unexpected_err++;
+ _nvmf_fc_request_free(fc_req);
+ }
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "READ/NONE CMD\n");
+
+ if (fc_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_BDEV);
+ } else {
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_NONE_BDEV);
+ }
+ spdk_nvmf_request_exec(&fc_req->req);
+ }
+
+ return 0;
+
+pending:
+ if (fc_req->xchg) {
+ nvmf_fc_put_xchg(fc_req->hwqp, fc_req->xchg);
+ fc_req->xchg = NULL;
+ }
+
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_PENDING);
+
+ return -EAGAIN;
+}
+
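+/*
+ * Handle an FC-NVMe command IU: validate the IU, look up the connection by
+ * connection id, allocate an fc_req from the HWQP pool and either execute
+ * the request or (on resource shortage) leave it on the poll group's
+ * pending buffer queue.
+ */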
+static int
+nvmf_fc_hwqp_handle_request(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_frame_hdr *frame,
+ uint32_t buf_idx, struct spdk_nvmf_fc_buffer_desc *buffer, uint32_t plen)
+{
+ uint16_t cmnd_len;
+ uint64_t rqst_conn_id;
+ struct spdk_nvmf_fc_request *fc_req = NULL;
+ struct spdk_nvmf_fc_cmnd_iu *cmd_iu = NULL;
+ struct spdk_nvmf_fc_conn *fc_conn = NULL;
+ enum spdk_nvme_data_transfer xfer;
+
+ cmd_iu = buffer->virt;
+ cmnd_len = cmd_iu->cmnd_iu_len;
+ cmnd_len = from_be16(&cmnd_len);
+
+ /* check for a valid cmnd_iu format */
+ if ((cmd_iu->fc_id != FCNVME_CMND_IU_FC_ID) ||
+ (cmd_iu->scsi_id != FCNVME_CMND_IU_SCSI_ID) ||
+ (cmnd_len != sizeof(struct spdk_nvmf_fc_cmnd_iu) / 4)) {
+ SPDK_ERRLOG("IU CMD error\n");
+ hwqp->counters.nvme_cmd_iu_err++;
+ return -ENXIO;
+ }
+
+ xfer = spdk_nvme_opc_get_data_transfer(cmd_iu->flags);
+ if (xfer == SPDK_NVME_DATA_BIDIRECTIONAL) {
+ SPDK_ERRLOG("IU CMD xfer error\n");
+ hwqp->counters.nvme_cmd_xfer_err++;
+ return -EPERM;
+ }
+
+ rqst_conn_id = from_be64(&cmd_iu->conn_id);
+
+ /* Check if conn id is valid */
+ fc_conn = nvmf_fc_hwqp_find_fc_conn(hwqp, rqst_conn_id);
+ if (!fc_conn) {
+ SPDK_ERRLOG("IU CMD conn(%ld) invalid\n", rqst_conn_id);
+ hwqp->counters.invalid_conn_err++;
+ return -ENODEV;
+ }
+
+ /* If association/connection is being deleted - return */
+ if (fc_conn->fc_assoc->assoc_state != SPDK_NVMF_FC_OBJECT_CREATED) {
+ SPDK_ERRLOG("Association state not valid\n");
+ return -EACCES;
+ }
+
+ if (fc_conn->qpair.state == SPDK_NVMF_QPAIR_ERROR) {
+ return -EACCES;
+ }
+
+ /* Make sure xfer len is according to mdts */
+ if (from_be32(&cmd_iu->data_len) >
+ hwqp->fgroup->group.transport->opts.max_io_size) {
+ SPDK_ERRLOG("IO length requested is greater than MDTS\n");
+ return -EINVAL;
+ }
+
+ /* allocate a request buffer */
+ fc_req = nvmf_fc_hwqp_alloc_fc_request(hwqp);
+ if (fc_req == NULL) {
+ /* Should not happen, since the number of fc_reqs equals the number of RQ buffers. */
+ return -ENOMEM;
+ }
+
+ fc_req->req.length = from_be32(&cmd_iu->data_len);
+ fc_req->req.qpair = &fc_conn->qpair;
+ fc_req->req.cmd = (union nvmf_h2c_msg *)&cmd_iu->cmd;
+ fc_req->req.rsp = (union nvmf_c2h_msg *)&fc_req->ersp.rsp;
+ fc_req->oxid = frame->ox_id;
+ fc_req->oxid = from_be16(&fc_req->oxid);
+ fc_req->rpi = fc_conn->rpi;
+ fc_req->buf_index = buf_idx;
+ fc_req->poller_lcore = hwqp->lcore_id;
+ fc_req->poller_thread = hwqp->thread;
+ fc_req->hwqp = hwqp;
+ fc_req->fc_conn = fc_conn;
+ fc_req->req.xfer = xfer;
+ fc_req->s_id = (uint32_t)frame->s_id;
+ fc_req->d_id = (uint32_t)frame->d_id;
+ fc_req->s_id = from_be32(&fc_req->s_id) >> 8;
+ fc_req->d_id = from_be32(&fc_req->d_id) >> 8;
+
+ nvmf_fc_record_req_trace_point(fc_req, SPDK_NVMF_FC_REQ_INIT);
+ if (nvmf_fc_request_execute(fc_req)) {
+ STAILQ_INSERT_TAIL(&hwqp->fgroup->group.pending_buf_queue, &fc_req->req, buf_link);
+ }
+
+ return 0;
+}
+
+/*
+ * These functions are called from the FC LLD
+ */
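+
+/*
+ * Hypothetical usage sketch (not part of any specific LLD): a vendor poller
+ * that has received a frame and its RQ buffer from hardware is expected to
+ * hand it to this layer roughly as:
+ *
+ *     rc = nvmf_fc_hwqp_process_frame(hwqp, buff_idx, frame, buffer, plen);
+ *
+ * How errors and buffer ownership are handled on failure is LLD specific;
+ * this sketch only shows the intended call direction into this file.
+ */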
+
+void
+_nvmf_fc_request_free(struct spdk_nvmf_fc_request *fc_req)
+{
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ struct spdk_nvmf_fc_poll_group *fgroup;
+ struct spdk_nvmf_transport_poll_group *group;
+ struct spdk_nvmf_transport *transport;
+
+ /* Check for NULL before dereferencing fc_req. */
+ if (!fc_req) {
+ return;
+ }
+
+ hwqp = fc_req->hwqp;
+ fgroup = hwqp->fgroup;
+ group = &fgroup->group;
+ transport = group->transport;
+
+ if (fc_req->xchg) {
+ nvmf_fc_put_xchg(hwqp, fc_req->xchg);
+ fc_req->xchg = NULL;
+ }
+
+ /* Release IO buffers */
+ if (fc_req->req.data_from_pool) {
+ spdk_nvmf_request_free_buffers(&fc_req->req, group, transport);
+ }
+ fc_req->req.data = NULL;
+ fc_req->req.iovcnt = 0;
+
+ /* Release Q buffer */
+ nvmf_fc_rqpair_buffer_release(hwqp, fc_req->buf_index);
+
+ /* Free Fc request */
+ nvmf_fc_hwqp_free_fc_request(hwqp, fc_req);
+}
+
+void
+nvmf_fc_request_set_state(struct spdk_nvmf_fc_request *fc_req,
+ enum spdk_nvmf_fc_request_state state)
+{
+ assert(fc_req->magic != 0xDEADBEEF);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC,
+ "FC Request(%p):\n\tState Old:%s New:%s\n", fc_req,
+ nvmf_fc_request_get_state_str(fc_req->state),
+ nvmf_fc_request_get_state_str(state));
+ nvmf_fc_record_req_trace_point(fc_req, state);
+ fc_req->state = state;
+}
+
+char *
+nvmf_fc_request_get_state_str(int state)
+{
+ static char *unk_str = "unknown";
+
+ return (state >= 0 && state < (int)(sizeof(fc_req_state_strs) / sizeof(char *)) ?
+ fc_req_state_strs[state] : unk_str);
+}
+
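+/*
+ * Dispatch a received frame after validating its nport/rport: LS requests
+ * (R_CTL LS request + NVMF data type) go to the LS handler (or to the
+ * ls_pending_queue if no XCHG is available), command frames go to
+ * nvmf_fc_hwqp_handle_request(), and anything else is counted and dropped.
+ */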
+int
+nvmf_fc_hwqp_process_frame(struct spdk_nvmf_fc_hwqp *hwqp,
+ uint32_t buff_idx,
+ struct spdk_nvmf_fc_frame_hdr *frame,
+ struct spdk_nvmf_fc_buffer_desc *buffer,
+ uint32_t plen)
+{
+ int rc = 0;
+ uint32_t s_id, d_id;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ struct spdk_nvmf_fc_remote_port_info *rport = NULL;
+
+ s_id = (uint32_t)frame->s_id;
+ d_id = (uint32_t)frame->d_id;
+ s_id = from_be32(&s_id) >> 8;
+ d_id = from_be32(&d_id) >> 8;
+
+ /* Note: In the tracelog below, we do the endian conversion on ox_id and
+ * rx_id inline. Since these are fields, we can't pass their address to
+ * from_be16(), and because they are only needed for the tracelog, copying
+ * them to local variables just for the conversion would be a waste of time
+ * in non-debug builds. */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC,
+ "Process NVME frame s_id:0x%x d_id:0x%x oxid:0x%x rxid:0x%x.\n",
+ s_id, d_id,
+ ((frame->ox_id << 8) & 0xff00) | ((frame->ox_id >> 8) & 0xff),
+ ((frame->rx_id << 8) & 0xff00) | ((frame->rx_id >> 8) & 0xff));
+
+ rc = nvmf_fc_hwqp_find_nport_and_rport(hwqp, d_id, &nport, s_id, &rport);
+ if (rc) {
+ if (nport == NULL) {
+ SPDK_ERRLOG("Nport not found. Dropping\n");
+ /* increment invalid nport counter */
+ hwqp->counters.nport_invalid++;
+ } else if (rport == NULL) {
+ SPDK_ERRLOG("Rport not found. Dropping\n");
+ /* increment invalid rport counter */
+ hwqp->counters.rport_invalid++;
+ }
+ return rc;
+ }
+
+ if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED ||
+ rport->rport_state != SPDK_NVMF_FC_OBJECT_CREATED) {
+ SPDK_ERRLOG("%s state not created. Dropping\n",
+ nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED ?
+ "Nport" : "Rport");
+ return -EACCES;
+ }
+
+ if ((frame->r_ctl == FCNVME_R_CTL_LS_REQUEST) &&
+ (frame->type == FCNVME_TYPE_NVMF_DATA)) {
+ struct spdk_nvmf_fc_rq_buf_ls_request *req_buf = buffer->virt;
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Process LS NVME frame\n");
+
+ /* Use the RQ buffer for holding LS request. */
+ ls_rqst = (struct spdk_nvmf_fc_ls_rqst *)&req_buf->ls_rqst;
+
+ /* Fill in the LS request structure */
+ ls_rqst->rqstbuf.virt = (void *)&req_buf->rqst;
+ ls_rqst->rqstbuf.phys = buffer->phys +
+ offsetof(struct spdk_nvmf_fc_rq_buf_ls_request, rqst);
+ ls_rqst->rqstbuf.buf_index = buff_idx;
+ ls_rqst->rqst_len = plen;
+
+ ls_rqst->rspbuf.virt = (void *)&req_buf->resp;
+ ls_rqst->rspbuf.phys = buffer->phys +
+ offsetof(struct spdk_nvmf_fc_rq_buf_ls_request, resp);
+ ls_rqst->rsp_len = FCNVME_MAX_LS_RSP_SIZE;
+
+ ls_rqst->private_data = (void *)hwqp;
+ ls_rqst->rpi = rport->rpi;
+ ls_rqst->oxid = (uint16_t)frame->ox_id;
+ ls_rqst->oxid = from_be16(&ls_rqst->oxid);
+ ls_rqst->s_id = s_id;
+ ls_rqst->d_id = d_id;
+ ls_rqst->nport = nport;
+ ls_rqst->rport = rport;
+ ls_rqst->nvmf_tgt = g_nvmf_ftransport->transport.tgt;
+
+ ls_rqst->xchg = nvmf_fc_get_xri(hwqp);
+ if (ls_rqst->xchg) {
+ /* Handover the request to LS module */
+ nvmf_fc_handle_ls_rqst(ls_rqst);
+ } else {
+ /* No XCHG available. Add to pending list. */
+ hwqp->counters.no_xchg++;
+ TAILQ_INSERT_TAIL(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link);
+ }
+ } else if ((frame->r_ctl == FCNVME_R_CTL_CMD_REQ) &&
+ (frame->type == FCNVME_TYPE_FC_EXCHANGE)) {
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Process IO NVME frame\n");
+ rc = nvmf_fc_hwqp_handle_request(hwqp, frame, buff_idx, buffer, plen);
+ } else {
+
+ SPDK_ERRLOG("Unknown frame received. Dropping\n");
+ hwqp->counters.unknown_frame++;
+ rc = -EINVAL;
+ }
+
+ return rc;
+}
+
+void
+nvmf_fc_hwqp_process_pending_reqs(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ struct spdk_nvmf_request *req = NULL, *tmp;
+ struct spdk_nvmf_fc_request *fc_req;
+ int budget = 64;
+
+ if (!hwqp->fgroup) {
+ /* The LS queue is tied to the acceptor poll group; LS pending requests
+ * are staged and processed using hwqp->ls_pending_queue.
+ */
+ return;
+ }
+
+ STAILQ_FOREACH_SAFE(req, &hwqp->fgroup->group.pending_buf_queue, buf_link, tmp) {
+ fc_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_fc_request, req);
+ if (!nvmf_fc_request_execute(fc_req)) {
+ /* Successfully posted; remove from pending. */
+ STAILQ_REMOVE(&hwqp->fgroup->group.pending_buf_queue, req, spdk_nvmf_request, buf_link);
+ }
+
+ if (budget) {
+ budget--;
+ } else {
+ return;
+ }
+ }
+}
+
+void
+nvmf_fc_hwqp_process_pending_ls_rqsts(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst = NULL, *tmp;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ struct spdk_nvmf_fc_remote_port_info *rport = NULL;
+
+ TAILQ_FOREACH_SAFE(ls_rqst, &hwqp->ls_pending_queue, ls_pending_link, tmp) {
+ /* lookup nport and rport again - make sure they are still valid */
+ int rc = nvmf_fc_hwqp_find_nport_and_rport(hwqp, ls_rqst->d_id, &nport, ls_rqst->s_id, &rport);
+ if (rc) {
+ if (nport == NULL) {
+ SPDK_ERRLOG("Nport not found. Dropping\n");
+ /* increment invalid nport counter */
+ hwqp->counters.nport_invalid++;
+ } else if (rport == NULL) {
+ SPDK_ERRLOG("Rport not found. Dropping\n");
+ /* increment invalid rport counter */
+ hwqp->counters.rport_invalid++;
+ }
+ TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link);
+ /* Return buffer to chip */
+ nvmf_fc_rqpair_buffer_release(hwqp, ls_rqst->rqstbuf.buf_index);
+ continue;
+ }
+ if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED ||
+ rport->rport_state != SPDK_NVMF_FC_OBJECT_CREATED) {
+ SPDK_ERRLOG("%s state not created. Dropping\n",
+ nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED ?
+ "Nport" : "Rport");
+ TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link);
+ /* Return buffer to chip */
+ nvmf_fc_rqpair_buffer_release(hwqp, ls_rqst->rqstbuf.buf_index);
+ continue;
+ }
+
+ ls_rqst->xchg = nvmf_fc_get_xri(hwqp);
+ if (ls_rqst->xchg) {
+ /* Got an XCHG */
+ TAILQ_REMOVE(&hwqp->ls_pending_queue, ls_rqst, ls_pending_link);
+ /* Handover the request to LS module */
+ nvmf_fc_handle_ls_rqst(ls_rqst);
+ } else {
+ /* No more XCHGs. Stop processing. */
+ hwqp->counters.no_xchg++;
+ return;
+ }
+ }
+}
+
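+/*
+ * Transmit the response for a completed request. A full ERSP IU (with
+ * response sequence number and transferred length) is built only when
+ * nvmf_fc_send_ersp_required() says one is needed; otherwise a response
+ * without payload is sent.
+ */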
+int
+nvmf_fc_handle_rsp(struct spdk_nvmf_fc_request *fc_req)
+{
+ int rc = 0;
+ struct spdk_nvmf_request *req = &fc_req->req;
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_fc_conn *fc_conn = nvmf_fc_get_conn(qpair);
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ uint16_t ersp_len = 0;
+
+ /* set sq head value in resp */
+ rsp->sqhd = nvmf_fc_advance_conn_sqhead(qpair);
+
+ /* Increment connection responses */
+ fc_conn->rsp_count++;
+
+ if (nvmf_fc_send_ersp_required(fc_req, fc_conn->rsp_count,
+ fc_req->transfered_len)) {
+ /* Fill ERSP Len */
+ to_be16(&ersp_len, (sizeof(struct spdk_nvmf_fc_ersp_iu) /
+ sizeof(uint32_t)));
+ fc_req->ersp.ersp_len = ersp_len;
+
+ /* Fill RSN */
+ to_be32(&fc_req->ersp.response_seq_no, fc_conn->rsn);
+ fc_conn->rsn++;
+
+ /* Fill transfer length */
+ to_be32(&fc_req->ersp.transferred_data_len, fc_req->transfered_len);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Posting ERSP.\n");
+ rc = nvmf_fc_xmt_rsp(fc_req, (uint8_t *)&fc_req->ersp,
+ sizeof(struct spdk_nvmf_fc_ersp_iu));
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC, "Posting RSP.\n");
+ rc = nvmf_fc_xmt_rsp(fc_req, NULL, 0);
+ }
+
+ return rc;
+}
+
+bool
+nvmf_fc_send_ersp_required(struct spdk_nvmf_fc_request *fc_req,
+ uint32_t rsp_cnt, uint32_t xfer_len)
+{
+ struct spdk_nvmf_request *req = &fc_req->req;
+ struct spdk_nvmf_qpair *qpair = req->qpair;
+ struct spdk_nvmf_fc_conn *fc_conn = nvmf_fc_get_conn(qpair);
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+ uint16_t status = *((uint16_t *)&rsp->status);
+
+ /*
+ * Check if we need to send ERSP
+ * 1) For every N responses where N == ersp_ratio
+ * 2) Fabric commands.
+ * 3) Completion status failed or Completion dw0 or dw1 valid.
+ * 4) SQ == 90% full.
+ * 5) Transfer length not equal to CMD IU length
+ */
+
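+ /* Bit 0 of the raw completion status word is the phase tag, so it is
+ * masked off (0xFFFE) when checking for a failure status. */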
+ if (!(rsp_cnt % fc_conn->esrp_ratio) ||
+ (cmd->opc == SPDK_NVME_OPC_FABRIC) ||
+ (status & 0xFFFE) || rsp->cdw0 || rsp->rsvd1 ||
+ (req->length != xfer_len)) {
+ return true;
+ }
+ return false;
+}
+
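+/*
+ * Transport req_complete callback. Aborted requests are deferred to the
+ * poller API for cleanup. For a successful controller-to-host transfer the
+ * data is sent first (READ_XFER state); every other case sends the response
+ * immediately via nvmf_fc_handle_rsp().
+ */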
+static int
+nvmf_fc_request_complete(struct spdk_nvmf_request *req)
+{
+ int rc = 0;
+ struct spdk_nvmf_fc_request *fc_req = nvmf_fc_get_fc_req(req);
+ struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl;
+
+ if (fc_req->is_aborted) {
+ /* Defer this to make sure we don't call I/O cleanup in the same context. */
+ nvmf_fc_poller_api_func(fc_req->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE,
+ (void *)fc_req);
+ } else if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
+ req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_XFER);
+
+ rc = nvmf_fc_send_data(fc_req);
+ } else {
+ if (req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_WRITE_RSP);
+ } else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_READ_RSP);
+ } else {
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_NONE_RSP);
+ }
+
+ rc = nvmf_fc_handle_rsp(fc_req);
+ }
+
+ if (rc) {
+ SPDK_ERRLOG("Error in request complete.\n");
+ _nvmf_fc_request_free(fc_req);
+ }
+ return 0;
+}
+
+struct spdk_nvmf_tgt *
+nvmf_fc_get_tgt(void)
+{
+ if (g_nvmf_ftransport) {
+ return g_nvmf_ftransport->transport.tgt;
+ }
+ return NULL;
+}
+
+/*
+ * FC Transport Public API begins here
+ */
+
+#define SPDK_NVMF_FC_DEFAULT_MAX_QUEUE_DEPTH 128
+#define SPDK_NVMF_FC_DEFAULT_AQ_DEPTH 32
+#define SPDK_NVMF_FC_DEFAULT_MAX_QPAIRS_PER_CTRLR 5
+#define SPDK_NVMF_FC_DEFAULT_IN_CAPSULE_DATA_SIZE 0
+#define SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE 65536
+#define SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE 4096
+#define SPDK_NVMF_FC_DEFAULT_NUM_SHARED_BUFFERS 8192
+#define SPDK_NVMF_FC_DEFAULT_MAX_SGE (SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE / \
+ SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE)
+
+static void
+nvmf_fc_opts_init(struct spdk_nvmf_transport_opts *opts)
+{
+ opts->max_queue_depth = SPDK_NVMF_FC_DEFAULT_MAX_QUEUE_DEPTH;
+ opts->max_qpairs_per_ctrlr = SPDK_NVMF_FC_DEFAULT_MAX_QPAIRS_PER_CTRLR;
+ opts->in_capsule_data_size = SPDK_NVMF_FC_DEFAULT_IN_CAPSULE_DATA_SIZE;
+ opts->max_io_size = SPDK_NVMF_FC_DEFAULT_MAX_IO_SIZE;
+ opts->io_unit_size = SPDK_NVMF_FC_DEFAULT_IO_UNIT_SIZE;
+ opts->max_aq_depth = SPDK_NVMF_FC_DEFAULT_AQ_DEPTH;
+ opts->num_shared_buffers = SPDK_NVMF_FC_DEFAULT_NUM_SHARED_BUFFERS;
+}
+
+static struct spdk_nvmf_transport *
+nvmf_fc_create(struct spdk_nvmf_transport_opts *opts)
+{
+ uint32_t sge_count;
+
+ SPDK_INFOLOG(SPDK_LOG_NVMF_FC, "*** FC Transport Init ***\n"
+ " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
+ " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
+ " max_aq_depth=%d\n",
+ opts->max_queue_depth,
+ opts->max_io_size,
+ opts->max_qpairs_per_ctrlr - 1,
+ opts->io_unit_size,
+ opts->max_aq_depth);
+
+ if (g_nvmf_ftransport) {
+ SPDK_ERRLOG("Duplicate NVMF-FC transport create request!\n");
+ return NULL;
+ }
+
+ if (spdk_env_get_last_core() < 1) {
+ SPDK_ERRLOG("Not enough cores/threads (%d) to run NVMF-FC transport!\n",
+ spdk_env_get_last_core() + 1);
+ return NULL;
+ }
+
+ sge_count = opts->max_io_size / opts->io_unit_size;
+ if (sge_count > SPDK_NVMF_FC_DEFAULT_MAX_SGE) {
+ SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size);
+ return NULL;
+ }
+
+ g_nvmf_fc_master_thread = spdk_get_thread();
+ g_nvmf_fgroup_count = 0;
+ g_nvmf_ftransport = calloc(1, sizeof(*g_nvmf_ftransport));
+
+ if (!g_nvmf_ftransport) {
+ SPDK_ERRLOG("Failed to allocate NVMF-FC transport\n");
+ return NULL;
+ }
+
+ if (pthread_mutex_init(&g_nvmf_ftransport->lock, NULL)) {
+ SPDK_ERRLOG("pthread_mutex_init() failed\n");
+ free(g_nvmf_ftransport);
+ g_nvmf_ftransport = NULL;
+ return NULL;
+ }
+
+ /* initialize the low level FC driver */
+ nvmf_fc_lld_init();
+
+ return &g_nvmf_ftransport->transport;
+}
+
+static int
+nvmf_fc_destroy(struct spdk_nvmf_transport *transport)
+{
+ if (transport) {
+ struct spdk_nvmf_fc_transport *ftransport;
+ struct spdk_nvmf_fc_poll_group *fgroup, *pg_tmp;
+
+ ftransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_fc_transport, transport);
+
+ /* clean up any FC poll groups still around */
+ TAILQ_FOREACH_SAFE(fgroup, &g_nvmf_fgroups, link, pg_tmp) {
+ TAILQ_REMOVE(&g_nvmf_fgroups, fgroup, link);
+ free(fgroup);
+ }
+ g_nvmf_fgroup_count = 0;
+
+ /* low level FC driver clean up */
+ nvmf_fc_lld_fini();
+
+ nvmf_fc_port_cleanup();
+
+ /* Free the transport last and clear the global pointer so that
+ * nvmf_fc_get_tgt() cannot return a dangling reference. */
+ free(ftransport);
+ g_nvmf_ftransport = NULL;
+ }
+
+ return 0;
+}
+
+static int
+nvmf_fc_listen(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ return 0;
+}
+
+static void
+nvmf_fc_stop_listen(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *_trid)
+{
+}
+
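+/*
+ * The accept poller doubles as the LS-queue poller: the first invocation
+ * starts the low level driver, and every call processes the LS queue of
+ * each online FC port.
+ */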
+static uint32_t
+nvmf_fc_accept(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ uint32_t count = 0;
+ static bool start_lld = false;
+
+ if (spdk_unlikely(!start_lld)) {
+ start_lld = true;
+ nvmf_fc_lld_start();
+ }
+
+ /* poll the LS queue on each port */
+ TAILQ_FOREACH(fc_port, &g_spdk_nvmf_fc_port_list, link) {
+ if (fc_port->hw_port_status == SPDK_FC_PORT_ONLINE) {
+ count += nvmf_fc_process_queue(&fc_port->ls_queue);
+ }
+ }
+
+ return count;
+}
+
+static void
+nvmf_fc_discover(struct spdk_nvmf_transport *transport,
+ struct spdk_nvme_transport_id *trid,
+ struct spdk_nvmf_discovery_log_page_entry *entry)
+{
+ entry->trtype = (enum spdk_nvme_transport_type) SPDK_NVMF_TRTYPE_FC;
+ entry->adrfam = trid->adrfam;
+ entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED;
+
+ spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' ');
+ spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' ');
+}
+
+static struct spdk_nvmf_transport_poll_group *
+nvmf_fc_poll_group_create(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_fc_poll_group *fgroup;
+ struct spdk_nvmf_fc_transport *ftransport =
+ SPDK_CONTAINEROF(transport, struct spdk_nvmf_fc_transport, transport);
+
+ fgroup = calloc(1, sizeof(struct spdk_nvmf_fc_poll_group));
+ if (!fgroup) {
+ SPDK_ERRLOG("Unable to alloc FC poll group\n");
+ return NULL;
+ }
+
+ TAILQ_INIT(&fgroup->hwqp_list);
+
+ pthread_mutex_lock(&ftransport->lock);
+ TAILQ_INSERT_TAIL(&g_nvmf_fgroups, fgroup, link);
+ g_nvmf_fgroup_count++;
+ pthread_mutex_unlock(&ftransport->lock);
+
+ return &fgroup->group;
+}
+
+static void
+nvmf_fc_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
+{
+ struct spdk_nvmf_fc_poll_group *fgroup;
+ struct spdk_nvmf_fc_transport *ftransport =
+ SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_fc_transport, transport);
+
+ fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group);
+ pthread_mutex_lock(&ftransport->lock);
+ TAILQ_REMOVE(&g_nvmf_fgroups, fgroup, link);
+ g_nvmf_fgroup_count--;
+ pthread_mutex_unlock(&ftransport->lock);
+
+ free(fgroup);
+}
+
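+/*
+ * Bind a new qpair (FC connection) to one of the HWQPs already attached to
+ * this poll group. The chosen HWQP must belong to the same FC port as the
+ * connection's target nport; the poller is then notified asynchronously via
+ * SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION.
+ */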
+static int
+nvmf_fc_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_fc_poll_group *fgroup;
+ struct spdk_nvmf_fc_conn *fc_conn;
+ struct spdk_nvmf_fc_hwqp *hwqp = NULL;
+ struct spdk_nvmf_fc_ls_add_conn_api_data *api_data = NULL;
+ bool hwqp_found = false;
+
+ fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group);
+ fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair);
+
+ TAILQ_FOREACH(hwqp, &fgroup->hwqp_list, link) {
+ if (fc_conn->fc_assoc->tgtport->fc_port == hwqp->fc_port) {
+ hwqp_found = true;
+ break;
+ }
+ }
+
+ if (!hwqp_found) {
+ SPDK_ERRLOG("No valid hwqp found for new QP.\n");
+ goto err;
+ }
+
+ if (!nvmf_fc_assign_conn_to_hwqp(hwqp,
+ &fc_conn->conn_id,
+ fc_conn->max_queue_depth)) {
+ SPDK_ERRLOG("Failed to get a connection id for new QP.\n");
+ goto err;
+ }
+
+ fc_conn->hwqp = hwqp;
+
+ /* If this is for ADMIN connection, then update assoc ID. */
+ if (fc_conn->qpair.qid == 0) {
+ fc_conn->fc_assoc->assoc_id = fc_conn->conn_id;
+ }
+
+ api_data = &fc_conn->create_opd->u.add_conn;
+ nvmf_fc_poller_api_func(hwqp, SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION, &api_data->args);
+ return 0;
+err:
+ return -1;
+}
+
+static int
+nvmf_fc_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
+{
+ uint32_t count = 0;
+ struct spdk_nvmf_fc_poll_group *fgroup;
+ struct spdk_nvmf_fc_hwqp *hwqp;
+
+ fgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_fc_poll_group, group);
+
+ TAILQ_FOREACH(hwqp, &fgroup->hwqp_list, link) {
+ if (hwqp->state == SPDK_FC_HWQP_ONLINE) {
+ count += nvmf_fc_process_queue(hwqp);
+ }
+ }
+
+ return (int) count;
+}
+
+static int
+nvmf_fc_request_free(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_fc_request *fc_req = nvmf_fc_get_fc_req(req);
+
+ if (!fc_req->is_aborted) {
+ nvmf_fc_request_set_state(fc_req, SPDK_NVMF_FC_REQ_BDEV_ABORTED);
+ nvmf_fc_request_abort(fc_req, true, NULL, NULL);
+ } else {
+ nvmf_fc_request_abort_complete(fc_req);
+ }
+ return 0;
+}
+
+
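+/*
+ * qpair_fini callback. A connection whose creation failed is cleaned up on
+ * the master thread; closing an admin connection additionally triggers
+ * deletion of the whole association.
+ */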
+static void
+nvmf_fc_close_qpair(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair);
+
+ if (fc_conn->conn_id == NVMF_FC_INVALID_CONN_ID) {
+ /* QP creation failure in the FC transport. Clean up. */
+ spdk_thread_send_msg(nvmf_fc_get_master_thread(),
+ nvmf_fc_handle_connection_failure, fc_conn);
+ } else if (fc_conn->fc_assoc->assoc_id == fc_conn->conn_id &&
+ fc_conn->fc_assoc->assoc_state != SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) {
+ /* Admin connection */
+ spdk_thread_send_msg(nvmf_fc_get_master_thread(),
+ nvmf_fc_handle_assoc_deletion, fc_conn);
+ }
+}
+
+static int
+nvmf_fc_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair);
+ memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id));
+ return 0;
+}
+
+static int
+nvmf_fc_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair);
+ memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id));
+ return 0;
+}
+
+static int
+nvmf_fc_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ fc_conn = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_fc_conn, qpair);
+ memcpy(trid, &fc_conn->trid, sizeof(struct spdk_nvme_transport_id));
+ return 0;
+}
+
+static void
+nvmf_fc_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvmf_request *req)
+{
+ spdk_nvmf_request_complete(req);
+}
+
+const struct spdk_nvmf_transport_ops spdk_nvmf_transport_fc = {
+ .name = "FC",
+ .type = (enum spdk_nvme_transport_type) SPDK_NVMF_TRTYPE_FC,
+ .opts_init = nvmf_fc_opts_init,
+ .create = nvmf_fc_create,
+ .destroy = nvmf_fc_destroy,
+
+ .listen = nvmf_fc_listen,
+ .stop_listen = nvmf_fc_stop_listen,
+ .accept = nvmf_fc_accept,
+
+ .listener_discover = nvmf_fc_discover,
+
+ .poll_group_create = nvmf_fc_poll_group_create,
+ .poll_group_destroy = nvmf_fc_poll_group_destroy,
+ .poll_group_add = nvmf_fc_poll_group_add,
+ .poll_group_poll = nvmf_fc_poll_group_poll,
+
+ .req_complete = nvmf_fc_request_complete,
+ .req_free = nvmf_fc_request_free,
+ .qpair_fini = nvmf_fc_close_qpair,
+ .qpair_get_peer_trid = nvmf_fc_qpair_get_peer_trid,
+ .qpair_get_local_trid = nvmf_fc_qpair_get_local_trid,
+ .qpair_get_listen_trid = nvmf_fc_qpair_get_listen_trid,
+ .qpair_abort_request = nvmf_fc_qpair_abort_request,
+};
+
+/*
+ * Re-initialize the FC-Port after an offline event.
+ * Only the queue information needs to be repopulated; XCHG, lcore and other hwqp information
+ * remains unchanged after the first initialization.
+ */
+static int
+nvmf_fc_adm_hw_port_reinit_validate(struct spdk_nvmf_fc_port *fc_port,
+ struct spdk_nvmf_fc_hw_port_init_args *args)
+{
+ uint32_t i;
+
+ /* Verify that the port was previously in offline or quiesced state */
+ if (nvmf_fc_port_is_online(fc_port)) {
+ SPDK_ERRLOG("SPDK FC port %d already initialized and online.\n", args->port_handle);
+ return -EINVAL;
+ }
+
+ /* Reinit information in new LS queue from previous queue */
+ nvmf_fc_hwqp_reinit_poller_queues(&fc_port->ls_queue, args->ls_queue);
+
+ fc_port->fcp_rq_id = args->fcp_rq_id;
+
+ /* Initialize the LS queue */
+ fc_port->ls_queue.queues = args->ls_queue;
+ nvmf_fc_init_poller_queues(&fc_port->ls_queue);
+
+ for (i = 0; i < fc_port->num_io_queues; i++) {
+ /* Reinit information in new IO queue from previous queue */
+ nvmf_fc_hwqp_reinit_poller_queues(&fc_port->io_queues[i],
+ args->io_queues[i]);
+ fc_port->io_queues[i].queues = args->io_queues[i];
+ /* Initialize the IO queues */
+ nvmf_fc_init_poller_queues(&fc_port->io_queues[i]);
+ }
+
+ fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE;
+
+ /* Validate the port information */
+ DEV_VERIFY(TAILQ_EMPTY(&fc_port->nport_list));
+ DEV_VERIFY(fc_port->num_nports == 0);
+ if (!TAILQ_EMPTY(&fc_port->nport_list) || (fc_port->num_nports != 0)) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Initializes the data for the creation of an FC port object in the SPDK
+ * library. The spdk_nvmf_fc_port is a well-defined structure that is part of
+ * the API to the library. The contents added to this well-defined structure
+ * are private to each vendor's implementation.
+ */
+static int
+nvmf_fc_adm_hw_port_data_init(struct spdk_nvmf_fc_port *fc_port,
+ struct spdk_nvmf_fc_hw_port_init_args *args)
+{
+	/* Use a high hwqp_id for the LS HWQP (set below) so that it does not clash
+	 * with the IO HWQPs and is immediately recognizable as an LS queue during tracing.
+	 */
+ uint32_t i;
+
+ fc_port->port_hdl = args->port_handle;
+ fc_port->hw_port_status = SPDK_FC_PORT_OFFLINE;
+ fc_port->fcp_rq_id = args->fcp_rq_id;
+ fc_port->num_io_queues = args->io_queue_cnt;
+
+ /*
+ * Set port context from init args. Used for FCP port stats.
+ */
+ fc_port->port_ctx = args->port_ctx;
+
+	/*
+	 * Set up the LS queue parameters.
+	 */
+ fc_port->ls_queue.queues = args->ls_queue;
+ fc_port->ls_queue.thread = nvmf_fc_get_master_thread();
+ fc_port->ls_queue.hwqp_id = SPDK_MAX_NUM_OF_FC_PORTS * fc_port->num_io_queues;
+
+ /*
+ * Initialize the LS queue.
+ */
+ nvmf_fc_init_hwqp(fc_port, &fc_port->ls_queue);
+
+ /*
+ * Initialize the IO queues.
+ */
+ for (i = 0; i < args->io_queue_cnt; i++) {
+ struct spdk_nvmf_fc_hwqp *hwqp = &fc_port->io_queues[i];
+ hwqp->hwqp_id = i;
+ hwqp->queues = args->io_queues[i];
+ hwqp->rq_size = args->io_queue_size;
+ nvmf_fc_init_hwqp(fc_port, hwqp);
+ }
+
+ /*
+ * Initialize the LS processing for port
+ */
+ nvmf_fc_ls_init(fc_port);
+
+ /*
+ * Initialize the list of nport on this HW port.
+ */
+ TAILQ_INIT(&fc_port->nport_list);
+ fc_port->num_nports = 0;
+
+ return 0;
+}
+
+static void
+nvmf_fc_adm_port_hwqp_offline_del_poller(struct spdk_nvmf_fc_port *fc_port)
+{
+ struct spdk_nvmf_fc_hwqp *hwqp = NULL;
+ int i = 0;
+
+ hwqp = &fc_port->ls_queue;
+ (void)nvmf_fc_hwqp_set_offline(hwqp);
+
+ /* Remove poller for all the io queues. */
+ for (i = 0; i < (int)fc_port->num_io_queues; i++) {
+ hwqp = &fc_port->io_queues[i];
+ (void)nvmf_fc_hwqp_set_offline(hwqp);
+ nvmf_fc_poll_group_remove_hwqp(hwqp);
+ }
+}
+
+/*
+ * Callback function for the HW port link break operation.
+ *
+ * Note that this callback is triggered when spdk_fc_nport_delete()
+ * completes, if that spdk_fc_nport_delete() call was issued by
+ * nvmf_fc_adm_evnt_hw_port_link_break().
+ *
+ * Since nvmf_fc_adm_evnt_hw_port_link_break() can invoke spdk_fc_nport_delete() multiple
+ * times (one per nport in the HW port's nport_list), a single call to
+ * nvmf_fc_adm_evnt_hw_port_link_break() can result in multiple calls to this callback function.
+ *
+ * As a result, this function invokes the callback to the caller of
+ * nvmf_fc_adm_evnt_hw_port_link_break() only when the HW port's nport_list is empty.
+ */
+static void
+nvmf_fc_adm_hw_port_link_break_cb(uint8_t port_handle,
+ enum spdk_fc_event event_type, void *cb_args, int spdk_err)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_port_link_break_cb_data *offline_cb_args = cb_args;
+ struct spdk_nvmf_hw_port_link_break_args *offline_args = NULL;
+ spdk_nvmf_fc_callback cb_func = NULL;
+ int err = 0;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ int num_nports = 0;
+ char log_str[256];
+
+ if (0 != spdk_err) {
+ DEV_VERIFY(!"port link break cb: spdk_err not success.");
+ SPDK_ERRLOG("port link break cb: spdk_err:%d.\n", spdk_err);
+ goto out;
+ }
+
+ if (!offline_cb_args) {
+ DEV_VERIFY(!"port link break cb: port_offline_args is NULL.");
+ err = -EINVAL;
+ goto out;
+ }
+
+ offline_args = offline_cb_args->args;
+ if (!offline_args) {
+ DEV_VERIFY(!"port link break cb: offline_args is NULL.");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (port_handle != offline_args->port_handle) {
+ DEV_VERIFY(!"port link break cb: port_handle mismatch.");
+ err = -EINVAL;
+ goto out;
+ }
+
+ cb_func = offline_cb_args->cb_func;
+ if (!cb_func) {
+ DEV_VERIFY(!"port link break cb: cb_func is NULL.");
+ err = -EINVAL;
+ goto out;
+ }
+
+ fc_port = nvmf_fc_port_lookup(port_handle);
+ if (!fc_port) {
+ DEV_VERIFY(!"port link break cb: fc_port is NULL.");
+ SPDK_ERRLOG("port link break cb: Unable to find port:%d\n",
+ offline_args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ num_nports = fc_port->num_nports;
+ if (!TAILQ_EMPTY(&fc_port->nport_list)) {
+ /*
+ * Don't call the callback unless all nports have been deleted.
+ */
+ goto out;
+ }
+
+ if (num_nports != 0) {
+		DEV_VERIFY(!"port link break cb: num_nports is non-zero.");
+ SPDK_ERRLOG("port link break cb: # of ports should be 0. Instead, num_nports:%d\n",
+ num_nports);
+ err = -EINVAL;
+ }
+
+ /*
+ * Mark the hwqps as offline and unregister the pollers.
+ */
+ (void)nvmf_fc_adm_port_hwqp_offline_del_poller(fc_port);
+
+ /*
+ * Since there are no more nports, execute the callback(s).
+ */
+ (void)cb_func(port_handle, SPDK_FC_LINK_BREAK,
+ (void *)offline_args->cb_ctx, spdk_err);
+
+out:
+ free(offline_cb_args);
+
+ snprintf(log_str, sizeof(log_str),
+ "port link break cb: port:%d evt_type:%d num_nports:%d err:%d spdk_err:%d.\n",
+ port_handle, event_type, num_nports, err, spdk_err);
+
+ if (err != 0) {
+ SPDK_ERRLOG("%s", log_str);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ }
+ return;
+}
+
+/*
+ * FC port must have all its nports deleted before transitioning to offline state.
+ */
+static void
+nvmf_fc_adm_hw_port_offline_nport_delete(struct spdk_nvmf_fc_port *fc_port)
+{
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ /* All nports must have been deleted at this point for this fc port */
+ DEV_VERIFY(fc_port && TAILQ_EMPTY(&fc_port->nport_list));
+ DEV_VERIFY(fc_port->num_nports == 0);
+	/* If any nports unexpectedly remain, mark their state as zombie */
+ if (fc_port && !TAILQ_EMPTY(&fc_port->nport_list)) {
+ TAILQ_FOREACH(nport, &fc_port->nport_list, link) {
+ (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_ZOMBIE);
+ }
+ }
+}
+
+static void
+nvmf_fc_adm_i_t_delete_cb(void *args, uint32_t err)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_i_t_del_cb_data *cb_data = args;
+ struct spdk_nvmf_fc_nport *nport = cb_data->nport;
+ struct spdk_nvmf_fc_remote_port_info *rport = cb_data->rport;
+ spdk_nvmf_fc_callback cb_func = cb_data->fc_cb_func;
+ int spdk_err = 0;
+ uint8_t port_handle = cb_data->port_handle;
+ uint32_t s_id = rport->s_id;
+ uint32_t rpi = rport->rpi;
+ uint32_t assoc_count = rport->assoc_count;
+ uint32_t nport_hdl = nport->nport_hdl;
+ uint32_t d_id = nport->d_id;
+ char log_str[256];
+
+ /*
+ * Assert on any delete failure.
+ */
+ if (0 != err) {
+ DEV_VERIFY(!"Error in IT Delete callback.");
+ goto out;
+ }
+
+ if (cb_func != NULL) {
+ (void)cb_func(port_handle, SPDK_FC_IT_DELETE, cb_data->fc_cb_ctx, spdk_err);
+ }
+
+out:
+ free(cb_data);
+
+ snprintf(log_str, sizeof(log_str),
+ "IT delete assoc_cb on nport %d done, port_handle:%d s_id:%d d_id:%d rpi:%d rport_assoc_count:%d rc = %d.\n",
+ nport_hdl, port_handle, s_id, d_id, rpi, assoc_count, err);
+
+ if (err != 0) {
+ SPDK_ERRLOG("%s", log_str);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ }
+}
+
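+/*
+ * Callback invoked once for each association deleted on behalf of an
+ * I_T nexus delete. When the remote port's association count reaches zero,
+ * the rport is removed from the nport, the callback supplied to
+ * nvmf_fc_adm_i_t_delete_assoc() is invoked, and the rport and callback
+ * data are freed.
+ */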
+static void
+nvmf_fc_adm_i_t_delete_assoc_cb(void *args, uint32_t err)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data *cb_data = args;
+ struct spdk_nvmf_fc_nport *nport = cb_data->nport;
+ struct spdk_nvmf_fc_remote_port_info *rport = cb_data->rport;
+ spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func = cb_data->cb_func;
+ uint32_t s_id = rport->s_id;
+ uint32_t rpi = rport->rpi;
+ uint32_t assoc_count = rport->assoc_count;
+ uint32_t nport_hdl = nport->nport_hdl;
+ uint32_t d_id = nport->d_id;
+ char log_str[256];
+
+ /*
+ * Assert on any association delete failure. We continue to delete other
+ * associations in promoted builds.
+ */
+ if (0 != err) {
+ DEV_VERIFY(!"Nport's association delete callback returned error");
+ if (nport->assoc_count > 0) {
+ nport->assoc_count--;
+ }
+ if (rport->assoc_count > 0) {
+ rport->assoc_count--;
+ }
+ }
+
+ /*
+ * If this is the last association being deleted for the ITN,
+ * execute the callback(s).
+ */
+ if (0 == rport->assoc_count) {
+ /* Remove the rport from the remote port list. */
+ if (nvmf_fc_nport_remove_rem_port(nport, rport) != 0) {
+ SPDK_ERRLOG("Error while removing rport from list.\n");
+ DEV_VERIFY(!"Error while removing rport from list.");
+ }
+
+ if (cb_func != NULL) {
+ /*
+ * Callback function is provided by the caller
+ * of nvmf_fc_adm_i_t_delete_assoc().
+ */
+ (void)cb_func(cb_data->cb_ctx, 0);
+ }
+ free(rport);
+ free(args);
+ }
+
+ snprintf(log_str, sizeof(log_str),
+ "IT delete assoc_cb on nport %d done, s_id:%d d_id:%d rpi:%d rport_assoc_count:%d err = %d.\n",
+ nport_hdl, s_id, d_id, rpi, assoc_count, err);
+
+ if (err != 0) {
+ SPDK_ERRLOG("%s", log_str);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ }
+}
+
+/**
+ * Process an IT delete: schedule deletion of all associations on this
+ * nport that belong to the given I_T nexus (remote port).
+ */
+static void
+nvmf_fc_adm_i_t_delete_assoc(struct spdk_nvmf_fc_nport *nport,
+ struct spdk_nvmf_fc_remote_port_info *rport,
+ spdk_nvmf_fc_adm_i_t_delete_assoc_cb_fn cb_func,
+ void *cb_ctx)
+{
+ int err = 0;
+ struct spdk_nvmf_fc_association *assoc = NULL;
+ int assoc_err = 0;
+ uint32_t num_assoc = 0;
+ uint32_t num_assoc_del_scheduled = 0;
+ struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data *cb_data = NULL;
+ uint8_t port_hdl = nport->port_hdl;
+ uint32_t s_id = rport->s_id;
+ uint32_t rpi = rport->rpi;
+ uint32_t assoc_count = rport->assoc_count;
+ char log_str[256];
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "IT delete associations on nport:%d begin.\n",
+ nport->nport_hdl);
+
+ /*
+ * Allocate memory for callback data.
+ * This memory will be freed by the callback function.
+ */
+ cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_i_t_del_assoc_cb_data));
+ if (NULL == cb_data) {
+ SPDK_ERRLOG("Failed to allocate memory for cb_data on nport:%d.\n", nport->nport_hdl);
+ err = -ENOMEM;
+ goto out;
+ }
+ cb_data->nport = nport;
+ cb_data->rport = rport;
+ cb_data->port_handle = port_hdl;
+ cb_data->cb_func = cb_func;
+ cb_data->cb_ctx = cb_ctx;
+
+ /*
+ * Delete all associations, if any, related with this ITN/remote_port.
+ */
+ TAILQ_FOREACH(assoc, &nport->fc_associations, link) {
+ num_assoc++;
+ if (assoc->s_id == s_id) {
+ assoc_err = nvmf_fc_delete_association(nport,
+ assoc->assoc_id,
+ false /* send abts */, false,
+ nvmf_fc_adm_i_t_delete_assoc_cb, cb_data);
+ if (0 != assoc_err) {
+ /*
+ * Mark this association as zombie.
+ */
+ err = -EINVAL;
+ DEV_VERIFY(!"Error while deleting association");
+ (void)nvmf_fc_assoc_set_state(assoc, SPDK_NVMF_FC_OBJECT_ZOMBIE);
+ } else {
+ num_assoc_del_scheduled++;
+ }
+ }
+ }
+
+out:
+ if ((cb_data) && (num_assoc_del_scheduled == 0)) {
+ /*
+ * Since there are no association_delete calls
+ * successfully scheduled, the association_delete
+ * callback function will never be called.
+ * In this case, call the callback function now.
+ */
+ nvmf_fc_adm_i_t_delete_assoc_cb(cb_data, 0);
+ }
+
+ snprintf(log_str, sizeof(log_str),
+ "IT delete associations on nport:%d end. "
+ "s_id:%d rpi:%d assoc_count:%d assoc:%d assoc_del_scheduled:%d rc:%d.\n",
+ nport->nport_hdl, s_id, rpi, assoc_count, num_assoc, num_assoc_del_scheduled, err);
+
+ if (err == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ } else {
+ SPDK_ERRLOG("%s", log_str);
+ }
+}
+
+static void
+nvmf_fc_adm_queue_quiesce_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_poller_api_quiesce_queue_args *quiesce_api_data = NULL;
+ struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *port_quiesce_ctx = NULL;
+ struct spdk_nvmf_fc_hwqp *hwqp = NULL;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ int err = 0;
+
+ quiesce_api_data = (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *)cb_data;
+ hwqp = quiesce_api_data->hwqp;
+ fc_port = hwqp->fc_port;
+ port_quiesce_ctx = (struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *)quiesce_api_data->ctx;
+ spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func = port_quiesce_ctx->cb_func;
+
+ /*
+ * Decrement the callback/quiesced queue count.
+ */
+ port_quiesce_ctx->quiesce_count--;
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Queue%d Quiesced\n", quiesce_api_data->hwqp->hwqp_id);
+
+ free(quiesce_api_data);
+	/*
+	 * Wait for the remaining callbacks, i.e. one per IO queue plus the LS queue.
+	 */
+ if (port_quiesce_ctx->quiesce_count > 0) {
+ return;
+ }
+
+ if (fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) {
+ SPDK_ERRLOG("Port %d already in quiesced state.\n", fc_port->port_hdl);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d quiesced.\n", fc_port->port_hdl);
+ fc_port->hw_port_status = SPDK_FC_PORT_QUIESCED;
+ }
+
+ if (cb_func) {
+		/*
+		 * Invoke the callback provided by the caller of the quiesce.
+		 */
+ cb_func(port_quiesce_ctx->ctx, err);
+ }
+
+ /*
+ * Free the context structure.
+ */
+ free(port_quiesce_ctx);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d quiesce done, rc = %d.\n", fc_port->port_hdl,
+ err);
+}
+
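+/*
+ * Issue a quiesce request for a single hwqp through the poller API.
+ * The supplied cb_func runs once the queue is quiesced; the poller args
+ * are freed here only if the request cannot be enqueued.
+ */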
+static int
+nvmf_fc_adm_hw_queue_quiesce(struct spdk_nvmf_fc_hwqp *fc_hwqp, void *ctx,
+ spdk_nvmf_fc_poller_api_cb cb_func)
+{
+ struct spdk_nvmf_fc_poller_api_quiesce_queue_args *args;
+ enum spdk_nvmf_fc_poller_api_ret rc = SPDK_NVMF_FC_POLLER_API_SUCCESS;
+ int err = 0;
+
+ args = calloc(1, sizeof(struct spdk_nvmf_fc_poller_api_quiesce_queue_args));
+
+ if (args == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("Failed to allocate memory for poller quiesce args, hwqp:%d\n", fc_hwqp->hwqp_id);
+ goto done;
+ }
+ args->hwqp = fc_hwqp;
+ args->ctx = ctx;
+ args->cb_info.cb_func = cb_func;
+ args->cb_info.cb_data = args;
+ args->cb_info.cb_thread = spdk_get_thread();
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Quiesce queue %d\n", fc_hwqp->hwqp_id);
+ rc = nvmf_fc_poller_api_func(fc_hwqp, SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE, args);
+ if (rc) {
+ free(args);
+ err = -EINVAL;
+ }
+
+done:
+ return err;
+}
+
+/*
+ * HW port quiesce.
+ */
+static int
+nvmf_fc_adm_hw_port_quiesce(struct spdk_nvmf_fc_port *fc_port, void *ctx,
+ spdk_nvmf_fc_adm_hw_port_quiesce_cb_fn cb_func)
+{
+ struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx *port_quiesce_ctx = NULL;
+ uint32_t i = 0;
+ int err = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port:%d is being quiesced.\n", fc_port->port_hdl);
+
+ /*
+ * If the port is in an OFFLINE state, set the state to QUIESCED
+ * and execute the callback.
+ */
+ if (fc_port->hw_port_status == SPDK_FC_PORT_OFFLINE) {
+ fc_port->hw_port_status = SPDK_FC_PORT_QUIESCED;
+ }
+
+ if (fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Port %d already in quiesced state.\n",
+ fc_port->port_hdl);
+ /*
+ * Execute the callback function directly.
+ */
+ cb_func(ctx, err);
+ goto out;
+ }
+
+ port_quiesce_ctx = calloc(1, sizeof(struct spdk_nvmf_fc_adm_hw_port_quiesce_ctx));
+
+ if (port_quiesce_ctx == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("Failed to allocate memory for LS queue quiesce ctx, port:%d\n",
+ fc_port->port_hdl);
+ goto out;
+ }
+
+ port_quiesce_ctx->quiesce_count = 0;
+ port_quiesce_ctx->ctx = ctx;
+ port_quiesce_ctx->cb_func = cb_func;
+
+ /*
+ * Quiesce the LS queue.
+ */
+ err = nvmf_fc_adm_hw_queue_quiesce(&fc_port->ls_queue, port_quiesce_ctx,
+ nvmf_fc_adm_queue_quiesce_cb);
+ if (err != 0) {
+ SPDK_ERRLOG("Failed to quiesce the LS queue.\n");
+ goto out;
+ }
+ port_quiesce_ctx->quiesce_count++;
+
+ /*
+ * Quiesce the IO queues.
+ */
+ for (i = 0; i < fc_port->num_io_queues; i++) {
+ err = nvmf_fc_adm_hw_queue_quiesce(&fc_port->io_queues[i],
+ port_quiesce_ctx,
+ nvmf_fc_adm_queue_quiesce_cb);
+ if (err != 0) {
+ DEV_VERIFY(0);
+ SPDK_ERRLOG("Failed to quiesce the IO queue:%d.\n", fc_port->io_queues[i].hwqp_id);
+ }
+ port_quiesce_ctx->quiesce_count++;
+ }
+
+out:
+ if (port_quiesce_ctx && err != 0) {
+ free(port_quiesce_ctx);
+ }
+ return err;
+}
+
+/*
+ * Initialize and add a HW port entry to the global
+ * HW port list.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_init(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_hw_port_init_args *args = (struct spdk_nvmf_fc_hw_port_init_args *)
+ api_data->api_args;
+ int err = 0;
+
+ if (args->io_queue_cnt > spdk_env_get_core_count()) {
+ SPDK_ERRLOG("IO queues count greater than cores for %d.\n", args->port_handle);
+		err = -EINVAL;
+ goto abort_port_init;
+ }
+
+ /*
+ * 1. Check for duplicate initialization.
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (fc_port != NULL) {
+ /* Port already exists, check if it has to be re-initialized */
+ err = nvmf_fc_adm_hw_port_reinit_validate(fc_port, args);
+ if (err) {
+ /*
+ * In case of an error we do not want to free the fc_port
+ * so we set that pointer to NULL.
+ */
+ fc_port = NULL;
+ }
+ goto abort_port_init;
+ }
+
+ /*
+ * 2. Get the memory to instantiate a fc port.
+ */
+ fc_port = calloc(1, sizeof(struct spdk_nvmf_fc_port) +
+ (args->io_queue_cnt * sizeof(struct spdk_nvmf_fc_hwqp)));
+ if (fc_port == NULL) {
+ SPDK_ERRLOG("Failed to allocate memory for fc_port %d.\n", args->port_handle);
+ err = -ENOMEM;
+ goto abort_port_init;
+ }
+
+ /* assign the io_queues array */
+ fc_port->io_queues = (struct spdk_nvmf_fc_hwqp *)((uint8_t *)fc_port + sizeof(
+ struct spdk_nvmf_fc_port));
+
+ /*
+ * 3. Initialize the contents for the FC-port
+ */
+ err = nvmf_fc_adm_hw_port_data_init(fc_port, args);
+
+ if (err != 0) {
+ SPDK_ERRLOG("Data initialization failed for fc_port %d.\n", args->port_handle);
+ DEV_VERIFY(!"Data initialization failed for fc_port");
+ goto abort_port_init;
+ }
+
+ /*
+ * 4. Add this port to the global fc port list in the library.
+ */
+ nvmf_fc_port_add(fc_port);
+
+abort_port_init:
+ if (err && fc_port) {
+ free(fc_port);
+ }
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_INIT, args->cb_ctx, err);
+ }
+
+ free(arg);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d initialize done, rc = %d.\n",
+ args->port_handle, err);
+}
+
+/*
+ * Online a HW port.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_online(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ struct spdk_nvmf_fc_hwqp *hwqp = NULL;
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_hw_port_online_args *args = (struct spdk_nvmf_fc_hw_port_online_args *)
+ api_data->api_args;
+ int i = 0;
+ int err = 0;
+
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (fc_port) {
+ /* Set the port state to online */
+ err = nvmf_fc_port_set_online(fc_port);
+ if (err != 0) {
+ SPDK_ERRLOG("Hw port %d online failed. err = %d\n", fc_port->port_hdl, err);
+ DEV_VERIFY(!"Hw port online failed");
+ goto out;
+ }
+
+ hwqp = &fc_port->ls_queue;
+ hwqp->context = NULL;
+ (void)nvmf_fc_hwqp_set_online(hwqp);
+
+ /* Cycle through all the io queues and setup a hwqp poller for each. */
+ for (i = 0; i < (int)fc_port->num_io_queues; i++) {
+ hwqp = &fc_port->io_queues[i];
+ hwqp->context = NULL;
+ (void)nvmf_fc_hwqp_set_online(hwqp);
+ nvmf_fc_poll_group_add_hwqp(hwqp);
+ }
+ } else {
+ SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle);
+ err = -EINVAL;
+ }
+
+out:
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_ONLINE, args->cb_ctx, err);
+ }
+
+ free(arg);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d online done, rc = %d.\n", args->port_handle,
+ err);
+}
+
+/*
+ * Offline a HW port.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_offline(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ struct spdk_nvmf_fc_hwqp *hwqp = NULL;
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_hw_port_offline_args *args = (struct spdk_nvmf_fc_hw_port_offline_args *)
+ api_data->api_args;
+ int i = 0;
+ int err = 0;
+
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (fc_port) {
+ /* Set the port state to offline, if it is not already. */
+ err = nvmf_fc_port_set_offline(fc_port);
+ if (err != 0) {
+ SPDK_ERRLOG("Hw port %d already offline. err = %d\n", fc_port->port_hdl, err);
+ err = 0;
+ goto out;
+ }
+
+ hwqp = &fc_port->ls_queue;
+ (void)nvmf_fc_hwqp_set_offline(hwqp);
+
+ /* Remove poller for all the io queues. */
+ for (i = 0; i < (int)fc_port->num_io_queues; i++) {
+ hwqp = &fc_port->io_queues[i];
+ (void)nvmf_fc_hwqp_set_offline(hwqp);
+ nvmf_fc_poll_group_remove_hwqp(hwqp);
+ }
+
+ /*
+ * Delete all the nports. Ideally, the nports should have been purged
+ * before the offline event, in which case, only a validation is required.
+ */
+ nvmf_fc_adm_hw_port_offline_nport_delete(fc_port);
+ } else {
+ SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle);
+ err = -EINVAL;
+ }
+out:
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_OFFLINE, args->cb_ctx, err);
+ }
+
+ free(arg);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d offline done, rc = %d.\n", args->port_handle,
+ err);
+}
+
+struct nvmf_fc_add_rem_listener_ctx {
+ struct spdk_nvmf_subsystem *subsystem;
+ bool add_listener;
+ struct spdk_nvme_transport_id trid;
+};
+
+static void
+nvmf_fc_adm_subsystem_resume_cb(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct nvmf_fc_add_rem_listener_ctx *ctx = (struct nvmf_fc_add_rem_listener_ctx *)cb_arg;
+ free(ctx);
+}
+
+static void
+nvmf_fc_adm_listen_done(void *cb_arg, int status)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct nvmf_fc_add_rem_listener_ctx *ctx = cb_arg;
+
+ if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_fc_adm_subsystem_resume_cb, ctx)) {
+ SPDK_ERRLOG("Failed to resume subsystem: %s\n", ctx->subsystem->subnqn);
+ free(ctx);
+ }
+}
+
+static void
+nvmf_fc_adm_subsystem_paused_cb(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct nvmf_fc_add_rem_listener_ctx *ctx = (struct nvmf_fc_add_rem_listener_ctx *)cb_arg;
+
+ if (ctx->add_listener) {
+ spdk_nvmf_subsystem_add_listener(subsystem, &ctx->trid, nvmf_fc_adm_listen_done, ctx);
+ } else {
+ spdk_nvmf_subsystem_remove_listener(subsystem, &ctx->trid);
+ nvmf_fc_adm_listen_done(ctx, 0);
+ }
+}
+
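+/*
+ * Add or remove this nport's transport address as a listener on every
+ * subsystem that allows any listener. Each such subsystem is paused, the
+ * listener is added or removed in the paused callback, and the subsystem
+ * is resumed once the operation completes.
+ */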
+static int
+nvmf_fc_adm_add_rem_nport_listener(struct spdk_nvmf_fc_nport *nport, bool add)
+{
+ struct spdk_nvmf_tgt *tgt = nvmf_fc_get_tgt();
+ struct spdk_nvmf_subsystem *subsystem;
+
+ if (!tgt) {
+ SPDK_ERRLOG("No nvmf target defined\n");
+ return -EINVAL;
+ }
+
+ subsystem = spdk_nvmf_subsystem_get_first(tgt);
+ while (subsystem) {
+ struct nvmf_fc_add_rem_listener_ctx *ctx;
+
+ if (spdk_nvmf_subsytem_any_listener_allowed(subsystem) == true) {
+ ctx = calloc(1, sizeof(struct nvmf_fc_add_rem_listener_ctx));
+ if (ctx) {
+ ctx->add_listener = add;
+ ctx->subsystem = subsystem;
+ nvmf_fc_create_trid(&ctx->trid,
+ nport->fc_nodename.u.wwn,
+ nport->fc_portname.u.wwn);
+
+ if (spdk_nvmf_tgt_listen(subsystem->tgt, &ctx->trid)) {
+ SPDK_ERRLOG("Failed to add transport address %s to tgt listeners\n",
+ ctx->trid.traddr);
+ free(ctx);
+ } else if (spdk_nvmf_subsystem_pause(subsystem,
+ nvmf_fc_adm_subsystem_paused_cb,
+ ctx)) {
+ SPDK_ERRLOG("Failed to pause subsystem: %s\n",
+ subsystem->subnqn);
+ free(ctx);
+ }
+ }
+ }
+
+ subsystem = spdk_nvmf_subsystem_get_next(subsystem);
+ }
+
+ return 0;
+}
+
+/*
+ * Create a Nport.
+ */
+static void
+nvmf_fc_adm_evnt_nport_create(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_nport_create_args *args = (struct spdk_nvmf_fc_nport_create_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ int err = 0;
+
+ /*
+ * Get the physical port.
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (fc_port == NULL) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Check for duplicate initialization.
+ */
+ nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle);
+ if (nport != NULL) {
+ SPDK_ERRLOG("Duplicate SPDK FC nport %d exists for FC port:%d.\n", args->nport_handle,
+ args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Get the memory to instantiate a fc nport.
+ */
+ nport = calloc(1, sizeof(struct spdk_nvmf_fc_nport));
+ if (nport == NULL) {
+ SPDK_ERRLOG("Failed to allocate memory for nport %d.\n",
+ args->nport_handle);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Initialize the contents for the nport
+ */
+ nport->nport_hdl = args->nport_handle;
+ nport->port_hdl = args->port_handle;
+ nport->nport_state = SPDK_NVMF_FC_OBJECT_CREATED;
+ nport->fc_nodename = args->fc_nodename;
+ nport->fc_portname = args->fc_portname;
+ nport->d_id = args->d_id;
+ nport->fc_port = nvmf_fc_port_lookup(args->port_handle);
+
+ (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_CREATED);
+ TAILQ_INIT(&nport->rem_port_list);
+ nport->rport_count = 0;
+ TAILQ_INIT(&nport->fc_associations);
+ nport->assoc_count = 0;
+
+ /*
+ * Populate the nport address (as listening address) to the nvmf subsystems.
+ */
+ err = nvmf_fc_adm_add_rem_nport_listener(nport, true);
+
+ (void)nvmf_fc_port_add_nport(fc_port, nport);
+out:
+ if (err && nport) {
+ free(nport);
+ }
+
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_NPORT_CREATE, args->cb_ctx, err);
+ }
+
+ free(arg);
+}
+
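+/*
+ * Completion callback for the IT deletes issued during nport teardown
+ * (also called directly when the nport has no remote ports). The nport is
+ * removed from its FC port, freed, and the original caller notified only
+ * after the last remote port is gone.
+ */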
+static void
+nvmf_fc_adm_delete_nport_cb(uint8_t port_handle, enum spdk_fc_event event_type,
+ void *cb_args, int spdk_err)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_nport_del_cb_data *cb_data = cb_args;
+ struct spdk_nvmf_fc_nport *nport = cb_data->nport;
+ spdk_nvmf_fc_callback cb_func = cb_data->fc_cb_func;
+ int err = 0;
+ uint16_t nport_hdl = 0;
+ char log_str[256];
+
+ /*
+ * Assert on any delete failure.
+ */
+ if (nport == NULL) {
+ SPDK_ERRLOG("Nport delete callback returned null nport");
+ DEV_VERIFY(!"nport is null.");
+ goto out;
+ }
+
+ nport_hdl = nport->nport_hdl;
+ if (0 != spdk_err) {
+ SPDK_ERRLOG("Nport delete callback returned error. FC Port: "
+ "%d, Nport: %d\n",
+ nport->port_hdl, nport->nport_hdl);
+ DEV_VERIFY(!"nport delete callback error.");
+ }
+
+ /*
+ * Free the nport if this is the last rport being deleted and
+ * execute the callback(s).
+ */
+ if (nvmf_fc_nport_has_no_rport(nport)) {
+ if (0 != nport->assoc_count) {
+ SPDK_ERRLOG("association count != 0\n");
+ DEV_VERIFY(!"association count != 0");
+ }
+
+ err = nvmf_fc_port_remove_nport(nport->fc_port, nport);
+ if (0 != err) {
+ SPDK_ERRLOG("Nport delete callback: Failed to remove "
+ "nport from nport list. FC Port:%d Nport:%d\n",
+ nport->port_hdl, nport->nport_hdl);
+ }
+ /* Free the nport */
+ free(nport);
+
+ if (cb_func != NULL) {
+ (void)cb_func(cb_data->port_handle, SPDK_FC_NPORT_DELETE, cb_data->fc_cb_ctx, spdk_err);
+ }
+ free(cb_data);
+ }
+out:
+ snprintf(log_str, sizeof(log_str),
+ "port:%d nport:%d delete cb exit, evt_type:%d rc:%d.\n",
+ port_handle, nport_hdl, event_type, spdk_err);
+
+ if (err != 0) {
+ SPDK_ERRLOG("%s", log_str);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ }
+}
+
+/*
+ * Delete Nport.
+ */
+static void
+nvmf_fc_adm_evnt_nport_delete(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_nport_delete_args *args = (struct spdk_nvmf_fc_nport_delete_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ struct spdk_nvmf_fc_adm_nport_del_cb_data *cb_data = NULL;
+ struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL;
+ int err = 0;
+ uint32_t rport_cnt = 0;
+ int rc = 0;
+
+ /*
+ * Make sure that the nport exists.
+ */
+ nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle);
+ if (nport == NULL) {
+ SPDK_ERRLOG("Unable to find the SPDK FC nport %d for FC Port: %d.\n", args->nport_handle,
+ args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Allocate memory for callback data.
+ */
+ cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_nport_del_cb_data));
+ if (NULL == cb_data) {
+ SPDK_ERRLOG("Failed to allocate memory for cb_data %d.\n", args->nport_handle);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ cb_data->nport = nport;
+ cb_data->port_handle = args->port_handle;
+ cb_data->fc_cb_func = api_data->cb_func;
+ cb_data->fc_cb_ctx = args->cb_ctx;
+
+	/*
+	 * Begin nport teardown.
+	 */
+ if (nport->nport_state == SPDK_NVMF_FC_OBJECT_CREATED) {
+ (void)nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_TO_BE_DELETED);
+ } else if (nport->nport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) {
+ /*
+ * Deletion of this nport already in progress. Register callback
+ * and return.
+ */
+ /* TODO: Register callback in callback vector. For now, set the error and return. */
+ err = -ENODEV;
+ goto out;
+ } else {
+ /* nport partially created/deleted */
+ DEV_VERIFY(nport->nport_state == SPDK_NVMF_FC_OBJECT_ZOMBIE);
+ DEV_VERIFY(0 != "Nport in zombie state");
+ err = -ENODEV;
+ goto out;
+ }
+
+ /*
+ * Remove this nport from listening addresses across subsystems
+ */
+ rc = nvmf_fc_adm_add_rem_nport_listener(nport, false);
+
+ if (0 != rc) {
+ err = nvmf_fc_nport_set_state(nport, SPDK_NVMF_FC_OBJECT_ZOMBIE);
+ SPDK_ERRLOG("Unable to remove the listen addr in the subsystems for nport %d.\n",
+ nport->nport_hdl);
+ goto out;
+ }
+
+ /*
+ * Delete all the remote ports (if any) for the nport
+ */
+ /* TODO - Need to do this with a "first" and a "next" accessor function
+ * for completeness. Look at app-subsystem as examples.
+ */
+ if (nvmf_fc_nport_has_no_rport(nport)) {
+ /* No rports to delete. Complete the nport deletion. */
+ nvmf_fc_adm_delete_nport_cb(nport->port_hdl, SPDK_FC_NPORT_DELETE, cb_data, 0);
+ goto out;
+ }
+
+ TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) {
+ struct spdk_nvmf_fc_hw_i_t_delete_args *it_del_args = calloc(
+ 1, sizeof(struct spdk_nvmf_fc_hw_i_t_delete_args));
+
+ if (it_del_args == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("SPDK_FC_IT_DELETE no mem to delete rport with rpi:%d s_id:%d.\n",
+ rport_iter->rpi, rport_iter->s_id);
+ DEV_VERIFY(!"SPDK_FC_IT_DELETE failed, cannot allocate memory");
+ goto out;
+ }
+
+ rport_cnt++;
+ it_del_args->port_handle = nport->port_hdl;
+ it_del_args->nport_handle = nport->nport_hdl;
+ it_del_args->cb_ctx = (void *)cb_data;
+ it_del_args->rpi = rport_iter->rpi;
+ it_del_args->s_id = rport_iter->s_id;
+
+ nvmf_fc_master_enqueue_event(SPDK_FC_IT_DELETE, (void *)it_del_args,
+ nvmf_fc_adm_delete_nport_cb);
+ }
+
+out:
+ /* On failure, execute the callback function now */
+ if ((err != 0) || (rc != 0)) {
+ SPDK_ERRLOG("NPort %d delete failed, error:%d, fc port:%d, "
+ "rport_cnt:%d rc:%d.\n",
+ args->nport_handle, err, args->port_handle,
+ rport_cnt, rc);
+ if (cb_data) {
+ free(cb_data);
+ }
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_NPORT_DELETE, args->cb_ctx, err);
+ }
+
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API,
+			      "NPort %d delete done successfully, fc port:%d. "
+ "rport_cnt:%d\n",
+ args->nport_handle, args->port_handle, rport_cnt);
+ }
+
+ free(arg);
+}
+
+/*
+ * Process a PRLI/IT add.
+ */
+static void
+nvmf_fc_adm_evnt_i_t_add(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_hw_i_t_add_args *args = (struct spdk_nvmf_fc_hw_i_t_add_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL;
+ struct spdk_nvmf_fc_remote_port_info *rport = NULL;
+ int err = 0;
+
+ /*
+ * Make sure the nport port exists.
+ */
+ nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle);
+ if (nport == NULL) {
+ SPDK_ERRLOG("Unable to find the SPDK FC nport %d\n", args->nport_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Check for duplicate i_t_add.
+ */
+ TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) {
+ if ((rport_iter->s_id == args->s_id) && (rport_iter->rpi == args->rpi)) {
+ SPDK_ERRLOG("Duplicate rport found for FC nport %d: sid:%d rpi:%d\n",
+ args->nport_handle, rport_iter->s_id, rport_iter->rpi);
+ err = -EEXIST;
+ goto out;
+ }
+ }
+
+ /*
+ * Get the memory to instantiate the remote port
+ */
+ rport = calloc(1, sizeof(struct spdk_nvmf_fc_remote_port_info));
+ if (rport == NULL) {
+ SPDK_ERRLOG("Memory allocation for rem port failed.\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Initialize the contents for the rport
+ */
+ (void)nvmf_fc_rport_set_state(rport, SPDK_NVMF_FC_OBJECT_CREATED);
+ rport->s_id = args->s_id;
+ rport->rpi = args->rpi;
+ rport->fc_nodename = args->fc_nodename;
+ rport->fc_portname = args->fc_portname;
+
+ /*
+ * Add remote port to nport
+ */
+ if (nvmf_fc_nport_add_rem_port(nport, rport) != 0) {
+ DEV_VERIFY(!"Error while adding rport to list");
+ };
+
+	/*
+	 * TODO: Do we validate the initiator's service parameters?
+	 */
+
+	/*
+	 * Get the target's service parameters from the library
+	 * to return to the driver.
+	 */
+ args->target_prli_info = nvmf_fc_get_prli_service_params();
+
+out:
+ if (api_data->cb_func != NULL) {
+		/*
+		 * Notify the caller; the updated args (e.g. target_prli_info)
+		 * remain available to the driver after this callback.
+		 */
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_IT_ADD, args->cb_ctx, err);
+ }
+
+ free(arg);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API,
+ "IT add on nport %d done, rc = %d.\n",
+ args->nport_handle, err);
+}
+
+/**
+ * Process an IT delete.
+ */
+static void
+nvmf_fc_adm_evnt_i_t_delete(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_hw_i_t_delete_args *args = (struct spdk_nvmf_fc_hw_i_t_delete_args *)
+ api_data->api_args;
+ int rc = 0;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ struct spdk_nvmf_fc_adm_i_t_del_cb_data *cb_data = NULL;
+ struct spdk_nvmf_fc_remote_port_info *rport_iter = NULL;
+ struct spdk_nvmf_fc_remote_port_info *rport = NULL;
+ uint32_t num_rport = 0;
+ char log_str[256];
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "IT delete on nport:%d begin.\n", args->nport_handle);
+
+ /*
+ * Make sure the nport port exists. If it does not, error out.
+ */
+ nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle);
+ if (nport == NULL) {
+ SPDK_ERRLOG("Unable to find the SPDK FC nport:%d\n", args->nport_handle);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Find this ITN / rport (remote port).
+ */
+ TAILQ_FOREACH(rport_iter, &nport->rem_port_list, link) {
+ num_rport++;
+ if ((rport_iter->s_id == args->s_id) &&
+ (rport_iter->rpi == args->rpi) &&
+ (rport_iter->rport_state == SPDK_NVMF_FC_OBJECT_CREATED)) {
+ rport = rport_iter;
+ break;
+ }
+ }
+
+ /*
+ * We should find either zero or exactly one rport.
+ *
+ * If we find zero rports, that means that a previous request has
+ * removed the rport by the time we reached here. In this case,
+ * simply return out.
+ */
+ if (rport == NULL) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ /*
+ * We have found exactly one rport. Allocate memory for callback data.
+ */
+ cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_i_t_del_cb_data));
+ if (NULL == cb_data) {
+ SPDK_ERRLOG("Failed to allocate memory for cb_data for nport:%d.\n", args->nport_handle);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ cb_data->nport = nport;
+ cb_data->rport = rport;
+ cb_data->port_handle = args->port_handle;
+ cb_data->fc_cb_func = api_data->cb_func;
+ cb_data->fc_cb_ctx = args->cb_ctx;
+
+ /*
+ * Validate rport object state.
+ */
+ if (rport->rport_state == SPDK_NVMF_FC_OBJECT_CREATED) {
+ (void)nvmf_fc_rport_set_state(rport, SPDK_NVMF_FC_OBJECT_TO_BE_DELETED);
+ } else if (rport->rport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) {
+ /*
+ * Deletion of this rport already in progress. Register callback
+ * and return.
+ */
+ /* TODO: Register callback in callback vector. For now, set the error and return. */
+ rc = -ENODEV;
+ goto out;
+ } else {
+ /* rport partially created/deleted */
+ DEV_VERIFY(rport->rport_state == SPDK_NVMF_FC_OBJECT_ZOMBIE);
+ DEV_VERIFY(!"Invalid rport_state");
+ rc = -ENODEV;
+ goto out;
+ }
+
+ /*
+ * We have successfully found a rport to delete. Call
+ * nvmf_fc_i_t_delete_assoc(), which will perform further
+ * IT-delete processing as well as free the cb_data.
+ */
+ nvmf_fc_adm_i_t_delete_assoc(nport, rport, nvmf_fc_adm_i_t_delete_cb,
+ (void *)cb_data);
+
+out:
+ if (rc != 0) {
+ /*
+ * We have entered here because either we encountered an
+ * error, or we did not find a rport to delete.
+ * As a result, we will not call the function
+ * nvmf_fc_i_t_delete_assoc() for further IT-delete
+ * processing. Therefore, execute the callback function now.
+ */
+ if (cb_data) {
+ free(cb_data);
+ }
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_IT_DELETE, args->cb_ctx, rc);
+ }
+ }
+
+ snprintf(log_str, sizeof(log_str),
+ "IT delete on nport:%d end. num_rport:%d rc = %d.\n",
+ args->nport_handle, num_rport, rc);
+
+ if (rc != 0) {
+ SPDK_ERRLOG("%s", log_str);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ }
+
+ free(arg);
+}
+
+/*
+ * Process ABTS received
+ */
+static void
+nvmf_fc_adm_evnt_abts_recv(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_abts_args *args = (struct spdk_nvmf_fc_abts_args *)api_data->api_args;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ int err = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "FC ABTS received. RPI:%d, oxid:%d, rxid:%d\n", args->rpi,
+ args->oxid, args->rxid);
+
+ /*
+ * 1. Make sure the nport port exists.
+ */
+ nport = nvmf_fc_nport_find(args->port_handle, args->nport_handle);
+ if (nport == NULL) {
+ SPDK_ERRLOG("Unable to find the SPDK FC nport %d\n", args->nport_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * 2. If the nport is in the process of being deleted, drop the ABTS.
+ */
+ if (nport->nport_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API,
+ "FC ABTS dropped because the nport is being deleted; RPI:%d, oxid:%d, rxid:%d\n",
+ args->rpi, args->oxid, args->rxid);
+ err = 0;
+ goto out;
+
+ }
+
+ /*
+ * 3. Pass the received ABTS-LS to the library for handling.
+ */
+ nvmf_fc_handle_abts_frame(nport, args->rpi, args->oxid, args->rxid);
+
+out:
+ if (api_data->cb_func != NULL) {
+		/*
+		 * Pass a pointer to the args struct in place of cb_ctx;
+		 * the cb_func should handle it appropriately.
+		 */
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_ABTS_RECV, args, err);
+ } else {
+ /* No callback set, free the args */
+ free(args);
+ }
+
+ free(arg);
+}
+
+/*
+ * Callback function for hw port quiesce.
+ */
+static void
+nvmf_fc_adm_hw_port_quiesce_reset_cb(void *ctx, int err)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_hw_port_reset_ctx *reset_ctx =
+ (struct spdk_nvmf_fc_adm_hw_port_reset_ctx *)ctx;
+ struct spdk_nvmf_fc_hw_port_reset_args *args = reset_ctx->reset_args;
+ spdk_nvmf_fc_callback cb_func = reset_ctx->reset_cb_func;
+ struct spdk_nvmf_fc_queue_dump_info dump_info;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ char *dump_buf = NULL;
+ uint32_t dump_buf_size = SPDK_FC_HW_DUMP_BUF_SIZE;
+
+ /*
+ * Free the callback context struct.
+ */
+ free(ctx);
+
+ if (err != 0) {
+ SPDK_ERRLOG("Port %d quiesce operation failed.\n", args->port_handle);
+ goto out;
+ }
+
+ if (args->dump_queues == false) {
+ /*
+ * Queues need not be dumped.
+ */
+ goto out;
+ }
+
+ SPDK_ERRLOG("Dumping queues for HW port %d\n", args->port_handle);
+
+ /*
+ * Get the fc port.
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (fc_port == NULL) {
+ SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Allocate memory for the dump buffer.
+ * This memory will be freed by FCT.
+ */
+ dump_buf = (char *)calloc(1, dump_buf_size);
+ if (dump_buf == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("Memory allocation for dump buffer failed, SPDK FC port %d\n", args->port_handle);
+ goto out;
+ }
+ *args->dump_buf = (uint32_t *)dump_buf;
+ dump_info.buffer = dump_buf;
+ dump_info.offset = 0;
+
+ /*
+ * Add the dump reason to the top of the buffer.
+ */
+ nvmf_fc_dump_buf_print(&dump_info, "%s\n", args->reason);
+
+ /*
+ * Dump the hwqp.
+ */
+ nvmf_fc_dump_all_queues(&fc_port->ls_queue, fc_port->io_queues,
+ fc_port->num_io_queues, &dump_info);
+
+out:
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d reset done, queues_dumped = %d, rc = %d.\n",
+ args->port_handle, args->dump_queues, err);
+
+ if (cb_func != NULL) {
+ (void)cb_func(args->port_handle, SPDK_FC_HW_PORT_RESET, args->cb_ctx, err);
+ }
+}
+
+/*
+ * HW port reset.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_reset(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_fc_hw_port_reset_args *args = (struct spdk_nvmf_fc_hw_port_reset_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ struct spdk_nvmf_fc_adm_hw_port_reset_ctx *ctx = NULL;
+ int err = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d dump\n", args->port_handle);
+
+ /*
+ * Make sure the physical port exists.
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (fc_port == NULL) {
+ SPDK_ERRLOG("Unable to find the SPDK FC port %d\n", args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Save the reset event args and the callback in a context struct.
+ */
+ ctx = calloc(1, sizeof(struct spdk_nvmf_fc_adm_hw_port_reset_ctx));
+
+ if (ctx == NULL) {
+ err = -ENOMEM;
+ SPDK_ERRLOG("Memory allocation for reset ctx failed, SPDK FC port %d\n", args->port_handle);
+ goto fail;
+ }
+
+ ctx->reset_args = arg;
+ ctx->reset_cb_func = api_data->cb_func;
+
+ /*
+ * Quiesce the hw port.
+ */
+ err = nvmf_fc_adm_hw_port_quiesce(fc_port, ctx, nvmf_fc_adm_hw_port_quiesce_reset_cb);
+ if (err != 0) {
+ goto fail;
+ }
+
+	/*
+	 * Once the port is successfully quiesced, the reset processing
+	 * continues in the callback function: nvmf_fc_adm_hw_port_quiesce_reset_cb().
+	 */
+ return;
+fail:
+ free(ctx);
+
+out:
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "HW port %d dump done, rc = %d.\n", args->port_handle,
+ err);
+
+ if (api_data->cb_func != NULL) {
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_HW_PORT_RESET, args->cb_ctx, err);
+ }
+
+ free(arg);
+}
+
+/*
+ * Process a link break event on a HW port.
+ */
+static void
+nvmf_fc_adm_evnt_hw_port_link_break(void *arg)
+{
+ ASSERT_SPDK_FC_MASTER_THREAD();
+ struct spdk_nvmf_fc_adm_api_data *api_data = (struct spdk_nvmf_fc_adm_api_data *)arg;
+ struct spdk_nvmf_hw_port_link_break_args *args = (struct spdk_nvmf_hw_port_link_break_args *)
+ api_data->api_args;
+ struct spdk_nvmf_fc_port *fc_port = NULL;
+ int err = 0;
+ struct spdk_nvmf_fc_adm_port_link_break_cb_data *cb_data = NULL;
+ struct spdk_nvmf_fc_nport *nport = NULL;
+ uint32_t nport_deletes_sent = 0;
+ uint32_t nport_deletes_skipped = 0;
+ struct spdk_nvmf_fc_nport_delete_args *nport_del_args = NULL;
+ char log_str[256];
+
+ /*
+ * Get the fc port using the port handle.
+ */
+ fc_port = nvmf_fc_port_lookup(args->port_handle);
+ if (!fc_port) {
+ SPDK_ERRLOG("port link break: Unable to find the SPDK FC port %d\n",
+ args->port_handle);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Set the port state to offline, if it is not already.
+ */
+ err = nvmf_fc_port_set_offline(fc_port);
+ if (err != 0) {
+ SPDK_ERRLOG("port link break: HW port %d already offline. rc = %d\n",
+ fc_port->port_hdl, err);
+ err = 0;
+ goto out;
+ }
+
+ /*
+ * Delete all the nports, if any.
+ */
+ if (!TAILQ_EMPTY(&fc_port->nport_list)) {
+ TAILQ_FOREACH(nport, &fc_port->nport_list, link) {
+			/* Skip nports that are not in the CREATED state. */
+ if (nport->nport_state != SPDK_NVMF_FC_OBJECT_CREATED) {
+ nport_deletes_skipped++;
+ continue;
+ }
+
+ /* Allocate memory for callback data. */
+ cb_data = calloc(1, sizeof(struct spdk_nvmf_fc_adm_port_link_break_cb_data));
+ if (NULL == cb_data) {
+ SPDK_ERRLOG("port link break: Failed to allocate memory for cb_data %d.\n",
+ args->port_handle);
+ err = -ENOMEM;
+ goto out;
+ }
+ cb_data->args = args;
+ cb_data->cb_func = api_data->cb_func;
+ nport_del_args = &cb_data->nport_del_args;
+ nport_del_args->port_handle = args->port_handle;
+ nport_del_args->nport_handle = nport->nport_hdl;
+ nport_del_args->cb_ctx = cb_data;
+
+ nvmf_fc_master_enqueue_event(SPDK_FC_NPORT_DELETE,
+ (void *)nport_del_args,
+ nvmf_fc_adm_hw_port_link_break_cb);
+
+ nport_deletes_sent++;
+ }
+ }
+
+ if (nport_deletes_sent == 0 && err == 0) {
+ /*
+ * Mark the hwqps as offline and unregister the pollers.
+ */
+ (void)nvmf_fc_adm_port_hwqp_offline_del_poller(fc_port);
+ }
+
+out:
+ snprintf(log_str, sizeof(log_str),
+ "port link break done: port:%d nport_deletes_sent:%d nport_deletes_skipped:%d rc:%d.\n",
+ args->port_handle, nport_deletes_sent, nport_deletes_skipped, err);
+
+ if (err != 0) {
+ SPDK_ERRLOG("%s", log_str);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "%s", log_str);
+ }
+
+ if ((api_data->cb_func != NULL) && (nport_deletes_sent == 0)) {
+ /*
+ * No nport_deletes are sent, which would have eventually
+ * called the port_link_break callback. Therefore, call the
+ * port_link_break callback here.
+ */
+ (void)api_data->cb_func(args->port_handle, SPDK_FC_LINK_BREAK, args->cb_ctx, err);
+ }
+
+ free(arg);
+}
+
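+/*
+ * Dispatch an admin event handler to the FC master thread, where all
+ * FC administrative state is manipulated.
+ */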
+static inline void
+nvmf_fc_adm_run_on_master_thread(spdk_msg_fn fn, void *args)
+{
+ if (nvmf_fc_get_master_thread()) {
+ spdk_thread_send_msg(nvmf_fc_get_master_thread(), fn, args);
+ }
+}
+
+/*
+ * Queue up an event in the SPDK master's event queue.
+ * Used by the FC driver to notify the SPDK master of FC-related events.
+ */
+int
+nvmf_fc_master_enqueue_event(enum spdk_fc_event event_type, void *args,
+ spdk_nvmf_fc_callback cb_func)
+{
+ int err = 0;
+ struct spdk_nvmf_fc_adm_api_data *api_data = NULL;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Enqueue event %d.\n", event_type);
+
+ if (event_type >= SPDK_FC_EVENT_MAX) {
+ SPDK_ERRLOG("Invalid spdk_fc_event_t %d.\n", event_type);
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (args == NULL) {
+ SPDK_ERRLOG("Null args for event %d.\n", event_type);
+ err = -EINVAL;
+ goto done;
+ }
+
+ api_data = calloc(1, sizeof(*api_data));
+
+ if (api_data == NULL) {
+ SPDK_ERRLOG("Failed to alloc api data for event %d.\n", event_type);
+ err = -ENOMEM;
+ goto done;
+ }
+
+ api_data->api_args = args;
+ api_data->cb_func = cb_func;
+
+ switch (event_type) {
+ case SPDK_FC_HW_PORT_INIT:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_init,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_HW_PORT_ONLINE:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_online,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_HW_PORT_OFFLINE:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_offline,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_NPORT_CREATE:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_nport_create,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_NPORT_DELETE:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_nport_delete,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_IT_ADD:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_i_t_add,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_IT_DELETE:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_i_t_delete,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_ABTS_RECV:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_abts_recv,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_LINK_BREAK:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_link_break,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_HW_PORT_RESET:
+ nvmf_fc_adm_run_on_master_thread(nvmf_fc_adm_evnt_hw_port_reset,
+ (void *)api_data);
+ break;
+
+ case SPDK_FC_UNRECOVERABLE_ERR:
+ default:
+ SPDK_ERRLOG("Invalid spdk_fc_event_t: %d\n", event_type);
+ err = -EINVAL;
+ break;
+ }
+
+done:
+
+ if (err == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_ADM_API, "Enqueue event %d done successfully\n", event_type);
+ } else {
+ SPDK_ERRLOG("Enqueue event %d failed, err = %d\n", event_type, err);
+ if (api_data) {
+ free(api_data);
+ }
+ }
+
+ return err;
+}
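+
+/*
+ * Illustrative usage sketch (not part of this file): a low-level FC driver
+ * would typically allocate the event args and enqueue them from its own
+ * context, e.g. for a port-online notification:
+ *
+ *	struct spdk_nvmf_fc_hw_port_online_args *args = calloc(1, sizeof(*args));
+ *
+ *	if (args) {
+ *		args->port_handle = port_handle;
+ *		args->cb_ctx = driver_ctx;
+ *		if (nvmf_fc_master_enqueue_event(SPDK_FC_HW_PORT_ONLINE,
+ *						 args, driver_cb) != 0) {
+ *			free(args);
+ *		}
+ *	}
+ *
+ * driver_cb and driver_ctx are hypothetical driver-side names; the callback
+ * is invoked on the master thread as
+ * driver_cb(port_handle, SPDK_FC_HW_PORT_ONLINE, cb_ctx, err).
+ */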
+
+SPDK_NVMF_TRANSPORT_REGISTER(fc, &spdk_nvmf_transport_fc);
+SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_adm_api", SPDK_LOG_NVMF_FC_ADM_API);
+SPDK_LOG_REGISTER_COMPONENT("nvmf_fc", SPDK_LOG_NVMF_FC)
diff --git a/src/spdk/lib/nvmf/fc_ls.c b/src/spdk/lib/nvmf/fc_ls.c
new file mode 100644
index 000000000..1aa06bd45
--- /dev/null
+++ b/src/spdk/lib/nvmf/fc_ls.c
@@ -0,0 +1,1678 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (c) 2018-2019 Broadcom. All Rights Reserved.
+ * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/env.h"
+#include "spdk/assert.h"
+#include "spdk/nvmf.h"
+#include "spdk/nvmf_spec.h"
+#include "spdk/string.h"
+#include "spdk/trace.h"
+#include "spdk/util.h"
+#include "spdk/endian.h"
+#include "spdk_internal/log.h"
+#include "nvmf_internal.h"
+#include "transport.h"
+
+#include "nvmf_fc.h"
+#include "fc_lld.h"
+
+/* Set to 1 to send an LS disconnect in response to an LS disconnect from the host (per the standard). */
+#define NVMF_FC_LS_SEND_LS_DISCONNECT 0
+
+/* Validation Error indexes into the string table below */
+enum {
+ VERR_NO_ERROR = 0,
+ VERR_CR_ASSOC_LEN = 1,
+ VERR_CR_ASSOC_RQST_LEN = 2,
+ VERR_CR_ASSOC_CMD = 3,
+ VERR_CR_ASSOC_CMD_LEN = 4,
+ VERR_ERSP_RATIO = 5,
+ VERR_ASSOC_ALLOC_FAIL = 6,
+ VERR_CONN_ALLOC_FAIL = 7,
+ VERR_CR_CONN_LEN = 8,
+ VERR_CR_CONN_RQST_LEN = 9,
+ VERR_ASSOC_ID = 10,
+ VERR_ASSOC_ID_LEN = 11,
+ VERR_NO_ASSOC = 12,
+ VERR_CONN_ID = 13,
+ VERR_CONN_ID_LEN = 14,
+ VERR_NO_CONN = 15,
+ VERR_CR_CONN_CMD = 16,
+ VERR_CR_CONN_CMD_LEN = 17,
+ VERR_DISCONN_LEN = 18,
+ VERR_DISCONN_RQST_LEN = 19,
+ VERR_DISCONN_CMD = 20,
+ VERR_DISCONN_CMD_LEN = 21,
+ VERR_DISCONN_SCOPE = 22,
+ VERR_RS_LEN = 23,
+ VERR_RS_RQST_LEN = 24,
+ VERR_RS_CMD = 25,
+ VERR_RS_CMD_LEN = 26,
+ VERR_RS_RCTL = 27,
+ VERR_RS_RO = 28,
+ VERR_CONN_TOO_MANY = 29,
+ VERR_SUBNQN = 30,
+ VERR_HOSTNQN = 31,
+ VERR_SQSIZE = 32,
+ VERR_NO_RPORT = 33,
+ VERR_SUBLISTENER = 34,
+};
+
+static char *validation_errors[] = {
+ "OK",
+ "Bad CR_ASSOC Length",
+ "Bad CR_ASSOC Rqst Length",
+ "Not CR_ASSOC Cmd",
+ "Bad CR_ASSOC Cmd Length",
+ "Bad Ersp Ratio",
+ "Association Allocation Failed",
+ "Queue Allocation Failed",
+ "Bad CR_CONN Length",
+ "Bad CR_CONN Rqst Length",
+ "Not Association ID",
+ "Bad Association ID Length",
+ "No Association",
+ "Not Connection ID",
+ "Bad Connection ID Length",
+ "No Connection",
+ "Not CR_CONN Cmd",
+ "Bad CR_CONN Cmd Length",
+ "Bad DISCONN Length",
+ "Bad DISCONN Rqst Length",
+ "Not DISCONN Cmd",
+ "Bad DISCONN Cmd Length",
+ "Bad Disconnect Scope",
+ "Bad RS Length",
+ "Bad RS Rqst Length",
+ "Not RS Cmd",
+ "Bad RS Cmd Length",
+ "Bad RS R_CTL",
+ "Bad RS Relative Offset",
+ "Too many connections for association",
+ "Invalid subnqn or subsystem not found",
+ "Invalid hostnqn or subsystem doesn't allow host",
+ "SQ size = 0 or too big",
+ "No Remote Port",
+ "Bad Subsystem Port",
+};
+
+static inline void
+nvmf_fc_add_assoc_to_tgt_port(struct spdk_nvmf_fc_nport *tgtport,
+ struct spdk_nvmf_fc_association *assoc,
+ struct spdk_nvmf_fc_remote_port_info *rport);
+
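+/*
+ * Byte-order helpers for building FC-NVMe LS payloads. nvmf_fc_lsdesc_len()
+ * returns the big-endian descriptor length, which excludes the first two
+ * 32-bit words (descriptor tag and length) of the descriptor itself.
+ */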
+static inline FCNVME_BE32 cpu_to_be32(uint32_t in)
+{
+ uint32_t t;
+
+ to_be32(&t, in);
+ return (FCNVME_BE32)t;
+}
+
+static inline FCNVME_BE32 nvmf_fc_lsdesc_len(size_t sz)
+{
+ uint32_t t;
+
+ to_be32(&t, sz - (2 * sizeof(uint32_t)));
+ return (FCNVME_BE32)t;
+}
+
+static void
+nvmf_fc_ls_format_rsp_hdr(void *buf, uint8_t ls_cmd, uint32_t desc_len,
+ uint8_t rqst_ls_cmd)
+{
+ struct spdk_nvmf_fc_ls_acc_hdr *acc_hdr = buf;
+
+ acc_hdr->w0.ls_cmd = ls_cmd;
+ acc_hdr->desc_list_len = desc_len;
+ to_be32(&acc_hdr->rqst.desc_tag, FCNVME_LSDESC_RQST);
+ acc_hdr->rqst.desc_len =
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_rqst));
+ acc_hdr->rqst.w0.ls_cmd = rqst_ls_cmd;
+}
+
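+/*
+ * Format an LS reject (LS_RJT) payload with the given reason and
+ * explanation codes into the supplied buffer; returns the number of bytes
+ * formatted.
+ */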
+static int
+nvmf_fc_ls_format_rjt(void *buf, uint16_t buflen, uint8_t ls_cmd,
+ uint8_t reason, uint8_t explanation, uint8_t vendor)
+{
+ struct spdk_nvmf_fc_ls_rjt *rjt = buf;
+
+ bzero(buf, sizeof(struct spdk_nvmf_fc_ls_rjt));
+ nvmf_fc_ls_format_rsp_hdr(buf, FCNVME_LSDESC_RQST,
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_rjt)),
+ ls_cmd);
+ to_be32(&rjt->rjt.desc_tag, FCNVME_LSDESC_RJT);
+ rjt->rjt.desc_len = nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_rjt));
+ rjt->rjt.reason_code = reason;
+ rjt->rjt.reason_explanation = explanation;
+ rjt->rjt.vendor = vendor;
+
+ return sizeof(struct spdk_nvmf_fc_ls_rjt);
+}
+
+/* ************************************************** */
+/* Allocators/Deallocators (associations, connections, */
+/* poller API data) */
+
+static inline void
+nvmf_fc_ls_free_association(struct spdk_nvmf_fc_association *assoc)
+{
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ /* return the q slots of the conns for the association */
+ TAILQ_FOREACH(fc_conn, &assoc->avail_fc_conns, assoc_avail_link) {
+ if (fc_conn->conn_id != NVMF_FC_INVALID_CONN_ID) {
+ nvmf_fc_release_conn(fc_conn->hwqp, fc_conn->conn_id,
+ fc_conn->max_queue_depth);
+ }
+ }
+
+	/* free the association's send disconnect buffer */
+ if (assoc->snd_disconn_bufs) {
+ nvmf_fc_free_srsr_bufs(assoc->snd_disconn_bufs);
+ }
+
+	/* free the association's connections */
+ free(assoc->conns_buf);
+
+ /* free the association */
+ free(assoc);
+}
+
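+/*
+ * Pre-allocate the association's connection pool: one spdk_nvmf_fc_conn
+ * per allowed qpair, parked on avail_fc_conns until claimed by
+ * nvmf_fc_ls_new_connection().
+ */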
+static int
+nvmf_fc_ls_alloc_connections(struct spdk_nvmf_fc_association *assoc,
+ struct spdk_nvmf_transport *nvmf_transport)
+{
+ uint32_t i;
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Pre-alloc %d qpairs for host NQN %s\n",
+ nvmf_transport->opts.max_qpairs_per_ctrlr, assoc->host_nqn);
+
+ /* allocate memory for all connections at once */
+ assoc->conns_buf = calloc(nvmf_transport->opts.max_qpairs_per_ctrlr + 1,
+ sizeof(struct spdk_nvmf_fc_conn));
+ if (assoc->conns_buf == NULL) {
+ SPDK_ERRLOG("Out of memory for connections for new association\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nvmf_transport->opts.max_qpairs_per_ctrlr; i++) {
+ fc_conn = assoc->conns_buf + (i * sizeof(struct spdk_nvmf_fc_conn));
+ fc_conn->conn_id = NVMF_FC_INVALID_CONN_ID;
+ fc_conn->qpair.state = SPDK_NVMF_QPAIR_UNINITIALIZED;
+ fc_conn->qpair.transport = nvmf_transport;
+
+ TAILQ_INSERT_TAIL(&assoc->avail_fc_conns, fc_conn, assoc_avail_link);
+ }
+
+ return 0;
+}
+
+static struct spdk_nvmf_fc_association *
+nvmf_fc_ls_new_association(uint32_t s_id,
+ struct spdk_nvmf_fc_nport *tgtport,
+ struct spdk_nvmf_fc_remote_port_info *rport,
+ struct spdk_nvmf_fc_lsdesc_cr_assoc_cmd *a_cmd,
+ struct spdk_nvmf_subsystem *subsys,
+ uint16_t rpi,
+ struct spdk_nvmf_transport *nvmf_transport)
+{
+ struct spdk_nvmf_fc_association *assoc;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "New Association request for port %d nport %d rpi 0x%x\n",
+ tgtport->fc_port->port_hdl, tgtport->nport_hdl, rpi);
+
+ assert(rport);
+ if (!rport) {
+ SPDK_ERRLOG("rport is null.\n");
+ return NULL;
+ }
+
+ assoc = calloc(1, sizeof(struct spdk_nvmf_fc_association));
+ if (!assoc) {
+ SPDK_ERRLOG("unable to allocate memory for new association\n");
+ return NULL;
+ }
+
+ /* initialize association */
+#if (NVMF_FC_LS_SEND_LS_DISCONNECT == 1)
+ /* allocate buffers to send LS disconnect command to host */
+ assoc->snd_disconn_bufs =
+ nvmf_fc_alloc_srsr_bufs(sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst),
+ sizeof(struct spdk_nvmf_fc_ls_rjt));
+ if (!assoc->snd_disconn_bufs) {
+ SPDK_ERRLOG("no dma memory for association's ls disconnect bufs\n");
+ free(assoc);
+ return NULL;
+ }
+
+ assoc->snd_disconn_bufs->rpi = rpi;
+#endif
+ assoc->s_id = s_id;
+ assoc->tgtport = tgtport;
+ assoc->rport = rport;
+ assoc->subsystem = subsys;
+ assoc->assoc_state = SPDK_NVMF_FC_OBJECT_CREATED;
+ memcpy(assoc->host_id, a_cmd->hostid, FCNVME_ASSOC_HOSTID_LEN);
+ memcpy(assoc->host_nqn, a_cmd->hostnqn, SPDK_NVME_NQN_FIELD_SIZE);
+ memcpy(assoc->sub_nqn, a_cmd->subnqn, SPDK_NVME_NQN_FIELD_SIZE);
+ TAILQ_INIT(&assoc->fc_conns);
+ TAILQ_INIT(&assoc->avail_fc_conns);
+ assoc->ls_del_op_ctx = NULL;
+
+ /* allocate and assign connections for association */
+ rc = nvmf_fc_ls_alloc_connections(assoc, nvmf_transport);
+ if (rc != 0) {
+ nvmf_fc_ls_free_association(assoc);
+ return NULL;
+ }
+
+ /* add association to target port's association list */
+ nvmf_fc_add_assoc_to_tgt_port(tgtport, assoc, rport);
+ return assoc;
+}
+
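+/*
+ * Several callers may request deletion of the same association. Each caller's
+ * op context is appended to a singly-linked list hung off assoc->ls_del_op_ctx
+ * and is drained by nvmf_fc_do_del_assoc_cbs() once the delete completes.
+ */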
+static inline void
+nvmf_fc_ls_append_del_cb_ctx(struct spdk_nvmf_fc_association *assoc,
+ struct nvmf_fc_ls_op_ctx *opd)
+{
+ /* append to delete assoc callback list */
+ if (!assoc->ls_del_op_ctx) {
+ assoc->ls_del_op_ctx = (void *)opd;
+ } else {
+ struct nvmf_fc_ls_op_ctx *nxt =
+ (struct nvmf_fc_ls_op_ctx *) assoc->ls_del_op_ctx;
+ while (nxt->next_op_ctx) {
+ nxt = nxt->next_op_ctx;
+ }
+ nxt->next_op_ctx = opd;
+ }
+}
+
+static struct spdk_nvmf_fc_conn *
+nvmf_fc_ls_new_connection(struct spdk_nvmf_fc_association *assoc, uint16_t qid,
+ uint16_t esrp_ratio, uint16_t rpi, uint16_t sq_size,
+ struct spdk_nvmf_fc_nport *tgtport)
+{
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ fc_conn = TAILQ_FIRST(&assoc->avail_fc_conns);
+ if (!fc_conn) {
+ SPDK_ERRLOG("out of connections for association %p\n", assoc);
+ return NULL;
+ }
+
+ /* Remove from avail list and add to in use. */
+ TAILQ_REMOVE(&assoc->avail_fc_conns, fc_conn, assoc_avail_link);
+ TAILQ_INSERT_TAIL(&assoc->fc_conns, fc_conn, assoc_link);
+
+ if (qid == 0) {
+ /* AdminQ connection. */
+ assoc->aq_conn = fc_conn;
+ }
+
+ fc_conn->qpair.qid = qid;
+ fc_conn->qpair.sq_head_max = sq_size;
+ TAILQ_INIT(&fc_conn->qpair.outstanding);
+ fc_conn->esrp_ratio = esrp_ratio;
+ fc_conn->fc_assoc = assoc;
+ fc_conn->rpi = rpi;
+ fc_conn->max_queue_depth = sq_size + 1;
+
+ /* save target port trid in connection (for subsystem
+ * listener validation in fabric connect command)
+ */
+ nvmf_fc_create_trid(&fc_conn->trid, tgtport->fc_nodename.u.wwn,
+ tgtport->fc_portname.u.wwn);
+
+ return fc_conn;
+}
+
+static inline void
+nvmf_fc_ls_free_connection(struct spdk_nvmf_fc_conn *fc_conn)
+{
+ TAILQ_INSERT_TAIL(&fc_conn->fc_assoc->avail_fc_conns, fc_conn, assoc_avail_link);
+}
+
+/* End - Allocators/Deallocators (associations, connections, */
+/* poller API data) */
+/* ******************************************************** */
+
+static inline struct spdk_nvmf_fc_association *
+nvmf_fc_ls_find_assoc(struct spdk_nvmf_fc_nport *tgtport, uint64_t assoc_id)
+{
+ struct spdk_nvmf_fc_association *assoc = NULL;
+
+ TAILQ_FOREACH(assoc, &tgtport->fc_associations, link) {
+ if (assoc->assoc_id == assoc_id) {
+ if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_ZOMBIE) {
+ assoc = NULL;
+ }
+ break;
+ }
+ }
+ return assoc;
+}
+
+static inline void
+nvmf_fc_add_assoc_to_tgt_port(struct spdk_nvmf_fc_nport *tgtport,
+ struct spdk_nvmf_fc_association *assoc,
+ struct spdk_nvmf_fc_remote_port_info *rport)
+{
+ TAILQ_INSERT_TAIL(&tgtport->fc_associations, assoc, link);
+ tgtport->assoc_count++;
+ rport->assoc_count++;
+}
+
+static inline void
+nvmf_fc_del_assoc_from_tgt_port(struct spdk_nvmf_fc_association *assoc)
+{
+ struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport;
+
+ TAILQ_REMOVE(&tgtport->fc_associations, assoc, link);
+ tgtport->assoc_count--;
+ assoc->rport->assoc_count--;
+}
+
+static void
+nvmf_fc_ls_rsp_fail_del_conn_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret)
+{
+ struct nvmf_fc_ls_op_ctx *opd =
+ (struct nvmf_fc_ls_op_ctx *)cb_data;
+ struct spdk_nvmf_fc_ls_del_conn_api_data *dp = &opd->u.del_conn;
+ struct spdk_nvmf_fc_association *assoc = dp->assoc;
+ struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Delete Connection callback "
+ "for assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id,
+ fc_conn->conn_id);
+
+ if (dp->aq_conn) {
+ /* delete association */
+ nvmf_fc_del_assoc_from_tgt_port(assoc);
+ nvmf_fc_ls_free_association(assoc);
+ } else {
+ /* remove connection from association's connection list */
+ TAILQ_REMOVE(&assoc->fc_conns, fc_conn, assoc_link);
+ nvmf_fc_ls_free_connection(fc_conn);
+ }
+
+ free(opd);
+}
+
+static void
+nvmf_fc_handle_xmt_ls_rsp_failure(struct spdk_nvmf_fc_association *assoc,
+ struct spdk_nvmf_fc_conn *fc_conn,
+ bool aq_conn)
+{
+ struct spdk_nvmf_fc_ls_del_conn_api_data *api_data;
+ struct nvmf_fc_ls_op_ctx *opd = NULL;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Transmit LS response failure "
+ "for assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id,
+ fc_conn->conn_id);
+
+
+ /* create context for delete connection API */
+ opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx));
+ if (!opd) { /* hopefully this doesn't happen - if so, we leak the connection */
+ SPDK_ERRLOG("Mem alloc failed for del conn op data");
+ return;
+ }
+
+ api_data = &opd->u.del_conn;
+ api_data->assoc = assoc;
+ api_data->ls_rqst = NULL;
+ api_data->aq_conn = aq_conn;
+ api_data->args.fc_conn = fc_conn;
+ api_data->args.send_abts = false;
+ api_data->args.hwqp = fc_conn->hwqp;
+ api_data->args.cb_info.cb_thread = spdk_get_thread();
+ api_data->args.cb_info.cb_func = nvmf_fc_ls_rsp_fail_del_conn_cb;
+ api_data->args.cb_info.cb_data = opd;
+
+ nvmf_fc_poller_api_func(api_data->args.hwqp,
+ SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION,
+ &api_data->args);
+}
+
+/* callback from poller's ADD_Connection event */
+static void
+nvmf_fc_ls_add_conn_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret)
+{
+ struct nvmf_fc_ls_op_ctx *opd =
+ (struct nvmf_fc_ls_op_ctx *)cb_data;
+ struct spdk_nvmf_fc_ls_add_conn_api_data *dp = &opd->u.add_conn;
+ struct spdk_nvmf_fc_association *assoc = dp->assoc;
+ struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport;
+ struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn;
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst = dp->ls_rqst;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "add_conn_cb: assoc_id = 0x%lx, conn_id = 0x%lx\n",
+ assoc->assoc_id, fc_conn->conn_id);
+
+ fc_conn->create_opd = NULL;
+
+ if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) {
+ /* association is already being deleted - don't continue */
+ free(opd);
+ return;
+ }
+
+ if (dp->aq_conn) {
+ struct spdk_nvmf_fc_ls_cr_assoc_acc *assoc_acc =
+ (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt;
+ /* put connection and association ID in response */
+ to_be64(&assoc_acc->conn_id.connection_id, fc_conn->conn_id);
+ assoc_acc->assoc_id.association_id = assoc_acc->conn_id.connection_id;
+ } else {
+ struct spdk_nvmf_fc_ls_cr_conn_acc *conn_acc =
+ (struct spdk_nvmf_fc_ls_cr_conn_acc *)ls_rqst->rspbuf.virt;
+ /* put connection ID in response */
+ to_be64(&conn_acc->conn_id.connection_id, fc_conn->conn_id);
+ }
+
+ /* send LS response */
+ if (nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst) != 0) {
+ SPDK_ERRLOG("Send LS response for %s failed - cleaning up\n",
+ dp->aq_conn ? "association" : "connection");
+ nvmf_fc_handle_xmt_ls_rsp_failure(assoc, fc_conn,
+ dp->aq_conn);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "LS response (conn_id 0x%lx) sent\n", fc_conn->conn_id);
+ }
+
+ free(opd);
+}
+
+void
+nvmf_fc_ls_add_conn_failure(
+ struct spdk_nvmf_fc_association *assoc,
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst,
+ struct spdk_nvmf_fc_conn *fc_conn,
+ bool aq_conn)
+{
+ struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst;
+ struct spdk_nvmf_fc_ls_cr_assoc_acc *acc;
+ struct spdk_nvmf_fc_nport *tgtport = assoc->tgtport;
+
+ if (fc_conn->create_opd) {
+ free(fc_conn->create_opd);
+ fc_conn->create_opd = NULL;
+ }
+
+ rqst = (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt;
+ acc = (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt;
+
+ /* send failure response */
+ ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc,
+ FCNVME_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
+ FCNVME_RJT_RC_INSUFF_RES,
+ FCNVME_RJT_EXP_NONE, 0);
+
+ nvmf_fc_ls_free_connection(fc_conn);
+ if (aq_conn) {
+ nvmf_fc_del_assoc_from_tgt_port(assoc);
+ nvmf_fc_ls_free_association(assoc);
+ }
+
+ nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst);
+}
+
+
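+/*
+ * Hands the new connection to the nvmf target via spdk_nvmf_tgt_new_qpair().
+ * Once the connection has been added on its poller, nvmf_fc_ls_add_conn_cb()
+ * (above) fills the connection/association IDs into the accept buffer and
+ * transmits the LS response; on transmit failure the connection (and, for an
+ * admin queue, the association) is torn down again.
+ */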
+static void
+nvmf_fc_ls_add_conn_to_poller(
+ struct spdk_nvmf_fc_association *assoc,
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst,
+ struct spdk_nvmf_fc_conn *fc_conn,
+ bool aq_conn)
+{
+ struct nvmf_fc_ls_op_ctx *opd;
+ struct spdk_nvmf_fc_ls_add_conn_api_data *api_data;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Add Connection to poller for "
+ "assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id,
+ fc_conn->conn_id);
+
+ opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx));
+ if (!opd) {
+ SPDK_ERRLOG("allocate api data for add conn op failed\n");
+ nvmf_fc_ls_add_conn_failure(assoc, ls_rqst, fc_conn, aq_conn);
+ return;
+ }
+
+ /* insert conn in association's connection list */
+ api_data = &opd->u.add_conn;
+ assoc->conn_count++;
+
+ api_data->args.fc_conn = fc_conn;
+ api_data->args.cb_info.cb_thread = spdk_get_thread();
+ api_data->args.cb_info.cb_func = nvmf_fc_ls_add_conn_cb;
+ api_data->args.cb_info.cb_data = (void *)opd;
+ api_data->assoc = assoc;
+ api_data->ls_rqst = ls_rqst;
+ api_data->aq_conn = aq_conn;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "New QP callback called.\n");
+
+ /* Let the nvmf_tgt decide which pollgroup to use. */
+ fc_conn->create_opd = opd;
+ spdk_nvmf_tgt_new_qpair(ls_rqst->nvmf_tgt, &fc_conn->qpair);
+}
+
+/* Delete association functions */
+
+static void
+nvmf_fc_do_del_assoc_cbs(struct nvmf_fc_ls_op_ctx *opd, int ret)
+{
+ struct nvmf_fc_ls_op_ctx *nxt;
+ struct spdk_nvmf_fc_delete_assoc_api_data *dp;
+
+ while (opd) {
+ dp = &opd->u.del_assoc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "performing delete assoc. callback\n");
+ dp->del_assoc_cb(dp->del_assoc_cb_data, ret);
+
+ nxt = opd->next_op_ctx;
+ free(opd);
+ opd = nxt;
+ }
+}
+
+static void
+nvmf_fs_send_ls_disconnect_cb(void *hwqp, int32_t status, void *args)
+{
+ if (args) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "free disconnect buffers\n");
+ nvmf_fc_free_srsr_bufs((struct spdk_nvmf_fc_srsr_bufs *)args);
+ }
+}
+
+static void
+nvmf_fc_del_all_conns_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret)
+{
+ struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data;
+ struct spdk_nvmf_fc_delete_assoc_api_data *dp = &opd->u.del_assoc;
+ struct spdk_nvmf_fc_association *assoc = dp->assoc;
+ struct spdk_nvmf_fc_conn *fc_conn = dp->args.fc_conn;
+
+ /* The assumption here is that there is no error (i.e. ret == success).
+ * Since connections are deleted in parallel, nothing can be done
+ * about an error anyway: all connection deletes must complete before
+ * the caller is called back. */
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "Delete all connections for assoc_id 0x%lx, conn_id = %lx\n",
+ assoc->assoc_id, fc_conn->conn_id);
+
+ /* remove connection from association's connection list */
+ TAILQ_REMOVE(&assoc->fc_conns, fc_conn, assoc_link);
+ nvmf_fc_ls_free_connection(fc_conn);
+
+ if (--assoc->conn_count == 0) {
+ /* last connection - remove association from target port's association list */
+ struct nvmf_fc_ls_op_ctx *cb_opd = (struct nvmf_fc_ls_op_ctx *)assoc->ls_del_op_ctx;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "remove assoc. %lx\n", assoc->assoc_id);
+ nvmf_fc_del_assoc_from_tgt_port(assoc);
+
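+ /*
+ * Optionally notify the host: if disconnect buffers were pre-allocated
+ * (NVMF_FC_LS_SEND_LS_DISCONNECT) and the port is still online, build
+ * and send an LS DISCONNECT request back to the initiator.
+ */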
+ if (assoc->snd_disconn_bufs &&
+ assoc->tgtport->fc_port->hw_port_status == SPDK_FC_PORT_ONLINE) {
+
+ struct spdk_nvmf_fc_ls_disconnect_rqst *dc_rqst;
+ struct spdk_nvmf_fc_srsr_bufs *srsr_bufs;
+
+ dc_rqst = (struct spdk_nvmf_fc_ls_disconnect_rqst *)
+ assoc->snd_disconn_bufs->rqst;
+
+ bzero(dc_rqst, sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst));
+
+ /* fill in request descriptor */
+ dc_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT;
+ to_be32(&dc_rqst->desc_list_len,
+ sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst) -
+ (2 * sizeof(uint32_t)));
+
+ /* fill in disconnect command descriptor */
+ to_be32(&dc_rqst->disconn_cmd.desc_tag, FCNVME_LSDESC_DISCONN_CMD);
+ to_be32(&dc_rqst->disconn_cmd.desc_len,
+ sizeof(struct spdk_nvmf_fc_lsdesc_disconn_cmd) -
+ (2 * sizeof(uint32_t)));
+
+ /* fill in association id descriptor */
+ to_be32(&dc_rqst->assoc_id.desc_tag, FCNVME_LSDESC_ASSOC_ID);
+ to_be32(&dc_rqst->assoc_id.desc_len,
+ sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id) -
+ (2 * sizeof(uint32_t)));
+ to_be64(&dc_rqst->assoc_id.association_id, assoc->assoc_id);
+
+ srsr_bufs = assoc->snd_disconn_bufs;
+ assoc->snd_disconn_bufs = NULL;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Send LS disconnect\n");
+ if (nvmf_fc_xmt_srsr_req(&assoc->tgtport->fc_port->ls_queue,
+ srsr_bufs, nvmf_fs_send_ls_disconnect_cb,
+ (void *)srsr_bufs)) {
+ SPDK_ERRLOG("Error sending LS disconnect\n");
+ assoc->snd_disconn_bufs = srsr_bufs;
+ }
+ }
+
+ nvmf_fc_ls_free_association(assoc);
+
+ /* perform callbacks to all callers to delete association */
+ nvmf_fc_do_del_assoc_cbs(cb_opd, 0);
+
+ }
+
+ free(opd);
+}
+
+static void
+nvmf_fc_kill_io_del_all_conns_cb(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret)
+{
+ struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Callback after killing outstanding ABTS.");
+ /*
+ * NOTE: We should not access any connection or association related data
+ * structures here.
+ */
+ free(opd);
+}
+
+
+/* Disconnect/delete (association) request functions */
+
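+/*
+ * Association delete is asynchronous: the association is marked
+ * SPDK_NVMF_FC_OBJECT_TO_BE_DELETED, a DEL_CONNECTION poller request is issued
+ * for each of its connections, and the last connection's completion callback
+ * (nvmf_fc_del_all_conns_cb) removes the association and runs any queued
+ * delete-association callbacks.
+ */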
+static int
+_nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport,
+ uint64_t assoc_id, bool send_abts, bool backend_initiated,
+ spdk_nvmf_fc_del_assoc_cb del_assoc_cb,
+ void *cb_data, bool from_ls_rqst)
+{
+
+ struct nvmf_fc_ls_op_ctx *opd, *opd_tail = NULL, *opd_head = NULL;
+ struct spdk_nvmf_fc_delete_assoc_api_data *api_data;
+ struct spdk_nvmf_fc_conn *fc_conn;
+ struct spdk_nvmf_fc_association *assoc =
+ nvmf_fc_ls_find_assoc(tgtport, assoc_id);
+ struct spdk_nvmf_fc_port *fc_port = tgtport->fc_port;
+ enum spdk_nvmf_fc_object_state assoc_state;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Delete association, "
+ "assoc_id 0x%lx\n", assoc_id);
+
+ if (!assoc) {
+ SPDK_ERRLOG("Delete association failed: %s\n",
+ validation_errors[VERR_NO_ASSOC]);
+ return VERR_NO_ASSOC;
+ }
+
+ /* create a cb context to put on the association's list of
+ * callbacks to invoke when the association delete completes */
+ opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx));
+ if (!opd) {
+ SPDK_ERRLOG("Mem alloc failed for del assoc cb data");
+ return -ENOMEM;
+ }
+
+ api_data = &opd->u.del_assoc;
+ api_data->assoc = assoc;
+ api_data->from_ls_rqst = from_ls_rqst;
+ api_data->del_assoc_cb = del_assoc_cb;
+ api_data->del_assoc_cb_data = cb_data;
+ api_data->args.cb_info.cb_data = opd;
+ nvmf_fc_ls_append_del_cb_ctx(assoc, opd);
+
+ assoc_state = assoc->assoc_state;
+ if ((assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) &&
+ (fc_port->hw_port_status != SPDK_FC_PORT_QUIESCED)) {
+ /* association already being deleted */
+ return 0;
+ }
+
+ /* mark assoc. to be deleted */
+ assoc->assoc_state = SPDK_NVMF_FC_OBJECT_TO_BE_DELETED;
+
+ /* create a list of all connections to delete */
+ TAILQ_FOREACH(fc_conn, &assoc->fc_conns, assoc_link) {
+ opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx));
+ if (!opd) { /* hopefully this doesn't happen */
+ SPDK_ERRLOG("Mem alloc failed for del conn op data");
+ while (opd_head) { /* free any contexts already allocated */
+ opd = opd_head;
+ opd_head = opd->next_op_ctx;
+ free(opd);
+ }
+ return -ENOMEM;
+ }
+
+ api_data = &opd->u.del_assoc;
+ api_data->args.fc_conn = fc_conn;
+ api_data->assoc = assoc;
+ api_data->args.send_abts = send_abts;
+ api_data->args.backend_initiated = backend_initiated;
+ api_data->args.hwqp = nvmf_fc_get_hwqp_from_conn_id(
+ assoc->tgtport->fc_port->io_queues,
+ assoc->tgtport->fc_port->num_io_queues,
+ fc_conn->conn_id);
+ api_data->args.cb_info.cb_thread = spdk_get_thread();
+ if ((fc_port->hw_port_status == SPDK_FC_PORT_QUIESCED) &&
+ (assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED)) {
+ /*
+ * If any connection deletes or IO ABTSs are stuck
+ * because of a firmware reset, a second invocation of
+ * SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION will result in
+ * the outstanding connections & requests being killed and
+ * their corresponding callbacks being executed.
+ */
+ api_data->args.cb_info.cb_func = nvmf_fc_kill_io_del_all_conns_cb;
+ } else {
+ api_data->args.cb_info.cb_func = nvmf_fc_del_all_conns_cb;
+ }
+ api_data->args.cb_info.cb_data = opd;
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "conn_id = %lx\n", fc_conn->conn_id);
+
+ if (!opd_head) {
+ opd_head = opd;
+ } else {
+ opd_tail->next_op_ctx = opd;
+ }
+ opd_tail = opd;
+ }
+
+ /* make poller api calls to delete connections */
+ while (opd_head) {
+ opd = opd_head;
+ opd_head = opd->next_op_ctx;
+ api_data = &opd->u.del_assoc;
+ nvmf_fc_poller_api_func(api_data->args.hwqp,
+ SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION,
+ &api_data->args);
+ }
+
+ return 0;
+}
+
+static void
+nvmf_fc_ls_disconnect_assoc_cb(void *cb_data, uint32_t err)
+{
+ struct nvmf_fc_ls_op_ctx *opd = (struct nvmf_fc_ls_op_ctx *)cb_data;
+ struct spdk_nvmf_fc_ls_disconn_assoc_api_data *dp = &opd->u.disconn_assoc;
+ struct spdk_nvmf_fc_nport *tgtport = dp->tgtport;
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst = dp->ls_rqst;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Disconnect association callback begin "
+ "nport %d\n", tgtport->nport_hdl);
+ if (err != 0) {
+ /* send failure response */
+ struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst =
+ (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt;
+ struct spdk_nvmf_fc_ls_cr_assoc_acc *acc =
+ (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt;
+ ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc,
+ FCNVME_MAX_LS_BUFFER_SIZE,
+ rqst->w0.ls_cmd,
+ FCNVME_RJT_RC_UNAB,
+ FCNVME_RJT_EXP_NONE,
+ 0);
+ }
+
+ nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst);
+
+ free(opd);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Disconnect association callback complete "
+ "nport %d err %d\n", tgtport->nport_hdl, err);
+}
+
+static void
+nvmf_fc_ls_disconnect_assoc(struct spdk_nvmf_fc_nport *tgtport,
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst, uint64_t assoc_id)
+{
+ struct nvmf_fc_ls_op_ctx *opd;
+ struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst =
+ (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt;
+ struct spdk_nvmf_fc_ls_cr_assoc_acc *acc =
+ (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt;
+ struct spdk_nvmf_fc_ls_disconn_assoc_api_data *api_data;
+ int ret;
+ uint8_t reason = 0;
+
+ opd = calloc(1, sizeof(struct nvmf_fc_ls_op_ctx));
+ if (!opd) {
+ /* send failure response */
+ SPDK_ERRLOG("Allocate disconn assoc op data failed\n");
+ reason = FCNVME_RJT_RC_INSUFF_RES;
+ goto send_rjt;
+ }
+
+ api_data = &opd->u.disconn_assoc;
+ api_data->tgtport = tgtport;
+ api_data->ls_rqst = ls_rqst;
+ ret = _nvmf_fc_delete_association(tgtport, assoc_id,
+ false, false,
+ nvmf_fc_ls_disconnect_assoc_cb,
+ api_data, true);
+ if (!ret) {
+ return;
+ }
+
+ /* delete association failed */
+ switch (ret) {
+ case VERR_NO_ASSOC:
+ reason = FCNVME_RJT_RC_INV_ASSOC;
+ break;
+ case -ENOMEM:
+ reason = FCNVME_RJT_RC_INSUFF_RES;
+ break;
+ default:
+ reason = FCNVME_RJT_RC_LOGIC;
+ }
+
+ free(opd);
+
+send_rjt:
+ ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc,
+ FCNVME_MAX_LS_BUFFER_SIZE,
+ rqst->w0.ls_cmd, reason,
+ FCNVME_RJT_EXP_NONE, 0);
+ nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst);
+}
+
+static int
+nvmf_fc_ls_validate_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn)
+{
+
+ if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) {
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+/* **************************** */
+/* LS Request Handler Functions */
+
+static void
+nvmf_fc_ls_process_cass(uint32_t s_id,
+ struct spdk_nvmf_fc_nport *tgtport,
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst)
+{
+ struct spdk_nvmf_fc_ls_cr_assoc_rqst *rqst =
+ (struct spdk_nvmf_fc_ls_cr_assoc_rqst *)ls_rqst->rqstbuf.virt;
+ struct spdk_nvmf_fc_ls_cr_assoc_acc *acc =
+ (struct spdk_nvmf_fc_ls_cr_assoc_acc *)ls_rqst->rspbuf.virt;
+ struct spdk_nvmf_fc_association *assoc;
+ struct spdk_nvmf_fc_conn *fc_conn;
+ struct spdk_nvmf_subsystem *subsystem = NULL;
+ const char *hostnqn = (const char *)rqst->assoc_cmd.hostnqn;
+ int errmsg_ind = 0;
+ uint8_t rc = FCNVME_RJT_RC_NONE;
+ uint8_t ec = FCNVME_RJT_EXP_NONE;
+ struct spdk_nvmf_transport *transport = spdk_nvmf_tgt_get_transport(ls_rqst->nvmf_tgt,
+ SPDK_NVME_TRANSPORT_NAME_FC);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "LS_CASS: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d, sq_size=%d, "
+ "Subnqn: %s, Hostnqn: %s, Tgtport nn:%lx, pn:%lx\n",
+ ls_rqst->rqst_len, from_be32(&rqst->desc_list_len),
+ from_be32(&rqst->assoc_cmd.desc_len),
+ from_be32(&rqst->assoc_cmd.sqsize),
+ rqst->assoc_cmd.subnqn, hostnqn,
+ tgtport->fc_nodename.u.wwn, tgtport->fc_portname.u.wwn);
+
+ if (ls_rqst->rqst_len < FCNVME_LS_CA_CMD_MIN_LEN) {
+ SPDK_ERRLOG("assoc_cmd req len = %d, should be at least %d\n",
+ ls_rqst->rqst_len, FCNVME_LS_CA_CMD_MIN_LEN);
+ errmsg_ind = VERR_CR_ASSOC_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (from_be32(&rqst->desc_list_len) <
+ FCNVME_LS_CA_DESC_LIST_MIN_LEN) {
+ SPDK_ERRLOG("assoc_cmd desc list len = %d, should be at least %d\n",
+ from_be32(&rqst->desc_list_len),
+ FCNVME_LS_CA_DESC_LIST_MIN_LEN);
+ errmsg_ind = VERR_CR_ASSOC_RQST_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (rqst->assoc_cmd.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD)) {
+ errmsg_ind = VERR_CR_ASSOC_CMD;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ } else if (from_be32(&rqst->assoc_cmd.desc_len) <
+ FCNVME_LS_CA_DESC_MIN_LEN) {
+ SPDK_ERRLOG("assoc_cmd desc len = %d, should be at least %d\n",
+ from_be32(&rqst->assoc_cmd.desc_len),
+ FCNVME_LS_CA_DESC_MIN_LEN);
+ errmsg_ind = VERR_CR_ASSOC_CMD_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (!rqst->assoc_cmd.ersp_ratio ||
+ (from_be16(&rqst->assoc_cmd.ersp_ratio) >=
+ from_be16(&rqst->assoc_cmd.sqsize))) {
+ errmsg_ind = VERR_ERSP_RATIO;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_ESRP;
+ } else if (from_be16(&rqst->assoc_cmd.sqsize) == 0 ||
+ from_be16(&rqst->assoc_cmd.sqsize) > transport->opts.max_aq_depth) {
+ errmsg_ind = VERR_SQSIZE;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_SQ_SIZE;
+ }
+
+ if (rc != FCNVME_RJT_RC_NONE) {
+ goto rjt_cass;
+ }
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(ls_rqst->nvmf_tgt, rqst->assoc_cmd.subnqn);
+ if (subsystem == NULL) {
+ errmsg_ind = VERR_SUBNQN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_SUBNQN;
+ goto rjt_cass;
+ }
+
+ if (nvmf_fc_ls_validate_host(subsystem, hostnqn)) {
+ errmsg_ind = VERR_HOSTNQN;
+ rc = FCNVME_RJT_RC_INV_HOST;
+ ec = FCNVME_RJT_EXP_INV_HOSTNQN;
+ goto rjt_cass;
+ }
+
+ /* get new association */
+ assoc = nvmf_fc_ls_new_association(s_id, tgtport, ls_rqst->rport,
+ &rqst->assoc_cmd, subsystem,
+ ls_rqst->rpi, transport);
+ if (!assoc) {
+ errmsg_ind = VERR_ASSOC_ALLOC_FAIL;
+ rc = FCNVME_RJT_RC_INSUFF_RES;
+ ec = FCNVME_RJT_EXP_NONE;
+ goto rjt_cass;
+ }
+
+ /* alloc admin q (i.e. connection) */
+ fc_conn = nvmf_fc_ls_new_connection(assoc, 0,
+ from_be16(&rqst->assoc_cmd.ersp_ratio),
+ ls_rqst->rpi,
+ from_be16(&rqst->assoc_cmd.sqsize),
+ tgtport);
+ if (!fc_conn) {
+ nvmf_fc_ls_free_association(assoc);
+ errmsg_ind = VERR_CONN_ALLOC_FAIL;
+ rc = FCNVME_RJT_RC_INSUFF_RES;
+ ec = FCNVME_RJT_EXP_NONE;
+ goto rjt_cass;
+ }
+
+ /* format accept response */
+ bzero(acc, sizeof(*acc));
+ ls_rqst->rsp_len = sizeof(*acc);
+
+ nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC,
+ nvmf_fc_lsdesc_len(
+ sizeof(struct spdk_nvmf_fc_ls_cr_assoc_acc)),
+ FCNVME_LS_CREATE_ASSOCIATION);
+ to_be32(&acc->assoc_id.desc_tag, FCNVME_LSDESC_ASSOC_ID);
+ acc->assoc_id.desc_len =
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id));
+ to_be32(&acc->conn_id.desc_tag, FCNVME_LSDESC_CONN_ID);
+ acc->conn_id.desc_len =
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_conn_id));
+
+ /* assign connection to HWQP poller - also sends response */
+ nvmf_fc_ls_add_conn_to_poller(assoc, ls_rqst, fc_conn, true);
+
+ return;
+
+rjt_cass:
+ SPDK_ERRLOG("Create Association LS failed: %s\n", validation_errors[errmsg_ind]);
+ ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE,
+ rqst->w0.ls_cmd, rc, ec, 0);
+ nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst);
+}
+
+static void
+nvmf_fc_ls_process_cioc(struct spdk_nvmf_fc_nport *tgtport,
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst)
+{
+ struct spdk_nvmf_fc_ls_cr_conn_rqst *rqst =
+ (struct spdk_nvmf_fc_ls_cr_conn_rqst *)ls_rqst->rqstbuf.virt;
+ struct spdk_nvmf_fc_ls_cr_conn_acc *acc =
+ (struct spdk_nvmf_fc_ls_cr_conn_acc *)ls_rqst->rspbuf.virt;
+ struct spdk_nvmf_fc_association *assoc;
+ struct spdk_nvmf_fc_conn *fc_conn = NULL;
+ int errmsg_ind = 0;
+ uint8_t rc = FCNVME_RJT_RC_NONE;
+ uint8_t ec = FCNVME_RJT_EXP_NONE;
+ struct spdk_nvmf_transport *transport = spdk_nvmf_tgt_get_transport(ls_rqst->nvmf_tgt,
+ SPDK_NVME_TRANSPORT_NAME_FC);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "LS_CIOC: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d, "
+ "assoc_id=0x%lx, sq_size=%d, esrp=%d, Tgtport nn:%lx, pn:%lx\n",
+ ls_rqst->rqst_len, from_be32(&rqst->desc_list_len),
+ from_be32(&rqst->connect_cmd.desc_len),
+ from_be64(&rqst->assoc_id.association_id),
+ from_be32(&rqst->connect_cmd.sqsize),
+ from_be32(&rqst->connect_cmd.ersp_ratio),
+ tgtport->fc_nodename.u.wwn, tgtport->fc_portname.u.wwn);
+
+ if (ls_rqst->rqst_len < sizeof(struct spdk_nvmf_fc_ls_cr_conn_rqst)) {
+ errmsg_ind = VERR_CR_CONN_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (rqst->desc_list_len !=
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_cr_conn_rqst))) {
+ errmsg_ind = VERR_CR_CONN_RQST_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (rqst->assoc_id.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) {
+ errmsg_ind = VERR_ASSOC_ID;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ } else if (rqst->assoc_id.desc_len !=
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id))) {
+ errmsg_ind = VERR_ASSOC_ID_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (rqst->connect_cmd.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD)) {
+ errmsg_ind = VERR_CR_CONN_CMD;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ } else if (rqst->connect_cmd.desc_len !=
+ nvmf_fc_lsdesc_len(
+ sizeof(struct spdk_nvmf_fc_lsdesc_cr_conn_cmd))) {
+ errmsg_ind = VERR_CR_CONN_CMD_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (!rqst->connect_cmd.ersp_ratio ||
+ (from_be16(&rqst->connect_cmd.ersp_ratio) >=
+ from_be16(&rqst->connect_cmd.sqsize))) {
+ errmsg_ind = VERR_ERSP_RATIO;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_ESRP;
+ } else if (from_be16(&rqst->connect_cmd.sqsize) == 0 ||
+ from_be16(&rqst->connect_cmd.sqsize) > transport->opts.max_queue_depth) {
+ errmsg_ind = VERR_SQSIZE;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_SQ_SIZE;
+ }
+
+ if (rc != FCNVME_RJT_RC_NONE) {
+ goto rjt_cioc;
+ }
+
+ /* find association */
+ assoc = nvmf_fc_ls_find_assoc(tgtport,
+ from_be64(&rqst->assoc_id.association_id));
+ if (!assoc) {
+ errmsg_ind = VERR_NO_ASSOC;
+ rc = FCNVME_RJT_RC_INV_ASSOC;
+ } else if (assoc->assoc_state == SPDK_NVMF_FC_OBJECT_TO_BE_DELETED) {
+ /* association is being deleted - don't allow more connections */
+ errmsg_ind = VERR_NO_ASSOC;
+ rc = FCNVME_RJT_RC_INV_ASSOC;
+ } else if (assoc->conn_count >= transport->opts.max_qpairs_per_ctrlr) {
+ errmsg_ind = VERR_CONN_TOO_MANY;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_Q_ID;
+ }
+
+ if (rc != FCNVME_RJT_RC_NONE) {
+ goto rjt_cioc;
+ }
+
+ fc_conn = nvmf_fc_ls_new_connection(assoc, from_be16(&rqst->connect_cmd.qid),
+ from_be16(&rqst->connect_cmd.ersp_ratio),
+ ls_rqst->rpi,
+ from_be16(&rqst->connect_cmd.sqsize),
+ tgtport);
+ if (!fc_conn) {
+ errmsg_ind = VERR_CONN_ALLOC_FAIL;
+ rc = FCNVME_RJT_RC_INSUFF_RES;
+ ec = FCNVME_RJT_EXP_NONE;
+ goto rjt_cioc;
+ }
+
+ /* format accept response */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "Formatting LS accept response for "
+ "assoc_id 0x%lx conn_id 0x%lx\n", assoc->assoc_id,
+ fc_conn->conn_id);
+ bzero(acc, sizeof(*acc));
+ ls_rqst->rsp_len = sizeof(*acc);
+ nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC,
+ nvmf_fc_lsdesc_len(
+ sizeof(struct spdk_nvmf_fc_ls_cr_conn_acc)),
+ FCNVME_LS_CREATE_CONNECTION);
+ to_be32(&acc->conn_id.desc_tag, FCNVME_LSDESC_CONN_ID);
+ acc->conn_id.desc_len =
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_conn_id));
+
+ /* assign connection to HWQP poller - also sends response */
+ nvmf_fc_ls_add_conn_to_poller(assoc, ls_rqst, fc_conn, false);
+
+ return;
+
+rjt_cioc:
+ SPDK_ERRLOG("Create Connection LS failed: %s\n", validation_errors[errmsg_ind]);
+
+ ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE,
+ rqst->w0.ls_cmd, rc, ec, 0);
+ nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst);
+}
+
+static void
+nvmf_fc_ls_process_disc(struct spdk_nvmf_fc_nport *tgtport,
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst)
+{
+ struct spdk_nvmf_fc_ls_disconnect_rqst *rqst =
+ (struct spdk_nvmf_fc_ls_disconnect_rqst *)ls_rqst->rqstbuf.virt;
+ struct spdk_nvmf_fc_ls_disconnect_acc *acc =
+ (struct spdk_nvmf_fc_ls_disconnect_acc *)ls_rqst->rspbuf.virt;
+ struct spdk_nvmf_fc_association *assoc;
+ int errmsg_ind = 0;
+ uint8_t rc = FCNVME_RJT_RC_NONE;
+ uint8_t ec = FCNVME_RJT_EXP_NONE;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS,
+ "LS_DISC: ls_rqst_len=%d, desc_list_len=%d, cmd_len=%d,"
+ "assoc_id=0x%lx\n",
+ ls_rqst->rqst_len, from_be32(&rqst->desc_list_len),
+ from_be32(&rqst->disconn_cmd.desc_len),
+ from_be64(&rqst->assoc_id.association_id));
+
+ if (ls_rqst->rqst_len < sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst)) {
+ errmsg_ind = VERR_DISCONN_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (rqst->desc_list_len !=
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_ls_disconnect_rqst))) {
+ errmsg_ind = VERR_DISCONN_RQST_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (rqst->assoc_id.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_ASSOC_ID)) {
+ errmsg_ind = VERR_ASSOC_ID;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ } else if (rqst->assoc_id.desc_len !=
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_assoc_id))) {
+ errmsg_ind = VERR_ASSOC_ID_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ } else if (rqst->disconn_cmd.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_DISCONN_CMD)) {
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ errmsg_ind = VERR_DISCONN_CMD;
+ } else if (rqst->disconn_cmd.desc_len !=
+ nvmf_fc_lsdesc_len(sizeof(struct spdk_nvmf_fc_lsdesc_disconn_cmd))) {
+ errmsg_ind = VERR_DISCONN_CMD_LEN;
+ rc = FCNVME_RJT_RC_INV_PARAM;
+ ec = FCNVME_RJT_EXP_INV_LEN;
+ }
+
+ if (rc != FCNVME_RJT_RC_NONE) {
+ goto rjt_disc;
+ }
+
+ /* match an active association */
+ assoc = nvmf_fc_ls_find_assoc(tgtport,
+ from_be64(&rqst->assoc_id.association_id));
+ if (!assoc) {
+ errmsg_ind = VERR_NO_ASSOC;
+ rc = FCNVME_RJT_RC_INV_ASSOC;
+ goto rjt_disc;
+ }
+
+ /* format response */
+ bzero(acc, sizeof(*acc));
+ ls_rqst->rsp_len = sizeof(*acc);
+
+ nvmf_fc_ls_format_rsp_hdr(acc, FCNVME_LS_ACC,
+ nvmf_fc_lsdesc_len(
+ sizeof(struct spdk_nvmf_fc_ls_disconnect_acc)),
+ FCNVME_LS_DISCONNECT);
+
+ nvmf_fc_ls_disconnect_assoc(tgtport, ls_rqst, assoc->assoc_id);
+ return;
+
+rjt_disc:
+ SPDK_ERRLOG("Disconnect LS failed: %s\n", validation_errors[errmsg_ind]);
+ ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(acc, FCNVME_MAX_LS_BUFFER_SIZE,
+ rqst->w0.ls_cmd, rc, ec, 0);
+ nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst);
+}
+
+/* ************************ */
+/* external functions */
+
+void
+nvmf_fc_ls_init(struct spdk_nvmf_fc_port *fc_port)
+{
+}
+
+void
+nvmf_fc_ls_fini(struct spdk_nvmf_fc_port *fc_port)
+{
+}
+
+void
+nvmf_fc_handle_ls_rqst(struct spdk_nvmf_fc_ls_rqst *ls_rqst)
+{
+ struct spdk_nvmf_fc_ls_rqst_w0 *w0 =
+ (struct spdk_nvmf_fc_ls_rqst_w0 *)ls_rqst->rqstbuf.virt;
+ uint32_t s_id = ls_rqst->s_id;
+ struct spdk_nvmf_fc_nport *tgtport = ls_rqst->nport;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_LS, "LS cmd=%d\n", w0->ls_cmd);
+
+ switch (w0->ls_cmd) {
+ case FCNVME_LS_CREATE_ASSOCIATION:
+ nvmf_fc_ls_process_cass(s_id, tgtport, ls_rqst);
+ break;
+ case FCNVME_LS_CREATE_CONNECTION:
+ nvmf_fc_ls_process_cioc(tgtport, ls_rqst);
+ break;
+ case FCNVME_LS_DISCONNECT:
+ nvmf_fc_ls_process_disc(tgtport, ls_rqst);
+ break;
+ default:
+ SPDK_ERRLOG("Invalid LS cmd=%d\n", w0->ls_cmd);
+ ls_rqst->rsp_len = nvmf_fc_ls_format_rjt(ls_rqst->rspbuf.virt,
+ FCNVME_MAX_LS_BUFFER_SIZE, w0->ls_cmd,
+ FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0);
+ nvmf_fc_xmt_ls_rsp(tgtport, ls_rqst);
+ }
+}
+
+int
+nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport,
+ uint64_t assoc_id, bool send_abts, bool backend_initiated,
+ spdk_nvmf_fc_del_assoc_cb del_assoc_cb,
+ void *cb_data)
+{
+ return _nvmf_fc_delete_association(tgtport, assoc_id, send_abts, backend_initiated,
+ del_assoc_cb, cb_data, false);
+}
+
+static void
+nvmf_fc_poller_api_cb_event(void *arg)
+{
+ struct spdk_nvmf_fc_poller_api_cb_info *cb_info =
+ (struct spdk_nvmf_fc_poller_api_cb_info *) arg;
+
+ assert(cb_info != NULL);
+ cb_info->cb_func(cb_info->cb_data, cb_info->ret);
+}
+
+static void
+nvmf_fc_poller_api_perform_cb(struct spdk_nvmf_fc_poller_api_cb_info *cb_info,
+ enum spdk_nvmf_fc_poller_api_ret ret)
+{
+ if (cb_info->cb_func && cb_info->cb_thread) {
+ cb_info->ret = ret;
+ /* callback to master thread */
+ spdk_thread_send_msg(cb_info->cb_thread, nvmf_fc_poller_api_cb_event,
+ (void *) cb_info);
+ }
+}
+
+static void
+nvmf_fc_poller_api_add_connection(void *arg)
+{
+ enum spdk_nvmf_fc_poller_api_ret ret = SPDK_NVMF_FC_POLLER_API_SUCCESS;
+ struct spdk_nvmf_fc_poller_api_add_connection_args *conn_args =
+ (struct spdk_nvmf_fc_poller_api_add_connection_args *)arg;
+ struct spdk_nvmf_fc_conn *fc_conn;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Poller add connection, conn_id 0x%lx\n",
+ conn_args->fc_conn->conn_id);
+
+ /* make sure connection is not already in poller's list */
+ fc_conn = nvmf_fc_hwqp_find_fc_conn(conn_args->fc_conn->hwqp,
+ conn_args->fc_conn->conn_id);
+ if (fc_conn) {
+ SPDK_ERRLOG("duplicate connection found");
+ ret = SPDK_NVMF_FC_POLLER_API_DUP_CONN_ID;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API,
+ "conn_id=%lx", fc_conn->conn_id);
+ TAILQ_INSERT_TAIL(&conn_args->fc_conn->hwqp->connection_list,
+ conn_args->fc_conn, link);
+ }
+
+ /* perform callback */
+ nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, ret);
+}
+
+static void
+nvmf_fc_poller_api_quiesce_queue(void *arg)
+{
+ struct spdk_nvmf_fc_poller_api_quiesce_queue_args *q_args =
+ (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *) arg;
+ struct spdk_nvmf_fc_request *fc_req = NULL, *tmp;
+
+ /* the queue should already be quiesced, but make sure it is */
+ q_args->hwqp->state = SPDK_FC_HWQP_OFFLINE;
+
+ /*
+ * Kill all the outstanding commands that are in the transfer state and
+ * in the process of being aborted.
+ * We can run into this situation if an adapter reset happens when an I_T Nexus delete
+ * is in progress.
+ */
+ TAILQ_FOREACH_SAFE(fc_req, &q_args->hwqp->in_use_reqs, link, tmp) {
+ if (nvmf_fc_req_in_xfer(fc_req) && fc_req->is_aborted == true) {
+ nvmf_fc_poller_api_func(q_args->hwqp, SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE,
+ (void *)fc_req);
+ }
+ }
+
+ /* perform callback */
+ nvmf_fc_poller_api_perform_cb(&q_args->cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS);
+}
+
+static void
+nvmf_fc_poller_api_activate_queue(void *arg)
+{
+ struct spdk_nvmf_fc_poller_api_quiesce_queue_args *q_args =
+ (struct spdk_nvmf_fc_poller_api_quiesce_queue_args *) arg;
+
+ q_args->hwqp->state = SPDK_FC_HWQP_ONLINE;
+
+ /* perform callback */
+ nvmf_fc_poller_api_perform_cb(&q_args->cb_info, 0);
+}
+
+static void
+nvmf_fc_disconnect_qpair_cb(void *ctx)
+{
+ struct spdk_nvmf_fc_poller_api_cb_info *cb_info = ctx;
+ /* perform callback */
+ nvmf_fc_poller_api_perform_cb(cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS);
+}
+
+static void
+nvmf_fc_poller_conn_abort_done(void *hwqp, int32_t status, void *cb_args)
+{
+ struct spdk_nvmf_fc_poller_api_del_connection_args *conn_args = cb_args;
+
+ if (conn_args->fc_request_cnt) {
+ conn_args->fc_request_cnt -= 1;
+ }
+
+ if (!conn_args->fc_request_cnt) {
+ if (!TAILQ_EMPTY(&conn_args->hwqp->connection_list)) {
+ /* All the requests for this connection are aborted. */
+ TAILQ_REMOVE(&conn_args->hwqp->connection_list, conn_args->fc_conn, link);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Connection deleted, conn_id 0x%lx\n",
+ conn_args->fc_conn->conn_id);
+
+ if (!conn_args->backend_initiated) {
+ /* disconnect qpair from nvmf controller */
+ spdk_nvmf_qpair_disconnect(&conn_args->fc_conn->qpair,
+ nvmf_fc_disconnect_qpair_cb, &conn_args->cb_info);
+ }
+ } else {
+ /*
+ * Duplicate connection delete can happen if one is
+ * coming in via an association disconnect and the other
+ * is initiated by a port reset.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Duplicate conn delete.");
+ /* perform callback */
+ nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, SPDK_NVMF_FC_POLLER_API_SUCCESS);
+ }
+ }
+}
+
+static void
+nvmf_fc_poller_api_del_connection(void *arg)
+{
+ struct spdk_nvmf_fc_poller_api_del_connection_args *conn_args =
+ (struct spdk_nvmf_fc_poller_api_del_connection_args *)arg;
+ struct spdk_nvmf_fc_conn *fc_conn;
+ struct spdk_nvmf_fc_request *fc_req = NULL, *tmp;
+ struct spdk_nvmf_fc_hwqp *hwqp = conn_args->hwqp;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Poller delete connection, conn_id 0x%lx\n",
+ conn_args->fc_conn->conn_id);
+
+ /* find the connection in poller's list */
+ fc_conn = nvmf_fc_hwqp_find_fc_conn(hwqp, conn_args->fc_conn->conn_id);
+ if (!fc_conn) {
+ /* perform callback */
+ nvmf_fc_poller_api_perform_cb(&conn_args->cb_info, SPDK_NVMF_FC_POLLER_API_NO_CONN_ID);
+ return;
+ }
+
+ conn_args->fc_request_cnt = 0;
+
+ TAILQ_FOREACH_SAFE(fc_req, &hwqp->in_use_reqs, link, tmp) {
+ if (fc_req->fc_conn->conn_id == fc_conn->conn_id) {
+ if (nvmf_qpair_is_admin_queue(&fc_conn->qpair) &&
+ (fc_req->req.cmd->nvme_cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST)) {
+ /* AER will be cleaned by spdk_nvmf_qpair_disconnect. */
+ continue;
+ }
+
+ conn_args->fc_request_cnt += 1;
+ nvmf_fc_request_abort(fc_req, conn_args->send_abts,
+ nvmf_fc_poller_conn_abort_done,
+ conn_args);
+ }
+ }
+
+ if (!conn_args->fc_request_cnt) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API, "Connection deleted.\n");
+ TAILQ_REMOVE(&hwqp->connection_list, fc_conn, link);
+
+ if (!conn_args->backend_initiated) {
+ /* disconnect qpair from nvmf controller */
+ spdk_nvmf_qpair_disconnect(&fc_conn->qpair, nvmf_fc_disconnect_qpair_cb,
+ &conn_args->cb_info);
+ }
+ }
+}
+
+static void
+nvmf_fc_poller_abts_done(void *hwqp, int32_t status, void *cb_args)
+{
+ struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = cb_args;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API,
+ "ABTS poller done, rpi: 0x%x, oxid: 0x%x, rxid: 0x%x\n",
+ args->ctx->rpi, args->ctx->oxid, args->ctx->rxid);
+
+ nvmf_fc_poller_api_perform_cb(&args->cb_info,
+ SPDK_NVMF_FC_POLLER_API_SUCCESS);
+}
+
+static void
+nvmf_fc_poller_api_abts_received(void *arg)
+{
+ struct spdk_nvmf_fc_poller_api_abts_recvd_args *args = arg;
+ struct spdk_nvmf_fc_request *fc_req = NULL;
+ struct spdk_nvmf_fc_hwqp *hwqp = args->hwqp;
+
+ TAILQ_FOREACH(fc_req, &hwqp->in_use_reqs, link) {
+ if ((fc_req->rpi == args->ctx->rpi) &&
+ (fc_req->oxid == args->ctx->oxid)) {
+ nvmf_fc_request_abort(fc_req, false,
+ nvmf_fc_poller_abts_done, args);
+ return;
+ }
+ }
+
+ nvmf_fc_poller_api_perform_cb(&args->cb_info,
+ SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND);
+}
+
+static void
+nvmf_fc_poller_api_queue_sync(void *arg)
+{
+ struct spdk_nvmf_fc_poller_api_queue_sync_args *args = arg;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API,
+ "HWQP sync requested for u_id = 0x%lx\n", args->u_id);
+
+ /* Add this args to hwqp sync_cb list */
+ TAILQ_INSERT_TAIL(&args->hwqp->sync_cbs, args, link);
+}
+
+static void
+nvmf_fc_poller_api_queue_sync_done(void *arg)
+{
+ struct spdk_nvmf_fc_poller_api_queue_sync_done_args *args = arg;
+ struct spdk_nvmf_fc_hwqp *hwqp = args->hwqp;
+ uint64_t tag = args->tag;
+ struct spdk_nvmf_fc_poller_api_queue_sync_args *sync_args = NULL, *tmp = NULL;
+
+ assert(args != NULL);
+
+ TAILQ_FOREACH_SAFE(sync_args, &hwqp->sync_cbs, link, tmp) {
+ if (sync_args->u_id == tag) {
+ /* Queue successfully synced. Remove from cb list */
+ TAILQ_REMOVE(&hwqp->sync_cbs, sync_args, link);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_FC_POLLER_API,
+ "HWQP sync done for u_id = 0x%lx\n", sync_args->u_id);
+
+ /* Return the status to poller */
+ nvmf_fc_poller_api_perform_cb(&sync_args->cb_info,
+ SPDK_NVMF_FC_POLLER_API_SUCCESS);
+ /* free the done args in the found path as well to avoid leaking them */
+ free(arg);
+ return;
+ }
+ }
+
+ free(arg);
+ /* note: no callback from this api */
+}
+
+static void
+nvmf_fc_poller_api_add_hwqp(void *arg)
+{
+ struct spdk_nvmf_fc_hwqp *hwqp = (struct spdk_nvmf_fc_hwqp *)arg;
+
+ hwqp->lcore_id = spdk_env_get_current_core(); /* for tracing purposes only */
+ TAILQ_INSERT_TAIL(&hwqp->fgroup->hwqp_list, hwqp, link);
+ /* note: no callback from this api */
+}
+
+static void
+nvmf_fc_poller_api_remove_hwqp(void *arg)
+{
+ struct spdk_nvmf_fc_hwqp *hwqp = (struct spdk_nvmf_fc_hwqp *)arg;
+ struct spdk_nvmf_fc_poll_group *fgroup = hwqp->fgroup;
+
+ TAILQ_REMOVE(&fgroup->hwqp_list, hwqp, link);
+ hwqp->fgroup = NULL;
+ /* note: no callback from this api */
+}
+
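+/*
+ * Poller API dispatcher: each request is forwarded to the hwqp's thread with
+ * spdk_thread_send_msg(), and the handler reports completion back to the
+ * caller's thread through cb_info (see nvmf_fc_poller_api_perform_cb above).
+ */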
+enum spdk_nvmf_fc_poller_api_ret
+nvmf_fc_poller_api_func(struct spdk_nvmf_fc_hwqp *hwqp, enum spdk_nvmf_fc_poller_api api,
+ void *api_args) {
+ switch (api) {
+ case SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION:
+ spdk_thread_send_msg(hwqp->thread,
+ nvmf_fc_poller_api_add_connection, api_args);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION:
+ spdk_thread_send_msg(hwqp->thread,
+ nvmf_fc_poller_api_del_connection, api_args);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE:
+ /* quiesce q polling now, don't wait for poller to do it */
+ hwqp->state = SPDK_FC_HWQP_OFFLINE;
+ spdk_thread_send_msg(hwqp->thread,
+ nvmf_fc_poller_api_quiesce_queue, api_args);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_ACTIVATE_QUEUE:
+ spdk_thread_send_msg(hwqp->thread,
+ nvmf_fc_poller_api_activate_queue, api_args);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED:
+ spdk_thread_send_msg(hwqp->thread,
+ nvmf_fc_poller_api_abts_received, api_args);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE:
+ spdk_thread_send_msg(hwqp->thread,
+ nvmf_fc_request_abort_complete, api_args);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC:
+ spdk_thread_send_msg(hwqp->thread,
+ nvmf_fc_poller_api_queue_sync, api_args);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC_DONE:
+ spdk_thread_send_msg(hwqp->thread,
+ nvmf_fc_poller_api_queue_sync_done, api_args);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_ADD_HWQP:
+ spdk_thread_send_msg(hwqp->thread, nvmf_fc_poller_api_add_hwqp, (void *) hwqp);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP:
+ spdk_thread_send_msg(hwqp->thread, nvmf_fc_poller_api_remove_hwqp, (void *) hwqp);
+ break;
+
+ case SPDK_NVMF_FC_POLLER_API_ADAPTER_EVENT:
+ case SPDK_NVMF_FC_POLLER_API_AEN:
+ default:
+ SPDK_ERRLOG("BAD ARG!");
+ return SPDK_NVMF_FC_POLLER_API_INVALID_ARG;
+ }
+
+ return SPDK_NVMF_FC_POLLER_API_SUCCESS;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_poller_api", SPDK_LOG_NVMF_FC_POLLER_API)
+SPDK_LOG_REGISTER_COMPONENT("nvmf_fc_ls", SPDK_LOG_NVMF_FC_LS)
diff --git a/src/spdk/lib/nvmf/nvmf.c b/src/spdk/lib/nvmf/nvmf.c
new file mode 100644
index 000000000..73fa0742e
--- /dev/null
+++ b/src/spdk/lib/nvmf/nvmf.c
@@ -0,0 +1,1457 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/bit_array.h"
+#include "spdk/conf.h"
+#include "spdk/thread.h"
+#include "spdk/nvmf.h"
+#include "spdk/trace.h"
+#include "spdk/endian.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+#include "nvmf_internal.h"
+#include "transport.h"
+
+SPDK_LOG_REGISTER_COMPONENT("nvmf", SPDK_LOG_NVMF)
+
+#define SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS 1024
+
+static TAILQ_HEAD(, spdk_nvmf_tgt) g_nvmf_tgts = TAILQ_HEAD_INITIALIZER(g_nvmf_tgts);
+
+typedef void (*nvmf_qpair_disconnect_cpl)(void *ctx, int status);
+static void nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf);
+
+/* supplied to a single call to nvmf_qpair_disconnect */
+struct nvmf_qpair_disconnect_ctx {
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_ctrlr *ctrlr;
+ nvmf_qpair_disconnect_cb cb_fn;
+ struct spdk_thread *thread;
+ void *ctx;
+ uint16_t qid;
+};
+
+/*
+ * There are several cases where we need to iterate through the list of all
+ * qpairs and selectively delete them. To do this sequentially and without
+ * overlap, we need a context from which to recover the next qpair, so that
+ * nvmf_qpair_disconnect can be called on each desired qpair in turn.
+ */
+struct nvmf_qpair_disconnect_many_ctx {
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_poll_group *group;
+ spdk_nvmf_poll_group_mod_done cpl_fn;
+ void *cpl_ctx;
+};
+
+static void
+nvmf_qpair_set_state(struct spdk_nvmf_qpair *qpair,
+ enum spdk_nvmf_qpair_state state)
+{
+ assert(qpair != NULL);
+ assert(qpair->group->thread == spdk_get_thread());
+
+ qpair->state = state;
+}
+
+static int
+nvmf_poll_group_poll(void *ctx)
+{
+ struct spdk_nvmf_poll_group *group = ctx;
+ int rc;
+ int count = 0;
+ struct spdk_nvmf_transport_poll_group *tgroup;
+
+ TAILQ_FOREACH(tgroup, &group->tgroups, link) {
+ rc = nvmf_transport_poll_group_poll(tgroup);
+ if (rc < 0) {
+ return SPDK_POLLER_BUSY;
+ }
+ count += rc;
+ }
+
+ return count > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
+}
+
+static int
+nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf)
+{
+ struct spdk_nvmf_tgt *tgt = io_device;
+ struct spdk_nvmf_poll_group *group = ctx_buf;
+ struct spdk_nvmf_transport *transport;
+ uint32_t sid;
+
+ TAILQ_INIT(&group->tgroups);
+ TAILQ_INIT(&group->qpairs);
+
+ TAILQ_FOREACH(transport, &tgt->transports, link) {
+ nvmf_poll_group_add_transport(group, transport);
+ }
+
+ group->num_sgroups = tgt->max_subsystems;
+ group->sgroups = calloc(tgt->max_subsystems, sizeof(struct spdk_nvmf_subsystem_poll_group));
+ if (!group->sgroups) {
+ return -ENOMEM;
+ }
+
+ for (sid = 0; sid < tgt->max_subsystems; sid++) {
+ struct spdk_nvmf_subsystem *subsystem;
+
+ subsystem = tgt->subsystems[sid];
+ if (!subsystem) {
+ continue;
+ }
+
+ if (nvmf_poll_group_add_subsystem(group, subsystem, NULL, NULL) != 0) {
+ nvmf_tgt_destroy_poll_group(io_device, ctx_buf);
+ return -1;
+ }
+ }
+
+ pthread_mutex_lock(&tgt->mutex);
+ TAILQ_INSERT_TAIL(&tgt->poll_groups, group, link);
+ pthread_mutex_unlock(&tgt->mutex);
+
+ group->poller = SPDK_POLLER_REGISTER(nvmf_poll_group_poll, group, 0);
+ group->thread = spdk_get_thread();
+
+ return 0;
+}
+
+static void
+nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf)
+{
+ struct spdk_nvmf_tgt *tgt = io_device;
+ struct spdk_nvmf_poll_group *group = ctx_buf;
+ struct spdk_nvmf_transport_poll_group *tgroup, *tmp;
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+ uint32_t sid, nsid;
+
+ pthread_mutex_lock(&tgt->mutex);
+ TAILQ_REMOVE(&tgt->poll_groups, group, link);
+ pthread_mutex_unlock(&tgt->mutex);
+
+ TAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp) {
+ TAILQ_REMOVE(&group->tgroups, tgroup, link);
+ nvmf_transport_poll_group_destroy(tgroup);
+ }
+
+ for (sid = 0; sid < group->num_sgroups; sid++) {
+ sgroup = &group->sgroups[sid];
+
+ for (nsid = 0; nsid < sgroup->num_ns; nsid++) {
+ if (sgroup->ns_info[nsid].channel) {
+ spdk_put_io_channel(sgroup->ns_info[nsid].channel);
+ sgroup->ns_info[nsid].channel = NULL;
+ }
+ }
+
+ free(sgroup->ns_info);
+ }
+
+ free(group->sgroups);
+
+ if (group->destroy_cb_fn) {
+ group->destroy_cb_fn(group->destroy_cb_arg, 0);
+ }
+}
+
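+/*
+ * Disconnects the group's qpairs one at a time: each
+ * spdk_nvmf_qpair_disconnect() completion re-enters this function until the
+ * qpair list is empty, at which point the poll group's io_channel reference
+ * is dropped and nvmf_tgt_destroy_poll_group will eventually run.
+ */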
+static void
+_nvmf_tgt_disconnect_next_qpair(void *ctx)
+{
+ struct spdk_nvmf_qpair *qpair;
+ struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx;
+ struct spdk_nvmf_poll_group *group = qpair_ctx->group;
+ struct spdk_io_channel *ch;
+ int rc = 0;
+
+ qpair = TAILQ_FIRST(&group->qpairs);
+
+ if (qpair) {
+ rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_tgt_disconnect_next_qpair, ctx);
+ }
+
+ if (!qpair || rc != 0) {
+ /* When the refcount from the channels reaches 0, nvmf_tgt_destroy_poll_group will be called. */
+ ch = spdk_io_channel_from_ctx(group);
+ spdk_put_io_channel(ch);
+ free(qpair_ctx);
+ }
+}
+
+static void
+nvmf_tgt_destroy_poll_group_qpairs(struct spdk_nvmf_poll_group *group)
+{
+ struct nvmf_qpair_disconnect_many_ctx *ctx;
+
+ ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx));
+
+ if (!ctx) {
+ SPDK_ERRLOG("Failed to allocate memory for destroy poll group ctx\n");
+ return;
+ }
+
+ spdk_poller_unregister(&group->poller);
+
+ ctx->group = group;
+ _nvmf_tgt_disconnect_next_qpair(ctx);
+}
+
+struct spdk_nvmf_tgt *
+spdk_nvmf_tgt_create(struct spdk_nvmf_target_opts *opts)
+{
+ struct spdk_nvmf_tgt *tgt, *tmp_tgt;
+
+ if (strnlen(opts->name, NVMF_TGT_NAME_MAX_LENGTH) == NVMF_TGT_NAME_MAX_LENGTH) {
+ SPDK_ERRLOG("Provided target name exceeds the max length of %u.\n", NVMF_TGT_NAME_MAX_LENGTH);
+ return NULL;
+ }
+
+ TAILQ_FOREACH(tmp_tgt, &g_nvmf_tgts, link) {
+ if (!strncmp(opts->name, tmp_tgt->name, NVMF_TGT_NAME_MAX_LENGTH)) {
+ SPDK_ERRLOG("Provided target name must be unique.\n");
+ return NULL;
+ }
+ }
+
+ tgt = calloc(1, sizeof(*tgt));
+ if (!tgt) {
+ return NULL;
+ }
+
+ snprintf(tgt->name, NVMF_TGT_NAME_MAX_LENGTH, "%s", opts->name);
+
+ if (!opts || !opts->max_subsystems) {
+ tgt->max_subsystems = SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS;
+ } else {
+ tgt->max_subsystems = opts->max_subsystems;
+ }
+
+ tgt->discovery_genctr = 0;
+ TAILQ_INIT(&tgt->transports);
+ TAILQ_INIT(&tgt->poll_groups);
+
+ tgt->subsystems = calloc(tgt->max_subsystems, sizeof(struct spdk_nvmf_subsystem *));
+ if (!tgt->subsystems) {
+ free(tgt);
+ return NULL;
+ }
+
+ pthread_mutex_init(&tgt->mutex, NULL);
+
+ TAILQ_INSERT_HEAD(&g_nvmf_tgts, tgt, link);
+
+ spdk_io_device_register(tgt,
+ nvmf_tgt_create_poll_group,
+ nvmf_tgt_destroy_poll_group,
+ sizeof(struct spdk_nvmf_poll_group),
+ tgt->name);
+
+ return tgt;
+}
+
+static void
+nvmf_tgt_destroy_cb(void *io_device)
+{
+ struct spdk_nvmf_tgt *tgt = io_device;
+ struct spdk_nvmf_transport *transport, *transport_tmp;
+ spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn;
+ void *destroy_cb_arg;
+ uint32_t i;
+
+ if (tgt->subsystems) {
+ for (i = 0; i < tgt->max_subsystems; i++) {
+ if (tgt->subsystems[i]) {
+ nvmf_subsystem_remove_all_listeners(tgt->subsystems[i], true);
+ spdk_nvmf_subsystem_destroy(tgt->subsystems[i]);
+ }
+ }
+ free(tgt->subsystems);
+ }
+
+ TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, transport_tmp) {
+ TAILQ_REMOVE(&tgt->transports, transport, link);
+ spdk_nvmf_transport_destroy(transport);
+ }
+
+ destroy_cb_fn = tgt->destroy_cb_fn;
+ destroy_cb_arg = tgt->destroy_cb_arg;
+
+ free(tgt);
+
+ if (destroy_cb_fn) {
+ destroy_cb_fn(destroy_cb_arg, 0);
+ }
+}
+
+void
+spdk_nvmf_tgt_destroy(struct spdk_nvmf_tgt *tgt,
+ spdk_nvmf_tgt_destroy_done_fn cb_fn,
+ void *cb_arg)
+{
+ tgt->destroy_cb_fn = cb_fn;
+ tgt->destroy_cb_arg = cb_arg;
+
+ TAILQ_REMOVE(&g_nvmf_tgts, tgt, link);
+
+ spdk_io_device_unregister(tgt, nvmf_tgt_destroy_cb);
+}
+
+const char *
+spdk_nvmf_tgt_get_name(struct spdk_nvmf_tgt *tgt)
+{
+ return tgt->name;
+}
+
+struct spdk_nvmf_tgt *
+spdk_nvmf_get_tgt(const char *name)
+{
+ struct spdk_nvmf_tgt *tgt;
+ uint32_t num_targets = 0;
+
+ TAILQ_FOREACH(tgt, &g_nvmf_tgts, link) {
+ if (name) {
+ if (!strncmp(tgt->name, name, NVMF_TGT_NAME_MAX_LENGTH)) {
+ return tgt;
+ }
+ }
+ num_targets++;
+ }
+
+ /*
+ * special case. If there is only one target and
+ * no name was specified, return the only available
+ * target. If there is more than one target, name must
+ * be specified.
+ */
+ if (!name && num_targets == 1) {
+ return TAILQ_FIRST(&g_nvmf_tgts);
+ }
+
+ return NULL;
+}
+
+struct spdk_nvmf_tgt *
+spdk_nvmf_get_first_tgt(void)
+{
+ return TAILQ_FIRST(&g_nvmf_tgts);
+}
+
+struct spdk_nvmf_tgt *
+spdk_nvmf_get_next_tgt(struct spdk_nvmf_tgt *prev)
+{
+ return TAILQ_NEXT(prev, link);
+}
+
+static void
+nvmf_write_subsystem_config_json(struct spdk_json_write_ctx *w,
+ struct spdk_nvmf_subsystem *subsystem)
+{
+ struct spdk_nvmf_host *host;
+ struct spdk_nvmf_subsystem_listener *listener;
+ const struct spdk_nvme_transport_id *trid;
+ struct spdk_nvmf_ns *ns;
+ struct spdk_nvmf_ns_opts ns_opts;
+ uint32_t max_namespaces;
+ char uuid_str[SPDK_UUID_STRING_LEN];
+ const char *adrfam;
+
+ if (spdk_nvmf_subsystem_get_type(subsystem) != SPDK_NVMF_SUBTYPE_NVME) {
+ return;
+ }
+
+ /* { */
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "nvmf_create_subsystem");
+
+ /* "params" : { */
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem));
+ spdk_json_write_named_bool(w, "allow_any_host", spdk_nvmf_subsystem_get_allow_any_host(subsystem));
+ spdk_json_write_named_string(w, "serial_number", spdk_nvmf_subsystem_get_sn(subsystem));
+ spdk_json_write_named_string(w, "model_number", spdk_nvmf_subsystem_get_mn(subsystem));
+
+ max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem);
+ if (max_namespaces != 0) {
+ spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces);
+ }
+
+ /* } "params" */
+ spdk_json_write_object_end(w);
+
+ /* } */
+ spdk_json_write_object_end(w);
+
+ for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL;
+ listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) {
+ trid = spdk_nvmf_subsystem_listener_get_trid(listener);
+
+ adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_listener");
+
+ /* "params" : { */
+ spdk_json_write_named_object_begin(w, "params");
+
+ spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem));
+
+ /* "listen_address" : { */
+ spdk_json_write_named_object_begin(w, "listen_address");
+
+ spdk_json_write_named_string(w, "trtype", trid->trstring);
+ if (adrfam) {
+ spdk_json_write_named_string(w, "adrfam", adrfam);
+ }
+
+ spdk_json_write_named_string(w, "traddr", trid->traddr);
+ spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
+ /* } "listen_address" */
+ spdk_json_write_object_end(w);
+
+ /* } "params" */
+ spdk_json_write_object_end(w);
+
+ /* } */
+ spdk_json_write_object_end(w);
+ }
+
+ for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL;
+ host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) {
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_host");
+
+ /* "params" : { */
+ spdk_json_write_named_object_begin(w, "params");
+
+ spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem));
+ spdk_json_write_named_string(w, "host", spdk_nvmf_host_get_nqn(host));
+
+ /* } "params" */
+ spdk_json_write_object_end(w);
+
+ /* } */
+ spdk_json_write_object_end(w);
+ }
+
+ for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL;
+ ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) {
+ spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts));
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_ns");
+
+ /* "params" : { */
+ spdk_json_write_named_object_begin(w, "params");
+
+ spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem));
+
+ /* "namespace" : { */
+ spdk_json_write_named_object_begin(w, "namespace");
+
+ spdk_json_write_named_uint32(w, "nsid", spdk_nvmf_ns_get_id(ns));
+ spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns)));
+
+ if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) {
+ SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(uint64_t) * 2, "size mismatch");
+ spdk_json_write_named_string_fmt(w, "nguid", "%016"PRIX64"%016"PRIX64, from_be64(&ns_opts.nguid[0]),
+ from_be64(&ns_opts.nguid[8]));
+ }
+
+ if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) {
+ SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(uint64_t), "size mismatch");
+ spdk_json_write_named_string_fmt(w, "eui64", "%016"PRIX64, from_be64(&ns_opts.eui64));
+ }
+
+ if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) {
+ spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid);
+ spdk_json_write_named_string(w, "uuid", uuid_str);
+ }
+
+ /* "namespace" */
+ spdk_json_write_object_end(w);
+
+ /* } "params" */
+ spdk_json_write_object_end(w);
+
+ /* } */
+ spdk_json_write_object_end(w);
+ }
+}
+
+void
+spdk_nvmf_tgt_write_config_json(struct spdk_json_write_ctx *w, struct spdk_nvmf_tgt *tgt)
+{
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_transport *transport;
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "nvmf_set_max_subsystems");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_uint32(w, "max_subsystems", tgt->max_subsystems);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+
+ /* write transports */
+ TAILQ_FOREACH(transport, &tgt->transports, link) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "nvmf_create_transport");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "trtype", spdk_nvme_transport_id_trtype_str(transport->ops->type));
+ spdk_json_write_named_uint32(w, "max_queue_depth", transport->opts.max_queue_depth);
+ spdk_json_write_named_uint32(w, "max_io_qpairs_per_ctrlr",
+ transport->opts.max_qpairs_per_ctrlr - 1);
+ spdk_json_write_named_uint32(w, "in_capsule_data_size", transport->opts.in_capsule_data_size);
+ spdk_json_write_named_uint32(w, "max_io_size", transport->opts.max_io_size);
+ spdk_json_write_named_uint32(w, "io_unit_size", transport->opts.io_unit_size);
+ spdk_json_write_named_uint32(w, "max_aq_depth", transport->opts.max_aq_depth);
+ if (transport->ops->type == SPDK_NVME_TRANSPORT_RDMA) {
+ spdk_json_write_named_uint32(w, "max_srq_depth", transport->opts.max_srq_depth);
+ }
+ spdk_json_write_named_uint32(w, "abort_timeout_sec", transport->opts.abort_timeout_sec);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+
+ subsystem = spdk_nvmf_subsystem_get_first(tgt);
+ while (subsystem) {
+ nvmf_write_subsystem_config_json(w, subsystem);
+ subsystem = spdk_nvmf_subsystem_get_next(subsystem);
+ }
+}
+
+int
+spdk_nvmf_tgt_listen(struct spdk_nvmf_tgt *tgt,
+ struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_transport *transport;
+ const char *trtype;
+ int rc;
+
+ transport = spdk_nvmf_tgt_get_transport(tgt, trid->trstring);
+ if (!transport) {
+ trtype = spdk_nvme_transport_id_trtype_str(trid->trtype);
+ if (trtype != NULL) {
+ SPDK_ERRLOG("Unable to listen on transport %s. The transport must be created first.\n", trtype);
+ } else {
+ SPDK_ERRLOG("The specified trtype %d is unknown. Please make sure that it is properly registered.\n",
+ trid->trtype);
+ }
+
+ return -EINVAL;
+ }
+
+ rc = spdk_nvmf_transport_listen(transport, trid);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to listen on address '%s'\n", trid->traddr);
+ }
+
+ return rc;
+}
+
+int
+spdk_nvmf_tgt_stop_listen(struct spdk_nvmf_tgt *tgt,
+ struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_transport *transport;
+ const char *trtype;
+ int rc;
+
+ transport = spdk_nvmf_tgt_get_transport(tgt, trid->trstring);
+ if (!transport) {
+ trtype = spdk_nvme_transport_id_trtype_str(trid->trtype);
+ if (trtype != NULL) {
+ SPDK_ERRLOG("Unable to stop listen on transport %s. The transport must be created first.\n",
+ trtype);
+ } else {
+ SPDK_ERRLOG("The specified trtype %d is unknown. Please make sure that it is properly registered.\n",
+ trid->trtype);
+ }
+ return -EINVAL;
+ }
+
+ rc = spdk_nvmf_transport_stop_listen(transport, trid);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to stop listening on address '%s'\n", trid->traddr);
+ return rc;
+ }
+ return 0;
+}
+
+struct spdk_nvmf_tgt_add_transport_ctx {
+ struct spdk_nvmf_tgt *tgt;
+ struct spdk_nvmf_transport *transport;
+ spdk_nvmf_tgt_add_transport_done_fn cb_fn;
+ void *cb_arg;
+};
+
+static void
+_nvmf_tgt_add_transport_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ ctx->cb_fn(ctx->cb_arg, status);
+
+ free(ctx);
+}
+
+static void
+_nvmf_tgt_add_transport(struct spdk_io_channel_iter *i)
+{
+ struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
+ struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch);
+ int rc;
+
+ rc = nvmf_poll_group_add_transport(group, ctx->transport);
+ spdk_for_each_channel_continue(i, rc);
+}
+
+void
+spdk_nvmf_tgt_add_transport(struct spdk_nvmf_tgt *tgt,
+ struct spdk_nvmf_transport *transport,
+ spdk_nvmf_tgt_add_transport_done_fn cb_fn,
+ void *cb_arg)
+{
+ struct spdk_nvmf_tgt_add_transport_ctx *ctx;
+
+ if (spdk_nvmf_tgt_get_transport(tgt, transport->ops->name)) {
+ cb_fn(cb_arg, -EEXIST);
+ return; /* transport already created */
+ }
+
+ transport->tgt = tgt;
+ TAILQ_INSERT_TAIL(&tgt->transports, transport, link);
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ ctx->tgt = tgt;
+ ctx->transport = transport;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ spdk_for_each_channel(tgt,
+ _nvmf_tgt_add_transport,
+ ctx,
+ _nvmf_tgt_add_transport_done);
+}
+
+struct spdk_nvmf_subsystem *
+spdk_nvmf_tgt_find_subsystem(struct spdk_nvmf_tgt *tgt, const char *subnqn)
+{
+ struct spdk_nvmf_subsystem *subsystem;
+ uint32_t sid;
+
+ if (!subnqn) {
+ return NULL;
+ }
+
+ /* Ensure that subnqn is null terminated */
+ if (!memchr(subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1)) {
+ SPDK_ERRLOG("Connect SUBNQN is not null terminated\n");
+ return NULL;
+ }
+
+ for (sid = 0; sid < tgt->max_subsystems; sid++) {
+ subsystem = tgt->subsystems[sid];
+ if (subsystem == NULL) {
+ continue;
+ }
+
+ if (strcmp(subnqn, subsystem->subnqn) == 0) {
+ return subsystem;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_nvmf_transport *
+spdk_nvmf_tgt_get_transport(struct spdk_nvmf_tgt *tgt, const char *transport_name)
+{
+ struct spdk_nvmf_transport *transport;
+
+ TAILQ_FOREACH(transport, &tgt->transports, link) {
+ if (!strncasecmp(transport->ops->name, transport_name, SPDK_NVMF_TRSTRING_MAX_LEN)) {
+ return transport;
+ }
+ }
+ return NULL;
+}
+
+struct nvmf_new_qpair_ctx {
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_poll_group *group;
+};
+
+static void
+_nvmf_poll_group_add(void *_ctx)
+{
+ struct nvmf_new_qpair_ctx *ctx = _ctx;
+ struct spdk_nvmf_qpair *qpair = ctx->qpair;
+ struct spdk_nvmf_poll_group *group = ctx->group;
+
+ free(_ctx);
+
+ if (spdk_nvmf_poll_group_add(group, qpair) != 0) {
+ SPDK_ERRLOG("Unable to add the qpair to a poll group.\n");
+ spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
+ }
+}
+
+void
+spdk_nvmf_tgt_new_qpair(struct spdk_nvmf_tgt *tgt, struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_poll_group *group;
+ struct nvmf_new_qpair_ctx *ctx;
+
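+ /* Ask the transport for a preferred poll group; if it has none, fall back to round-robin across the target's poll groups. */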
+ group = spdk_nvmf_get_optimal_poll_group(qpair);
+ if (group == NULL) {
+ if (tgt->next_poll_group == NULL) {
+ tgt->next_poll_group = TAILQ_FIRST(&tgt->poll_groups);
+ if (tgt->next_poll_group == NULL) {
+ SPDK_ERRLOG("No poll groups exist.\n");
+ spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
+ return;
+ }
+ }
+ group = tgt->next_poll_group;
+ tgt->next_poll_group = TAILQ_NEXT(group, link);
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ SPDK_ERRLOG("Unable to send message to poll group.\n");
+ spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
+ return;
+ }
+
+ ctx->qpair = qpair;
+ ctx->group = group;
+
+ spdk_thread_send_msg(group->thread, _nvmf_poll_group_add, ctx);
+}
+
+uint32_t
+spdk_nvmf_tgt_accept(struct spdk_nvmf_tgt *tgt)
+{
+ struct spdk_nvmf_transport *transport, *tmp;
+ uint32_t count = 0;
+
+ TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, tmp) {
+ count += nvmf_transport_accept(transport);
+ }
+
+ return count;
+}
+
+struct spdk_nvmf_poll_group *
+spdk_nvmf_poll_group_create(struct spdk_nvmf_tgt *tgt)
+{
+ struct spdk_io_channel *ch;
+
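+ /* Poll groups are the per-thread I/O channel contexts of the target, so creating one is just a channel get. */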
+ ch = spdk_get_io_channel(tgt);
+ if (!ch) {
+ SPDK_ERRLOG("Unable to get I/O channel for target\n");
+ return NULL;
+ }
+
+ return spdk_io_channel_get_ctx(ch);
+}
+
+void
+spdk_nvmf_poll_group_destroy(struct spdk_nvmf_poll_group *group,
+ spdk_nvmf_poll_group_destroy_done_fn cb_fn,
+ void *cb_arg)
+{
+ assert(group->destroy_cb_fn == NULL);
+ group->destroy_cb_fn = cb_fn;
+ group->destroy_cb_arg = cb_arg;
+
+ /* This function will put the io_channel associated with this poll group */
+ nvmf_tgt_destroy_poll_group_qpairs(group);
+}
+
+int
+spdk_nvmf_poll_group_add(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_qpair *qpair)
+{
+ int rc = -1;
+ struct spdk_nvmf_transport_poll_group *tgroup;
+
+ TAILQ_INIT(&qpair->outstanding);
+ qpair->group = group;
+
+ TAILQ_FOREACH(tgroup, &group->tgroups, link) {
+ if (tgroup->transport == qpair->transport) {
+ rc = nvmf_transport_poll_group_add(tgroup, qpair);
+ break;
+ }
+ }
+
+ /* We add the qpair to the group only if it is successfully added into the tgroup */
+ if (rc == 0) {
+ TAILQ_INSERT_TAIL(&group->qpairs, qpair, link);
+ nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ACTIVE);
+ }
+
+ return rc;
+}
+
+static void
+_nvmf_ctrlr_destruct(void *ctx)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = ctx;
+
+ nvmf_ctrlr_destruct(ctrlr);
+}
+
+static void
+_nvmf_transport_qpair_fini(void *ctx)
+{
+ struct spdk_nvmf_qpair *qpair = ctx;
+
+ nvmf_transport_qpair_fini(qpair);
+}
+
+static void
+_nvmf_ctrlr_free_from_qpair(void *ctx)
+{
+ struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx;
+ struct spdk_nvmf_ctrlr *ctrlr = qpair_ctx->ctrlr;
+ uint32_t count;
+
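+ /* Clear this qpair from the controller's mask; if it was the last qpair, destroy the controller on its subsystem's thread. */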
+ spdk_bit_array_clear(ctrlr->qpair_mask, qpair_ctx->qid);
+ count = spdk_bit_array_count_set(ctrlr->qpair_mask);
+ if (count == 0) {
+ spdk_bit_array_free(&ctrlr->qpair_mask);
+
+ spdk_thread_send_msg(ctrlr->subsys->thread, _nvmf_ctrlr_destruct, ctrlr);
+ }
+
+ spdk_thread_send_msg(qpair_ctx->thread, _nvmf_transport_qpair_fini, qpair_ctx->qpair);
+ if (qpair_ctx->cb_fn) {
+ spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx);
+ }
+ free(qpair_ctx);
+}
+
+void
+spdk_nvmf_poll_group_remove(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvmf_transport_poll_group *tgroup;
+ struct spdk_nvmf_request *req, *tmp;
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+ int rc;
+
+ nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ERROR);
+
+ /* Find the tgroup and remove the qpair from the tgroup */
+ TAILQ_FOREACH(tgroup, &qpair->group->tgroups, link) {
+ if (tgroup->transport == qpair->transport) {
+ rc = nvmf_transport_poll_group_remove(tgroup, qpair);
+ if (rc && (rc != ENOTSUP)) {
+ SPDK_ERRLOG("Cannot remove qpair=%p from transport group=%p\n",
+ qpair, tgroup);
+ }
+ break;
+ }
+ }
+
+ if (ctrlr) {
+ sgroup = &qpair->group->sgroups[ctrlr->subsys->id];
+ TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) {
+ if (req->qpair == qpair) {
+ TAILQ_REMOVE(&sgroup->queued, req, link);
+ if (nvmf_transport_req_free(req)) {
+ SPDK_ERRLOG("Transport request free error!\n");
+ }
+ }
+ }
+ }
+
+ TAILQ_REMOVE(&qpair->group->qpairs, qpair, link);
+ qpair->group = NULL;
+}
+
+static void
+_nvmf_qpair_destroy(void *ctx, int status)
+{
+ struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx;
+ struct spdk_nvmf_qpair *qpair = qpair_ctx->qpair;
+ struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr;
+
+ assert(qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING);
+ qpair_ctx->qid = qpair->qid;
+
+ spdk_nvmf_poll_group_remove(qpair);
+
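+ /* Without an associated controller (or controller thread), the transport resources can be released right here. */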
+ if (!ctrlr || !ctrlr->thread) {
+ nvmf_transport_qpair_fini(qpair);
+ if (qpair_ctx->cb_fn) {
+ spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx);
+ }
+ free(qpair_ctx);
+ return;
+ }
+
+ qpair_ctx->ctrlr = ctrlr;
+ spdk_thread_send_msg(ctrlr->thread, _nvmf_ctrlr_free_from_qpair, qpair_ctx);
+}
+
+int
+spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn, void *ctx)
+{
+ struct nvmf_qpair_disconnect_ctx *qpair_ctx;
+
+ /* If we get a qpair in the uninitialized state, we can just destroy it immediately */
+ if (qpair->state == SPDK_NVMF_QPAIR_UNINITIALIZED) {
+ nvmf_transport_qpair_fini(qpair);
+ if (cb_fn) {
+ cb_fn(ctx);
+ }
+ return 0;
+ }
+
+ /* The queue pair must be disconnected from the thread that owns it */
+ assert(qpair->group->thread == spdk_get_thread());
+
+ if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) {
+ /* This can occur if the connection is killed by the target,
+ * which results in a notification that the connection
+ * died. Send a message to defer the processing of this
+ * callback. This allows the stack to unwind in the case
+ * where a bunch of connections are disconnected in
+ * a loop. */
+ if (cb_fn) {
+ spdk_thread_send_msg(qpair->group->thread, cb_fn, ctx);
+ }
+ return 0;
+ }
+
+ assert(qpair->state == SPDK_NVMF_QPAIR_ACTIVE);
+ nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_DEACTIVATING);
+
+ qpair_ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_ctx));
+ if (!qpair_ctx) {
+ SPDK_ERRLOG("Unable to allocate context for nvmf_qpair_disconnect\n");
+ return -ENOMEM;
+ }
+
+ qpair_ctx->qpair = qpair;
+ qpair_ctx->cb_fn = cb_fn;
+ qpair_ctx->thread = qpair->group->thread;
+ qpair_ctx->ctx = ctx;
+
+ /* Check for outstanding I/O */
+ if (!TAILQ_EMPTY(&qpair->outstanding)) {
+ qpair->state_cb = _nvmf_qpair_destroy;
+ qpair->state_cb_arg = qpair_ctx;
+ nvmf_qpair_free_aer(qpair);
+ return 0;
+ }
+
+ _nvmf_qpair_destroy(qpair_ctx, 0);
+
+ return 0;
+}
+
+int
+spdk_nvmf_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return nvmf_transport_qpair_get_peer_trid(qpair, trid);
+}
+
+int
+spdk_nvmf_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return nvmf_transport_qpair_get_local_trid(qpair, trid);
+}
+
+int
+spdk_nvmf_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return nvmf_transport_qpair_get_listen_trid(qpair, trid);
+}
+
+int
+nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_transport_poll_group *tgroup;
+
+ TAILQ_FOREACH(tgroup, &group->tgroups, link) {
+ if (tgroup->transport == transport) {
+ /* Transport already in the poll group */
+ return 0;
+ }
+ }
+
+ tgroup = nvmf_transport_poll_group_create(transport);
+ if (!tgroup) {
+ SPDK_ERRLOG("Unable to create poll group for transport\n");
+ return -1;
+ }
+
+ tgroup->group = group;
+ TAILQ_INSERT_TAIL(&group->tgroups, tgroup, link);
+
+ return 0;
+}
+
+static int
+poll_group_update_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem)
+{
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+ uint32_t new_num_ns, old_num_ns;
+ uint32_t i, j;
+ struct spdk_nvmf_ns *ns;
+ struct spdk_nvmf_registrant *reg, *tmp;
+ struct spdk_io_channel *ch;
+ struct spdk_nvmf_subsystem_pg_ns_info *ns_info;
+ struct spdk_nvmf_ctrlr *ctrlr;
+ bool ns_changed;
+
+ /* Make sure our poll group has memory for this subsystem allocated */
+ if (subsystem->id >= group->num_sgroups) {
+ return -ENOMEM;
+ }
+
+ sgroup = &group->sgroups[subsystem->id];
+
+ /* Make sure the array of namespace information is the correct size */
+ new_num_ns = subsystem->max_nsid;
+ old_num_ns = sgroup->num_ns;
+
+ ns_changed = false;
+
+ if (old_num_ns == 0) {
+ if (new_num_ns > 0) {
+ /* First allocation */
+ sgroup->ns_info = calloc(new_num_ns, sizeof(struct spdk_nvmf_subsystem_pg_ns_info));
+ if (!sgroup->ns_info) {
+ return -ENOMEM;
+ }
+ }
+ } else if (new_num_ns > old_num_ns) {
+ void *buf;
+
+ /* Make the array larger */
+ buf = realloc(sgroup->ns_info, new_num_ns * sizeof(struct spdk_nvmf_subsystem_pg_ns_info));
+ if (!buf) {
+ return -ENOMEM;
+ }
+
+ sgroup->ns_info = buf;
+
+ /* Null out the new namespace information slots */
+ for (i = old_num_ns; i < new_num_ns; i++) {
+ memset(&sgroup->ns_info[i], 0, sizeof(struct spdk_nvmf_subsystem_pg_ns_info));
+ }
+ } else if (new_num_ns < old_num_ns) {
+ void *buf;
+
+ /* Free the extra I/O channels */
+ for (i = new_num_ns; i < old_num_ns; i++) {
+ ns_info = &sgroup->ns_info[i];
+
+ if (ns_info->channel) {
+ spdk_put_io_channel(ns_info->channel);
+ ns_info->channel = NULL;
+ }
+ }
+
+ /* Make the array smaller */
+ if (new_num_ns > 0) {
+ buf = realloc(sgroup->ns_info, new_num_ns * sizeof(struct spdk_nvmf_subsystem_pg_ns_info));
+ if (!buf) {
+ return -ENOMEM;
+ }
+ sgroup->ns_info = buf;
+ } else {
+ free(sgroup->ns_info);
+ sgroup->ns_info = NULL;
+ }
+ }
+
+ sgroup->num_ns = new_num_ns;
+
+ /* Detect bdevs that were added or removed */
+ for (i = 0; i < sgroup->num_ns; i++) {
+ ns = subsystem->ns[i];
+ ns_info = &sgroup->ns_info[i];
+ ch = ns_info->channel;
+
+ if (ns == NULL && ch == NULL) {
+ /* Both NULL. Leave empty */
+ } else if (ns == NULL && ch != NULL) {
+ /* There was a channel here, but the namespace is gone. */
+ ns_changed = true;
+ spdk_put_io_channel(ch);
+ ns_info->channel = NULL;
+ } else if (ns != NULL && ch == NULL) {
+ /* A namespace appeared but there is no channel yet */
+ ns_changed = true;
+ ch = spdk_bdev_get_io_channel(ns->desc);
+ if (ch == NULL) {
+ SPDK_ERRLOG("Could not allocate I/O channel.\n");
+ return -ENOMEM;
+ }
+ ns_info->channel = ch;
+ } else if (spdk_uuid_compare(&ns_info->uuid, spdk_bdev_get_uuid(ns->bdev)) != 0) {
+ /* A namespace was here before, but was replaced by a new one. */
+ ns_changed = true;
+ spdk_put_io_channel(ns_info->channel);
+ memset(ns_info, 0, sizeof(*ns_info));
+
+ ch = spdk_bdev_get_io_channel(ns->desc);
+ if (ch == NULL) {
+ SPDK_ERRLOG("Could not allocate I/O channel.\n");
+ return -ENOMEM;
+ }
+ ns_info->channel = ch;
+ } else if (ns_info->num_blocks != spdk_bdev_get_num_blocks(ns->bdev)) {
+ /* Namespace is still there but size has changed */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Namespace resized: subsystem_id %d,"
+ " nsid %u, pg %p, old %lu, new %lu\n",
+ subsystem->id,
+ ns->nsid,
+ group,
+ ns_info->num_blocks,
+ spdk_bdev_get_num_blocks(ns->bdev));
+ ns_changed = true;
+ }
+
+ if (ns == NULL) {
+ memset(ns_info, 0, sizeof(*ns_info));
+ } else {
+ ns_info->uuid = *spdk_bdev_get_uuid(ns->bdev);
+ ns_info->num_blocks = spdk_bdev_get_num_blocks(ns->bdev);
+ ns_info->crkey = ns->crkey;
+ ns_info->rtype = ns->rtype;
+ if (ns->holder) {
+ ns_info->holder_id = ns->holder->hostid;
+ }
+
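+ /* Refresh the cached registrant host IDs for this namespace. */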
+ memset(&ns_info->reg_hostid, 0, SPDK_NVMF_MAX_NUM_REGISTRANTS * sizeof(struct spdk_uuid));
+ j = 0;
+ TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) {
+ if (j >= SPDK_NVMF_MAX_NUM_REGISTRANTS) {
+ SPDK_ERRLOG("Maximum %u registrants can support.\n", SPDK_NVMF_MAX_NUM_REGISTRANTS);
+ return -EINVAL;
+ }
+ ns_info->reg_hostid[j++] = reg->hostid;
+ }
+ }
+ }
+
+ if (ns_changed) {
+ TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) {
+ if (ctrlr->admin_qpair->group == group) {
+ nvmf_ctrlr_async_event_ns_notice(ctrlr);
+ }
+ }
+ }
+
+ return 0;
+}
+
+int
+nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem)
+{
+ return poll_group_update_subsystem(group, subsystem);
+}
+
+int
+nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg)
+{
+ int rc = 0;
+ struct spdk_nvmf_subsystem_poll_group *sgroup = &group->sgroups[subsystem->id];
+
+ TAILQ_INIT(&sgroup->queued);
+
+ rc = poll_group_update_subsystem(group, subsystem);
+ if (rc) {
+ nvmf_poll_group_remove_subsystem(group, subsystem, NULL, NULL);
+ goto fini;
+ }
+
+ sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE;
+fini:
+ if (cb_fn) {
+ cb_fn(cb_arg, rc);
+ }
+
+ return rc;
+}
+
+static void
+_nvmf_poll_group_remove_subsystem_cb(void *ctx, int status)
+{
+ struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_poll_group *group;
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+ spdk_nvmf_poll_group_mod_done cpl_fn = NULL;
+ void *cpl_ctx = NULL;
+ uint32_t nsid;
+
+ group = qpair_ctx->group;
+ subsystem = qpair_ctx->subsystem;
+ cpl_fn = qpair_ctx->cpl_fn;
+ cpl_ctx = qpair_ctx->cpl_ctx;
+ sgroup = &group->sgroups[subsystem->id];
+
+ if (status) {
+ goto fini;
+ }
+
+ for (nsid = 0; nsid < sgroup->num_ns; nsid++) {
+ if (sgroup->ns_info[nsid].channel) {
+ spdk_put_io_channel(sgroup->ns_info[nsid].channel);
+ sgroup->ns_info[nsid].channel = NULL;
+ }
+ }
+
+ sgroup->num_ns = 0;
+ free(sgroup->ns_info);
+ sgroup->ns_info = NULL;
+fini:
+ free(qpair_ctx);
+ if (cpl_fn) {
+ cpl_fn(cpl_ctx, status);
+ }
+}
+
+static void
+_nvmf_subsystem_disconnect_next_qpair(void *ctx)
+{
+ struct spdk_nvmf_qpair *qpair;
+ struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_poll_group *group;
+ int rc = 0;
+
+ group = qpair_ctx->group;
+ subsystem = qpair_ctx->subsystem;
+
+ TAILQ_FOREACH(qpair, &group->qpairs, link) {
+ if ((qpair->ctrlr != NULL) && (qpair->ctrlr->subsys == subsystem)) {
+ break;
+ }
+ }
+
+ if (qpair) {
+ rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, qpair_ctx);
+ }
+
+ if (!qpair || rc != 0) {
+ _nvmf_poll_group_remove_subsystem_cb(ctx, rc);
+ }
+ return;
+}
+
+void
+nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg)
+{
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+ struct nvmf_qpair_disconnect_many_ctx *ctx;
+ int rc = 0;
+
+ ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx));
+
+ if (!ctx) {
+ SPDK_ERRLOG("Unable to allocate memory for context to remove poll subsystem\n");
+ goto fini;
+ }
+
+ ctx->group = group;
+ ctx->subsystem = subsystem;
+ ctx->cpl_fn = cb_fn;
+ ctx->cpl_ctx = cb_arg;
+
+ sgroup = &group->sgroups[subsystem->id];
+ sgroup->state = SPDK_NVMF_SUBSYSTEM_INACTIVE;
+
+ TAILQ_FOREACH(qpair, &group->qpairs, link) {
+ if ((qpair->ctrlr != NULL) && (qpair->ctrlr->subsys == subsystem)) {
+ break;
+ }
+ }
+
+ if (qpair) {
+ rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, ctx);
+ } else {
+ /* Call the callback immediately; it will handle any channel iteration. */
+ _nvmf_poll_group_remove_subsystem_cb(ctx, 0);
+ }
+
+ if (rc != 0) {
+ free(ctx);
+ goto fini;
+ }
+
+ return;
+fini:
+ if (cb_fn) {
+ cb_fn(cb_arg, rc);
+ }
+}
+
+void
+nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg)
+{
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+ int rc = 0;
+
+ if (subsystem->id >= group->num_sgroups) {
+ rc = -1;
+ goto fini;
+ }
+
+ sgroup = &group->sgroups[subsystem->id];
+ if (sgroup == NULL) {
+ rc = -1;
+ goto fini;
+ }
+
+ assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_ACTIVE);
+ sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSING;
+
+ if (sgroup->io_outstanding > 0) {
+ sgroup->cb_fn = cb_fn;
+ sgroup->cb_arg = cb_arg;
+ return;
+ }
+
+ assert(sgroup->io_outstanding == 0);
+ sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSED;
+fini:
+ if (cb_fn) {
+ cb_fn(cb_arg, rc);
+ }
+}
+
+void
+nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg)
+{
+ struct spdk_nvmf_request *req, *tmp;
+ struct spdk_nvmf_subsystem_poll_group *sgroup;
+ int rc = 0;
+
+ if (subsystem->id >= group->num_sgroups) {
+ rc = -1;
+ goto fini;
+ }
+
+ sgroup = &group->sgroups[subsystem->id];
+
+ assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSED);
+
+ rc = poll_group_update_subsystem(group, subsystem);
+ if (rc) {
+ goto fini;
+ }
+
+ sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE;
+
+ /* Release all queued requests */
+ TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) {
+ TAILQ_REMOVE(&sgroup->queued, req, link);
+ spdk_nvmf_request_exec(req);
+ }
+fini:
+ if (cb_fn) {
+ cb_fn(cb_arg, rc);
+ }
+}
+
+struct spdk_nvmf_poll_group *
+spdk_nvmf_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_transport_poll_group *tgroup;
+
+ tgroup = nvmf_transport_get_optimal_poll_group(qpair->transport, qpair);
+
+ if (tgroup == NULL) {
+ return NULL;
+ }
+
+ return tgroup->group;
+}
+
+int
+spdk_nvmf_poll_group_get_stat(struct spdk_nvmf_tgt *tgt,
+ struct spdk_nvmf_poll_group_stat *stat)
+{
+ struct spdk_io_channel *ch;
+ struct spdk_nvmf_poll_group *group;
+
+ if (tgt == NULL || stat == NULL) {
+ return -EINVAL;
+ }
+
+ ch = spdk_get_io_channel(tgt);
+ group = spdk_io_channel_get_ctx(ch);
+ *stat = group->stat;
+ spdk_put_io_channel(ch);
+ return 0;
+}
diff --git a/src/spdk/lib/nvmf/nvmf_fc.h b/src/spdk/lib/nvmf/nvmf_fc.h
new file mode 100644
index 000000000..10d3ef9cf
--- /dev/null
+++ b/src/spdk/lib/nvmf/nvmf_fc.h
@@ -0,0 +1,999 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (c) 2018-2019 Broadcom. All Rights Reserved.
+ * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NVMF_FC_H__
+#define __NVMF_FC_H__
+
+#include "spdk/nvme.h"
+#include "spdk/nvmf.h"
+#include "spdk/assert.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/nvmf_fc_spec.h"
+#include "spdk/thread.h"
+#include "nvmf_internal.h"
+
+#define SPDK_NVMF_FC_TR_ADDR_LEN 64
+#define NVMF_FC_INVALID_CONN_ID UINT64_MAX
+
+#define SPDK_FC_HW_DUMP_REASON_STR_MAX_SIZE 256
+#define SPDK_MAX_NUM_OF_FC_PORTS 32
+#define SPDK_NVMF_PORT_ID_MAX_LEN 32
+
+/*
+ * FC HWQP pointer
+ */
+typedef void *spdk_nvmf_fc_lld_hwqp_t;
+
+/*
+ * FC HW port states.
+ */
+enum spdk_fc_port_state {
+ SPDK_FC_PORT_OFFLINE = 0,
+ SPDK_FC_PORT_ONLINE = 1,
+ SPDK_FC_PORT_QUIESCED = 2,
+};
+
+enum spdk_fc_hwqp_state {
+ SPDK_FC_HWQP_OFFLINE = 0,
+ SPDK_FC_HWQP_ONLINE = 1,
+};
+
+/*
+ * NVMF FC Object state
+ * Add all the generic states of the object here.
+ * Specific object states can be added separately
+ */
+enum spdk_nvmf_fc_object_state {
+ SPDK_NVMF_FC_OBJECT_CREATED = 0,
+ SPDK_NVMF_FC_OBJECT_TO_BE_DELETED = 1,
+ SPDK_NVMF_FC_OBJECT_ZOMBIE = 2, /* Partial Create or Delete */
+};
+
+/*
+ * FC request state
+ */
+enum spdk_nvmf_fc_request_state {
+ SPDK_NVMF_FC_REQ_INIT = 0,
+ SPDK_NVMF_FC_REQ_READ_BDEV,
+ SPDK_NVMF_FC_REQ_READ_XFER,
+ SPDK_NVMF_FC_REQ_READ_RSP,
+ SPDK_NVMF_FC_REQ_WRITE_BUFFS,
+ SPDK_NVMF_FC_REQ_WRITE_XFER,
+ SPDK_NVMF_FC_REQ_WRITE_BDEV,
+ SPDK_NVMF_FC_REQ_WRITE_RSP,
+ SPDK_NVMF_FC_REQ_NONE_BDEV,
+ SPDK_NVMF_FC_REQ_NONE_RSP,
+ SPDK_NVMF_FC_REQ_SUCCESS,
+ SPDK_NVMF_FC_REQ_FAILED,
+ SPDK_NVMF_FC_REQ_ABORTED,
+ SPDK_NVMF_FC_REQ_BDEV_ABORTED,
+ SPDK_NVMF_FC_REQ_PENDING,
+ SPDK_NVMF_FC_REQ_MAX_STATE,
+};
+
+/*
+ * Generic DMA buffer descriptor
+ */
+struct spdk_nvmf_fc_buffer_desc {
+ void *virt;
+ uint64_t phys;
+ size_t len;
+
+ /* Internal */
+ uint32_t buf_index;
+};
+
+/*
+ * ABTS handling context
+ */
+struct spdk_nvmf_fc_abts_ctx {
+ bool handled;
+ uint16_t hwqps_responded;
+ uint16_t rpi;
+ uint16_t oxid;
+ uint16_t rxid;
+ struct spdk_nvmf_fc_nport *nport;
+ uint16_t nport_hdl;
+ uint8_t port_hdl;
+ void *abts_poller_args;
+ void *sync_poller_args;
+ int num_hwqps;
+ bool queue_synced;
+ uint64_t u_id;
+ struct spdk_nvmf_fc_hwqp *ls_hwqp;
+ uint16_t fcp_rq_id;
+};
+
+/*
+ * NVME FC transport errors
+ */
+struct spdk_nvmf_fc_errors {
+ uint32_t no_xchg;
+ uint32_t nport_invalid;
+ uint32_t unknown_frame;
+ uint32_t wqe_cmplt_err;
+ uint32_t wqe_write_err;
+ uint32_t rq_status_err;
+ uint32_t rq_buf_len_err;
+ uint32_t rq_id_err;
+ uint32_t rq_index_err;
+ uint32_t invalid_cq_type;
+ uint32_t invalid_cq_id;
+ uint32_t fc_req_buf_err;
+ uint32_t buf_alloc_err;
+ uint32_t unexpected_err;
+ uint32_t nvme_cmd_iu_err;
+ uint32_t nvme_cmd_xfer_err;
+ uint32_t queue_entry_invalid;
+ uint32_t invalid_conn_err;
+ uint32_t fcp_rsp_failure;
+ uint32_t write_failed;
+ uint32_t read_failed;
+ uint32_t rport_invalid;
+ uint32_t num_aborted;
+ uint32_t num_abts_sent;
+};
+
+/*
+ * Send Single Request/Response Sequence.
+ */
+struct spdk_nvmf_fc_srsr_bufs {
+ void *rqst;
+ size_t rqst_len;
+ void *rsp;
+ size_t rsp_len;
+ uint16_t rpi;
+};
+
+/*
+ * Struct representing a nport
+ */
+struct spdk_nvmf_fc_nport {
+
+ uint16_t nport_hdl;
+ uint8_t port_hdl;
+ uint32_t d_id;
+ enum spdk_nvmf_fc_object_state nport_state;
+ struct spdk_nvmf_fc_wwn fc_nodename;
+ struct spdk_nvmf_fc_wwn fc_portname;
+
+ /* list of remote ports (i.e. initiators) connected to nport */
+ TAILQ_HEAD(, spdk_nvmf_fc_remote_port_info) rem_port_list;
+ uint32_t rport_count;
+
+ void *vendor_data; /* available for vendor use */
+
+ /* list of associations to nport */
+ TAILQ_HEAD(, spdk_nvmf_fc_association) fc_associations;
+ uint32_t assoc_count;
+ struct spdk_nvmf_fc_port *fc_port;
+ TAILQ_ENTRY(spdk_nvmf_fc_nport) link; /* list of nports on a hw port. */
+};
+
+/*
+ * NVMF FC Connection
+ */
+struct spdk_nvmf_fc_conn {
+ struct spdk_nvmf_qpair qpair;
+ struct spdk_nvme_transport_id trid;
+
+ uint64_t conn_id;
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ uint16_t esrp_ratio;
+ uint16_t rsp_count;
+ uint32_t rsn;
+
+ /* The maximum number of I/O outstanding on this connection at one time */
+ uint16_t max_queue_depth;
+ uint16_t max_rw_depth;
+ /* The current number of I/O outstanding on this connection. This number
+ * includes all I/O from the time the capsule is first received until it is
+ * completed.
+ */
+ uint16_t cur_queue_depth;
+
+ /* number of read/write requests that are outstanding */
+ uint16_t cur_fc_rw_depth;
+
+ struct spdk_nvmf_fc_association *fc_assoc;
+
+ uint16_t rpi;
+
+ /* for association's connection list */
+ TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_link;
+
+ /* for association's available connection list */
+ TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_avail_link;
+
+ /* for hwqp's connection list */
+ TAILQ_ENTRY(spdk_nvmf_fc_conn) link;
+
+ /* New QP create context. */
+ struct nvmf_fc_ls_op_ctx *create_opd;
+};
+
+/*
+ * Structure for maintaining the FC exchanges
+ */
+struct spdk_nvmf_fc_xchg {
+ uint32_t xchg_id; /* The actual xchg identifier */
+
+ /* Internal */
+ TAILQ_ENTRY(spdk_nvmf_fc_xchg) link;
+ bool active;
+ bool aborted;
+ bool send_abts; /* Valid if aborted is set. */
+};
+
+/*
+ * FC poll group structure
+ */
+struct spdk_nvmf_fc_poll_group {
+ struct spdk_nvmf_transport_poll_group group;
+ struct spdk_nvmf_tgt *nvmf_tgt;
+ uint32_t hwqp_count; /* number of hwqp's assigned to this pg */
+ TAILQ_HEAD(, spdk_nvmf_fc_hwqp) hwqp_list;
+
+ TAILQ_ENTRY(spdk_nvmf_fc_poll_group) link;
+};
+
+/*
+ * HWQP poller structure passed from Master thread
+ */
+struct spdk_nvmf_fc_hwqp {
+ enum spdk_fc_hwqp_state state; /* queue state (for poller) */
+ uint32_t lcore_id; /* core hwqp is running on (for tracing purposes only) */
+ struct spdk_thread *thread; /* thread hwqp is running on */
+ uint32_t hwqp_id; /* A unique id (per physical port) for a hwqp */
+ uint32_t rq_size; /* receive queue size */
+ spdk_nvmf_fc_lld_hwqp_t queues; /* vendor HW queue set */
+ struct spdk_nvmf_fc_port *fc_port; /* HW port structure for these queues */
+ struct spdk_nvmf_fc_poll_group *fgroup;
+
+ /* qpair (fc_connection) list */
+ TAILQ_HEAD(, spdk_nvmf_fc_conn) connection_list;
+ uint32_t num_conns; /* number of connections to queue */
+
+ struct spdk_nvmf_fc_request *fc_reqs_buf;
+ TAILQ_HEAD(, spdk_nvmf_fc_request) free_reqs;
+ TAILQ_HEAD(, spdk_nvmf_fc_request) in_use_reqs;
+
+ struct spdk_nvmf_fc_errors counters;
+
+ /* Pending LS request waiting for FC resource */
+ TAILQ_HEAD(, spdk_nvmf_fc_ls_rqst) ls_pending_queue;
+
+ /* Sync req list */
+ TAILQ_HEAD(, spdk_nvmf_fc_poller_api_queue_sync_args) sync_cbs;
+
+ TAILQ_ENTRY(spdk_nvmf_fc_hwqp) link;
+
+ void *context; /* Vendor specific context data */
+};
+
+/*
+ * FC HW port.
+ */
+struct spdk_nvmf_fc_port {
+ uint8_t port_hdl;
+ enum spdk_fc_port_state hw_port_status;
+ uint16_t fcp_rq_id;
+ struct spdk_nvmf_fc_hwqp ls_queue;
+
+ uint32_t num_io_queues;
+ struct spdk_nvmf_fc_hwqp *io_queues;
+ /*
+ * List of nports on this HW port.
+ */
+ TAILQ_HEAD(, spdk_nvmf_fc_nport) nport_list;
+ int num_nports;
+ TAILQ_ENTRY(spdk_nvmf_fc_port) link;
+
+ struct spdk_mempool *io_resource_pool; /* Pools to store bdev_io's for this port */
+ void *port_ctx;
+};
+
+/*
+ * NVMF FC Request
+ */
+struct spdk_nvmf_fc_request {
+ struct spdk_nvmf_request req;
+ struct spdk_nvmf_fc_ersp_iu ersp;
+ uint32_t poller_lcore; /* for tracing purposes only */
+ struct spdk_thread *poller_thread;
+ uint16_t buf_index;
+ struct spdk_nvmf_fc_xchg *xchg;
+ uint16_t oxid;
+ uint16_t rpi;
+ struct spdk_nvmf_fc_conn *fc_conn;
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ int state;
+ uint32_t transfered_len;
+ bool is_aborted;
+ uint32_t magic;
+ uint32_t s_id;
+ uint32_t d_id;
+ TAILQ_ENTRY(spdk_nvmf_fc_request) link;
+ STAILQ_ENTRY(spdk_nvmf_fc_request) pending_link;
+ TAILQ_HEAD(, spdk_nvmf_fc_caller_ctx) abort_cbs;
+};
+
+SPDK_STATIC_ASSERT(!offsetof(struct spdk_nvmf_fc_request, req),
+ "FC request and NVMF request address don't match.");
+
+
+/*
+ * NVMF FC Association
+ */
+struct spdk_nvmf_fc_association {
+ uint64_t assoc_id;
+ uint32_t s_id;
+ struct spdk_nvmf_fc_nport *tgtport;
+ struct spdk_nvmf_fc_remote_port_info *rport;
+ struct spdk_nvmf_subsystem *subsystem;
+ enum spdk_nvmf_fc_object_state assoc_state;
+
+ char host_id[FCNVME_ASSOC_HOSTID_LEN];
+ char host_nqn[SPDK_NVME_NQN_FIELD_SIZE];
+ char sub_nqn[SPDK_NVME_NQN_FIELD_SIZE];
+
+ struct spdk_nvmf_fc_conn *aq_conn; /* connection for admin queue */
+
+ uint16_t conn_count;
+ TAILQ_HEAD(, spdk_nvmf_fc_conn) fc_conns;
+
+ void *conns_buf;
+ TAILQ_HEAD(, spdk_nvmf_fc_conn) avail_fc_conns;
+
+ TAILQ_ENTRY(spdk_nvmf_fc_association) link;
+
+ /* for port's association free list */
+ TAILQ_ENTRY(spdk_nvmf_fc_association) port_free_assoc_list_link;
+
+ void *ls_del_op_ctx; /* delete assoc. callback list */
+
+ /* disconnect cmd buffers (sent to initiator) */
+ struct spdk_nvmf_fc_srsr_bufs *snd_disconn_bufs;
+};
+
+/*
+ * FC Remote Port
+ */
+struct spdk_nvmf_fc_remote_port_info {
+ uint32_t s_id;
+ uint32_t rpi;
+ uint32_t assoc_count;
+ struct spdk_nvmf_fc_wwn fc_nodename;
+ struct spdk_nvmf_fc_wwn fc_portname;
+ enum spdk_nvmf_fc_object_state rport_state;
+ TAILQ_ENTRY(spdk_nvmf_fc_remote_port_info) link;
+};
+
+/*
+ * Poller API error codes
+ */
+enum spdk_nvmf_fc_poller_api_ret {
+ SPDK_NVMF_FC_POLLER_API_SUCCESS = 0,
+ SPDK_NVMF_FC_POLLER_API_ERROR,
+ SPDK_NVMF_FC_POLLER_API_INVALID_ARG,
+ SPDK_NVMF_FC_POLLER_API_NO_CONN_ID,
+ SPDK_NVMF_FC_POLLER_API_DUP_CONN_ID,
+ SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND,
+};
+
+/*
+ * Poller API definitions
+ */
+enum spdk_nvmf_fc_poller_api {
+ SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION,
+ SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION,
+ SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE,
+ SPDK_NVMF_FC_POLLER_API_ACTIVATE_QUEUE,
+ SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED,
+ SPDK_NVMF_FC_POLLER_API_REQ_ABORT_COMPLETE,
+ SPDK_NVMF_FC_POLLER_API_ADAPTER_EVENT,
+ SPDK_NVMF_FC_POLLER_API_AEN,
+ SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC,
+ SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC_DONE,
+ SPDK_NVMF_FC_POLLER_API_ADD_HWQP,
+ SPDK_NVMF_FC_POLLER_API_REMOVE_HWQP,
+};
+
+/*
+ * Poller API callback function proto
+ */
+typedef void (*spdk_nvmf_fc_poller_api_cb)(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret);
+
+/*
+ * Poller API callback data
+ */
+struct spdk_nvmf_fc_poller_api_cb_info {
+ struct spdk_thread *cb_thread;
+ spdk_nvmf_fc_poller_api_cb cb_func;
+ void *cb_data;
+ enum spdk_nvmf_fc_poller_api_ret ret;
+};
+
+/*
+ * Poller API structures
+ */
+struct spdk_nvmf_fc_poller_api_add_connection_args {
+ struct spdk_nvmf_fc_conn *fc_conn;
+ struct spdk_nvmf_fc_poller_api_cb_info cb_info;
+};
+
+struct spdk_nvmf_fc_poller_api_del_connection_args {
+ struct spdk_nvmf_fc_conn *fc_conn;
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ struct spdk_nvmf_fc_poller_api_cb_info cb_info;
+ bool send_abts;
+ /* internal */
+ int fc_request_cnt;
+ bool backend_initiated;
+};
+
+struct spdk_nvmf_fc_poller_api_quiesce_queue_args {
+ void *ctx;
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ struct spdk_nvmf_fc_poller_api_cb_info cb_info;
+};
+
+struct spdk_nvmf_fc_poller_api_activate_queue_args {
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ struct spdk_nvmf_fc_poller_api_cb_info cb_info;
+};
+
+struct spdk_nvmf_fc_poller_api_abts_recvd_args {
+ struct spdk_nvmf_fc_abts_ctx *ctx;
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ struct spdk_nvmf_fc_poller_api_cb_info cb_info;
+};
+
+struct spdk_nvmf_fc_poller_api_queue_sync_done_args {
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ struct spdk_nvmf_fc_poller_api_cb_info cb_info;
+ uint64_t tag;
+};
+
+/*
+ * NVMF LS request structure
+ */
+struct spdk_nvmf_fc_ls_rqst {
+ struct spdk_nvmf_fc_buffer_desc rqstbuf;
+ struct spdk_nvmf_fc_buffer_desc rspbuf;
+ uint32_t rqst_len;
+ uint32_t rsp_len;
+ uint32_t rpi;
+ struct spdk_nvmf_fc_xchg *xchg;
+ uint16_t oxid;
+ void *private_data; /* for LLD only (LS does not touch) */
+ TAILQ_ENTRY(spdk_nvmf_fc_ls_rqst) ls_pending_link;
+ uint32_t s_id;
+ uint32_t d_id;
+ struct spdk_nvmf_fc_nport *nport;
+ struct spdk_nvmf_fc_remote_port_info *rport;
+ struct spdk_nvmf_tgt *nvmf_tgt;
+};
+
+/*
+ * RQ Buffer LS Overlay Structure
+ */
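+/* Pad the LS request overlay so the full structure occupies exactly FCNVME_MAX_LS_BUFFER_SIZE (see the static assert below). */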
+#define FCNVME_LS_RSVD_SIZE (FCNVME_MAX_LS_BUFFER_SIZE - \
+ (sizeof(struct spdk_nvmf_fc_ls_rqst) + FCNVME_MAX_LS_REQ_SIZE + FCNVME_MAX_LS_RSP_SIZE))
+
+struct spdk_nvmf_fc_rq_buf_ls_request {
+ uint8_t rqst[FCNVME_MAX_LS_REQ_SIZE];
+ uint8_t resp[FCNVME_MAX_LS_RSP_SIZE];
+ struct spdk_nvmf_fc_ls_rqst ls_rqst;
+ uint8_t rsvd[FCNVME_LS_RSVD_SIZE];
+};
+
+SPDK_STATIC_ASSERT(sizeof(struct spdk_nvmf_fc_rq_buf_ls_request) ==
+ FCNVME_MAX_LS_BUFFER_SIZE, "LS RQ Buffer overflow");
+
+/* Poller API structures (arguments and callback data) */
+typedef void (*spdk_nvmf_fc_del_assoc_cb)(void *arg, uint32_t err);
+
+struct spdk_nvmf_fc_ls_add_conn_api_data {
+ struct spdk_nvmf_fc_poller_api_add_connection_args args;
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst;
+ struct spdk_nvmf_fc_association *assoc;
+ bool aq_conn; /* true if adding connection for new association */
+};
+
+/* Disconnect (connection) request functions */
+struct spdk_nvmf_fc_ls_del_conn_api_data {
+ struct spdk_nvmf_fc_poller_api_del_connection_args args;
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst;
+ struct spdk_nvmf_fc_association *assoc;
+ bool aq_conn; /* true if deleting AQ connection */
+};
+
+/* used by LS disconnect association cmd handling */
+struct spdk_nvmf_fc_ls_disconn_assoc_api_data {
+ struct spdk_nvmf_fc_nport *tgtport;
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst;
+};
+
+/* used by delete association call */
+struct spdk_nvmf_fc_delete_assoc_api_data {
+ struct spdk_nvmf_fc_poller_api_del_connection_args args;
+ struct spdk_nvmf_fc_association *assoc;
+ bool from_ls_rqst; /* true = request came from an LS request */
+ spdk_nvmf_fc_del_assoc_cb del_assoc_cb;
+ void *del_assoc_cb_data;
+};
+
+struct nvmf_fc_ls_op_ctx {
+ union {
+ struct spdk_nvmf_fc_ls_add_conn_api_data add_conn;
+ struct spdk_nvmf_fc_ls_del_conn_api_data del_conn;
+ struct spdk_nvmf_fc_ls_disconn_assoc_api_data disconn_assoc;
+ struct spdk_nvmf_fc_delete_assoc_api_data del_assoc;
+ } u;
+ struct nvmf_fc_ls_op_ctx *next_op_ctx;
+};
+
+struct spdk_nvmf_fc_poller_api_queue_sync_args {
+ uint64_t u_id;
+ struct spdk_nvmf_fc_hwqp *hwqp;
+ struct spdk_nvmf_fc_poller_api_cb_info cb_info;
+
+ /* Used internally by poller */
+ TAILQ_ENTRY(spdk_nvmf_fc_poller_api_queue_sync_args) link;
+};
+
+/**
+ * The following defines and structures are used to pass messages between the
+ * master thread and the FCT driver.
+ */
+enum spdk_fc_event {
+ SPDK_FC_HW_PORT_INIT,
+ SPDK_FC_HW_PORT_ONLINE,
+ SPDK_FC_HW_PORT_OFFLINE,
+ SPDK_FC_HW_PORT_RESET,
+ SPDK_FC_NPORT_CREATE,
+ SPDK_FC_NPORT_DELETE,
+ SPDK_FC_IT_ADD, /* PRLI */
+ SPDK_FC_IT_DELETE, /* PRLI */
+ SPDK_FC_ABTS_RECV,
+ SPDK_FC_LINK_BREAK,
+ SPDK_FC_HW_PORT_DUMP,
+ SPDK_FC_UNRECOVERABLE_ERR,
+ SPDK_FC_EVENT_MAX,
+};
+
+/**
+ * Arguments to dump assoc id.
+ */
+struct spdk_nvmf_fc_dump_assoc_id_args {
+ uint8_t pport_handle;
+ uint16_t nport_handle;
+ uint32_t assoc_id;
+};
+
+/**
+ * Arguments for HW port init event.
+ */
+struct spdk_nvmf_fc_hw_port_init_args {
+ uint32_t ls_queue_size;
+ spdk_nvmf_fc_lld_hwqp_t ls_queue;
+ uint32_t io_queue_size;
+ uint32_t io_queue_cnt;
+ spdk_nvmf_fc_lld_hwqp_t *io_queues;
+ void *cb_ctx;
+ void *port_ctx;
+ uint8_t port_handle;
+ uint8_t nvme_aq_index; /* io_queue used for nvme admin queue */
+ uint16_t fcp_rq_id; /* Base rq ID of SCSI queue */
+};
+
+/**
+ * Arguments for HW port link break event.
+ */
+struct spdk_nvmf_hw_port_link_break_args {
+ uint8_t port_handle;
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for HW port online event.
+ */
+struct spdk_nvmf_fc_hw_port_online_args {
+ uint8_t port_handle;
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for HW port offline event.
+ */
+struct spdk_nvmf_fc_hw_port_offline_args {
+ uint8_t port_handle;
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for n-port add event.
+ */
+struct spdk_nvmf_fc_nport_create_args {
+ uint8_t port_handle;
+ uint16_t nport_handle;
+ struct spdk_uuid container_uuid; /* UUID of the nports container */
+ struct spdk_uuid nport_uuid; /* Unique UUID for the nport */
+ uint32_t d_id;
+ struct spdk_nvmf_fc_wwn fc_nodename;
+ struct spdk_nvmf_fc_wwn fc_portname;
+ uint32_t subsys_id; /* Subsystem id */
+ char port_id[SPDK_NVMF_PORT_ID_MAX_LEN];
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for n-port delete event.
+ */
+struct spdk_nvmf_fc_nport_delete_args {
+ uint8_t port_handle;
+ uint32_t nport_handle;
+ uint32_t subsys_id; /* Subsystem id */
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for I_T add event.
+ */
+struct spdk_nvmf_fc_hw_i_t_add_args {
+ uint8_t port_handle;
+ uint32_t nport_handle;
+ uint16_t itn_handle;
+ uint32_t rpi;
+ uint32_t s_id;
+ uint32_t initiator_prli_info;
+ uint32_t target_prli_info; /* populated by the SPDK master */
+ struct spdk_nvmf_fc_wwn fc_nodename;
+ struct spdk_nvmf_fc_wwn fc_portname;
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for I_T delete event.
+ */
+struct spdk_nvmf_fc_hw_i_t_delete_args {
+ uint8_t port_handle;
+ uint32_t nport_handle;
+ uint16_t itn_handle; /* Only used by FC LLD driver; unused in SPDK */
+ uint32_t rpi;
+ uint32_t s_id;
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for ABTS event.
+ */
+struct spdk_nvmf_fc_abts_args {
+ uint8_t port_handle;
+ uint32_t nport_handle;
+ uint32_t rpi;
+ uint16_t oxid, rxid;
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for link break event.
+ */
+struct spdk_nvmf_fc_link_break_args {
+ uint8_t port_handle;
+};
+
+/**
+ * Arguments for port reset event.
+ */
+struct spdk_nvmf_fc_hw_port_reset_args {
+ uint8_t port_handle;
+ bool dump_queues;
+ char reason[SPDK_FC_HW_DUMP_REASON_STR_MAX_SIZE];
+ uint32_t **dump_buf;
+ void *cb_ctx;
+};
+
+/**
+ * Arguments for unrecoverable error event
+ */
+struct spdk_nvmf_fc_unrecoverable_error_event_args {
+};
+
+/**
+ * Callback function to the FCT driver.
+ */
+typedef void (*spdk_nvmf_fc_callback)(uint8_t port_handle,
+ enum spdk_fc_event event_type,
+ void *arg, int err);
+
+/**
+ * Enqueue an FCT event to master thread
+ *
+ * \param event_type Type of the event.
+ * \param args Pointer to the argument structure.
+ * \param cb_func Callback function into fc driver.
+ *
+ * \return 0 on success, non-zero on failure.
+ */
+int
+nvmf_fc_master_enqueue_event(enum spdk_fc_event event_type,
+ void *args,
+ spdk_nvmf_fc_callback cb_func);
+
+/*
+ * dump info
+ */
+struct spdk_nvmf_fc_queue_dump_info {
+ char *buffer;
+ int offset;
+};
+#define SPDK_FC_HW_DUMP_BUF_SIZE (10 * 4096)
+
+static inline void
+nvmf_fc_dump_buf_print(struct spdk_nvmf_fc_queue_dump_info *dump_info, char *fmt, ...)
+{
+ uint64_t buffer_size = SPDK_FC_HW_DUMP_BUF_SIZE;
+ int32_t avail = (int32_t)(buffer_size - dump_info->offset);
+
+ if (avail > 0) {
+ va_list ap;
+ int32_t written;
+
+ va_start(ap, fmt);
+ written = vsnprintf(dump_info->buffer + dump_info->offset, avail, fmt, ap);
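+ /* vsnprintf returns the length that would have been written; clamp to the space actually available when truncated. */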
+ if (written >= avail) {
+ dump_info->offset += avail;
+ } else {
+ dump_info->offset += written;
+ }
+ va_end(ap);
+ }
+}
+
+/*
+ * NVMF FC caller callback definitions
+ */
+typedef void (*spdk_nvmf_fc_caller_cb)(void *hwqp, int32_t status, void *args);
+
+struct spdk_nvmf_fc_caller_ctx {
+ void *ctx;
+ spdk_nvmf_fc_caller_cb cb;
+ void *cb_args;
+ TAILQ_ENTRY(spdk_nvmf_fc_caller_ctx) link;
+};
+
+/*
+ * NVMF FC Exchange Info (for debug)
+ */
+struct spdk_nvmf_fc_xchg_info {
+ uint32_t xchg_base;
+ uint32_t xchg_total_count;
+ uint32_t xchg_avail_count;
+ uint32_t send_frame_xchg_id;
+ uint8_t send_frame_seqid;
+};
+
+/*
+ * NVMF FC inline and function prototypes
+ */
+
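+/* Recover the containing FC request from the embedded generic NVMf request. */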
+static inline struct spdk_nvmf_fc_request *
+nvmf_fc_get_fc_req(struct spdk_nvmf_request *req)
+{
+ return (struct spdk_nvmf_fc_request *)
+ ((uintptr_t)req - offsetof(struct spdk_nvmf_fc_request, req));
+}
+
+static inline bool
+nvmf_fc_is_port_dead(struct spdk_nvmf_fc_hwqp *hwqp)
+{
+ switch (hwqp->fc_port->hw_port_status) {
+ case SPDK_FC_PORT_QUIESCED:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool
+nvmf_fc_req_in_xfer(struct spdk_nvmf_fc_request *fc_req)
+{
+ switch (fc_req->state) {
+ case SPDK_NVMF_FC_REQ_READ_XFER:
+ case SPDK_NVMF_FC_REQ_READ_RSP:
+ case SPDK_NVMF_FC_REQ_WRITE_XFER:
+ case SPDK_NVMF_FC_REQ_WRITE_RSP:
+ case SPDK_NVMF_FC_REQ_NONE_RSP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline void
+nvmf_fc_create_trid(struct spdk_nvme_transport_id *trid, uint64_t n_wwn, uint64_t p_wwn)
+{
+ spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_FC);
+ trid->adrfam = SPDK_NVMF_ADRFAM_FC;
+ snprintf(trid->trsvcid, sizeof(trid->trsvcid), "none");
+ snprintf(trid->traddr, sizeof(trid->traddr), "nn-0x%lx:pn-0x%lx", n_wwn, p_wwn);
+}
+
+void nvmf_fc_ls_init(struct spdk_nvmf_fc_port *fc_port);
+
+void nvmf_fc_ls_fini(struct spdk_nvmf_fc_port *fc_port);
+
+void nvmf_fc_handle_ls_rqst(struct spdk_nvmf_fc_ls_rqst *ls_rqst);
+void nvmf_fc_ls_add_conn_failure(
+ struct spdk_nvmf_fc_association *assoc,
+ struct spdk_nvmf_fc_ls_rqst *ls_rqst,
+ struct spdk_nvmf_fc_conn *fc_conn,
+ bool aq_conn);
+
+void nvmf_fc_init_hwqp(struct spdk_nvmf_fc_port *fc_port, struct spdk_nvmf_fc_hwqp *hwqp);
+
+void nvmf_fc_init_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp);
+
+struct spdk_nvmf_fc_conn *nvmf_fc_hwqp_find_fc_conn(struct spdk_nvmf_fc_hwqp *hwqp,
+ uint64_t conn_id);
+
+void nvmf_fc_hwqp_reinit_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp, void *queues_curr);
+
+struct spdk_nvmf_fc_port *nvmf_fc_port_lookup(uint8_t port_hdl);
+
+bool nvmf_fc_port_is_offline(struct spdk_nvmf_fc_port *fc_port);
+
+int nvmf_fc_port_set_offline(struct spdk_nvmf_fc_port *fc_port);
+
+bool nvmf_fc_port_is_online(struct spdk_nvmf_fc_port *fc_port);
+
+int nvmf_fc_port_set_online(struct spdk_nvmf_fc_port *fc_port);
+
+int nvmf_fc_rport_set_state(struct spdk_nvmf_fc_remote_port_info *rport,
+ enum spdk_nvmf_fc_object_state state);
+
+void nvmf_fc_port_add(struct spdk_nvmf_fc_port *fc_port);
+
+int nvmf_fc_port_add_nport(struct spdk_nvmf_fc_port *fc_port,
+ struct spdk_nvmf_fc_nport *nport);
+
+int nvmf_fc_port_remove_nport(struct spdk_nvmf_fc_port *fc_port,
+ struct spdk_nvmf_fc_nport *nport);
+
+struct spdk_nvmf_fc_nport *nvmf_fc_nport_find(uint8_t port_hdl, uint16_t nport_hdl);
+
+int nvmf_fc_nport_set_state(struct spdk_nvmf_fc_nport *nport,
+ enum spdk_nvmf_fc_object_state state);
+
+bool nvmf_fc_nport_add_rem_port(struct spdk_nvmf_fc_nport *nport,
+ struct spdk_nvmf_fc_remote_port_info *rem_port);
+
+bool nvmf_fc_nport_remove_rem_port(struct spdk_nvmf_fc_nport *nport,
+ struct spdk_nvmf_fc_remote_port_info *rem_port);
+
+bool nvmf_fc_nport_has_no_rport(struct spdk_nvmf_fc_nport *nport);
+
+int nvmf_fc_assoc_set_state(struct spdk_nvmf_fc_association *assoc,
+ enum spdk_nvmf_fc_object_state state);
+
+int nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport,
+ uint64_t assoc_id, bool send_abts, bool backend_initiated,
+ spdk_nvmf_fc_del_assoc_cb del_assoc_cb,
+ void *cb_data);
+
+bool nvmf_ctrlr_is_on_nport(uint8_t port_hdl, uint16_t nport_hdl,
+ struct spdk_nvmf_ctrlr *ctrlr);
+
+void nvmf_fc_assign_queue_to_master_thread(struct spdk_nvmf_fc_hwqp *hwqp);
+
+void nvmf_fc_poll_group_add_hwqp(struct spdk_nvmf_fc_hwqp *hwqp);
+
+void nvmf_fc_poll_group_remove_hwqp(struct spdk_nvmf_fc_hwqp *hwqp);
+
+int nvmf_fc_hwqp_set_online(struct spdk_nvmf_fc_hwqp *hwqp);
+
+int nvmf_fc_hwqp_set_offline(struct spdk_nvmf_fc_hwqp *hwqp);
+
+uint32_t nvmf_fc_get_prli_service_params(void);
+
+void nvmf_fc_handle_abts_frame(struct spdk_nvmf_fc_nport *nport, uint16_t rpi, uint16_t oxid,
+ uint16_t rxid);
+
+void nvmf_fc_request_abort(struct spdk_nvmf_fc_request *fc_req, bool send_abts,
+ spdk_nvmf_fc_caller_cb cb, void *cb_args);
+
+struct spdk_nvmf_tgt *nvmf_fc_get_tgt(void);
+
+struct spdk_thread *nvmf_fc_get_master_thread(void);
+
+/*
+ * These functions are called by low level FC driver
+ */
+
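+/* Recover the containing FC connection from the embedded generic qpair. */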
+static inline struct spdk_nvmf_fc_conn *
+nvmf_fc_get_conn(struct spdk_nvmf_qpair *qpair)
+{
+ return (struct spdk_nvmf_fc_conn *)
+ ((uintptr_t)qpair - offsetof(struct spdk_nvmf_fc_conn, qpair));
+}
+
+static inline uint16_t
+nvmf_fc_advance_conn_sqhead(struct spdk_nvmf_qpair *qpair)
+{
+ /* advance sq_head pointer - wrap if needed */
+ qpair->sq_head = (qpair->sq_head == qpair->sq_head_max) ?
+ 0 : (qpair->sq_head + 1);
+ return qpair->sq_head;
+}
+
+static inline bool
+nvmf_fc_use_send_frame(struct spdk_nvmf_request *req)
+{
+ /* For now use for only keepalives. */
+ if (req->qpair->qid == 0 &&
+ (req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_KEEP_ALIVE)) {
+ return true;
+ }
+ return false;
+}
+
+enum spdk_nvmf_fc_poller_api_ret nvmf_fc_poller_api_func(
+ struct spdk_nvmf_fc_hwqp *hwqp,
+ enum spdk_nvmf_fc_poller_api api,
+ void *api_args);
+
+int nvmf_fc_hwqp_process_frame(struct spdk_nvmf_fc_hwqp *hwqp, uint32_t buff_idx,
+ struct spdk_nvmf_fc_frame_hdr *frame,
+ struct spdk_nvmf_fc_buffer_desc *buffer, uint32_t plen);
+
+void nvmf_fc_hwqp_process_pending_reqs(struct spdk_nvmf_fc_hwqp *hwqp);
+
+void nvmf_fc_hwqp_process_pending_ls_rqsts(struct spdk_nvmf_fc_hwqp *hwqp);
+
+void nvmf_fc_request_set_state(struct spdk_nvmf_fc_request *fc_req,
+ enum spdk_nvmf_fc_request_state state);
+
+char *nvmf_fc_request_get_state_str(int state);
+
+void _nvmf_fc_request_free(struct spdk_nvmf_fc_request *fc_req);
+
+void nvmf_fc_request_abort_complete(void *arg1);
+
+bool nvmf_fc_send_ersp_required(struct spdk_nvmf_fc_request *fc_req,
+ uint32_t rsp_cnt, uint32_t xfer_len);
+
+int nvmf_fc_handle_rsp(struct spdk_nvmf_fc_request *req);
+
+#endif
diff --git a/src/spdk/lib/nvmf/nvmf_internal.h b/src/spdk/lib/nvmf/nvmf_internal.h
new file mode 100644
index 000000000..f1f3837d5
--- /dev/null
+++ b/src/spdk/lib/nvmf/nvmf_internal.h
@@ -0,0 +1,371 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NVMF_INTERNAL_H__
+#define __NVMF_INTERNAL_H__
+
+#include "spdk/stdinc.h"
+
+#include "spdk/likely.h"
+#include "spdk/nvmf.h"
+#include "spdk/nvmf_cmd.h"
+#include "spdk/nvmf_transport.h"
+#include "spdk/nvmf_spec.h"
+#include "spdk/assert.h"
+#include "spdk/bdev.h"
+#include "spdk/queue.h"
+#include "spdk/util.h"
+#include "spdk/thread.h"
+
+#define NVMF_MAX_ASYNC_EVENTS (4)
+
+enum spdk_nvmf_subsystem_state {
+ SPDK_NVMF_SUBSYSTEM_INACTIVE = 0,
+ SPDK_NVMF_SUBSYSTEM_ACTIVATING,
+ SPDK_NVMF_SUBSYSTEM_ACTIVE,
+ SPDK_NVMF_SUBSYSTEM_PAUSING,
+ SPDK_NVMF_SUBSYSTEM_PAUSED,
+ SPDK_NVMF_SUBSYSTEM_RESUMING,
+ SPDK_NVMF_SUBSYSTEM_DEACTIVATING,
+};
+
+struct spdk_nvmf_tgt {
+ char name[NVMF_TGT_NAME_MAX_LENGTH];
+
+ pthread_mutex_t mutex;
+
+ uint64_t discovery_genctr;
+
+ uint32_t max_subsystems;
+
+ /* Array of subsystem pointers of size max_subsystems indexed by sid */
+ struct spdk_nvmf_subsystem **subsystems;
+
+ TAILQ_HEAD(, spdk_nvmf_transport) transports;
+ TAILQ_HEAD(, spdk_nvmf_poll_group) poll_groups;
+
+ /* Used for round-robin assignment of connections to poll groups */
+ struct spdk_nvmf_poll_group *next_poll_group;
+
+ spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn;
+ void *destroy_cb_arg;
+
+ TAILQ_ENTRY(spdk_nvmf_tgt) link;
+};
+
+struct spdk_nvmf_host {
+ char nqn[SPDK_NVMF_NQN_MAX_LEN + 1];
+ TAILQ_ENTRY(spdk_nvmf_host) link;
+};
+
+struct spdk_nvmf_subsystem_listener {
+ struct spdk_nvmf_subsystem *subsystem;
+ spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn;
+ void *cb_arg;
+ struct spdk_nvme_transport_id *trid;
+ struct spdk_nvmf_transport *transport;
+ TAILQ_ENTRY(spdk_nvmf_subsystem_listener) link;
+};
+
+/* Maximum number of registrants supported per namespace */
+#define SPDK_NVMF_MAX_NUM_REGISTRANTS 16
+
+struct spdk_nvmf_registrant_info {
+ uint64_t rkey;
+ char host_uuid[SPDK_UUID_STRING_LEN];
+};
+
+struct spdk_nvmf_reservation_info {
+ bool ptpl_activated;
+ enum spdk_nvme_reservation_type rtype;
+ uint64_t crkey;
+ char bdev_uuid[SPDK_UUID_STRING_LEN];
+ char holder_uuid[SPDK_UUID_STRING_LEN];
+ uint32_t num_regs;
+ struct spdk_nvmf_registrant_info registrants[SPDK_NVMF_MAX_NUM_REGISTRANTS];
+};
+
+struct spdk_nvmf_subsystem_pg_ns_info {
+ struct spdk_io_channel *channel;
+ struct spdk_uuid uuid;
+ /* current reservation key, no reservation if the value is 0 */
+ uint64_t crkey;
+ /* reservation type */
+ enum spdk_nvme_reservation_type rtype;
+ /* Host ID which holds the reservation */
+ struct spdk_uuid holder_id;
+ /* Host IDs of the registrants associated with this namespace */
+ struct spdk_uuid reg_hostid[SPDK_NVMF_MAX_NUM_REGISTRANTS];
+ uint64_t num_blocks;
+};
+
+typedef void(*spdk_nvmf_poll_group_mod_done)(void *cb_arg, int status);
+
+struct spdk_nvmf_subsystem_poll_group {
+ /* Array of namespace information for each namespace indexed by nsid - 1 */
+ struct spdk_nvmf_subsystem_pg_ns_info *ns_info;
+ uint32_t num_ns;
+
+ uint64_t io_outstanding;
+ spdk_nvmf_poll_group_mod_done cb_fn;
+ void *cb_arg;
+
+ enum spdk_nvmf_subsystem_state state;
+
+ TAILQ_HEAD(, spdk_nvmf_request) queued;
+};
+
+struct spdk_nvmf_registrant {
+ TAILQ_ENTRY(spdk_nvmf_registrant) link;
+ struct spdk_uuid hostid;
+ /* Registration key */
+ uint64_t rkey;
+};
+
+struct spdk_nvmf_ns {
+ uint32_t nsid;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *desc;
+ struct spdk_nvmf_ns_opts opts;
+ /* reservation notification mask */
+ uint32_t mask;
+ /* generation code */
+ uint32_t gen;
+ /* registrants head */
+ TAILQ_HEAD(, spdk_nvmf_registrant) registrants;
+ /* current reservation key */
+ uint64_t crkey;
+ /* reservation type */
+ enum spdk_nvme_reservation_type rtype;
+ /* current reservation holder, only valid if reservation type can only have one holder */
+ struct spdk_nvmf_registrant *holder;
+ /* Persist Through Power Loss file which contains the persistent reservation */
+ char *ptpl_file;
+ /* Persist Through Power Loss feature is enabled */
+ bool ptpl_activated;
+};
+
+struct spdk_nvmf_ctrlr_feat {
+ union spdk_nvme_feat_arbitration arbitration;
+ union spdk_nvme_feat_power_management power_management;
+ union spdk_nvme_feat_error_recovery error_recovery;
+ union spdk_nvme_feat_volatile_write_cache volatile_write_cache;
+ union spdk_nvme_feat_number_of_queues number_of_queues;
+ union spdk_nvme_feat_write_atomicity write_atomicity;
+ union spdk_nvme_feat_async_event_configuration async_event_configuration;
+ union spdk_nvme_feat_keep_alive_timer keep_alive_timer;
+};
+
+/*
+ * NVMf reservation notification log page.
+ */
+struct spdk_nvmf_reservation_log {
+ struct spdk_nvme_reservation_notification_log log;
+ TAILQ_ENTRY(spdk_nvmf_reservation_log) link;
+ struct spdk_nvmf_ctrlr *ctrlr;
+};
+
+/*
+ * This structure represents an NVMe-oF controller,
+ * which is like a "session" in networking terms.
+ */
+struct spdk_nvmf_ctrlr {
+ uint16_t cntlid;
+ char hostnqn[SPDK_NVMF_NQN_MAX_LEN + 1];
+ struct spdk_nvmf_subsystem *subsys;
+
+ struct spdk_nvmf_ctrlr_data cdata;
+
+ struct spdk_nvmf_registers vcprop;
+
+ struct spdk_nvmf_ctrlr_feat feat;
+
+ struct spdk_nvmf_qpair *admin_qpair;
+ struct spdk_thread *thread;
+ struct spdk_bit_array *qpair_mask;
+
+ struct spdk_nvmf_request *aer_req[NVMF_MAX_ASYNC_EVENTS];
+ union spdk_nvme_async_event_completion notice_event;
+ union spdk_nvme_async_event_completion reservation_event;
+ uint8_t nr_aer_reqs;
+ struct spdk_uuid hostid;
+
+ uint16_t changed_ns_list_count;
+ struct spdk_nvme_ns_list changed_ns_list;
+ uint64_t log_page_count;
+ uint8_t num_avail_log_pages;
+ TAILQ_HEAD(log_page_head, spdk_nvmf_reservation_log) log_head;
+
+ /* Time to trigger keep-alive: poller_time = now_tick + period */
+ uint64_t last_keep_alive_tick;
+ struct spdk_poller *keep_alive_poller;
+
+ bool dif_insert_or_strip;
+
+ TAILQ_ENTRY(spdk_nvmf_ctrlr) link;
+};
+
+struct spdk_nvmf_subsystem {
+ struct spdk_thread *thread;
+ uint32_t id;
+ enum spdk_nvmf_subsystem_state state;
+
+ char subnqn[SPDK_NVMF_NQN_MAX_LEN + 1];
+ enum spdk_nvmf_subtype subtype;
+ uint16_t next_cntlid;
+ bool allow_any_host;
+ bool allow_any_listener;
+
+ struct spdk_nvmf_tgt *tgt;
+
+ char sn[SPDK_NVME_CTRLR_SN_LEN + 1];
+ char mn[SPDK_NVME_CTRLR_MN_LEN + 1];
+
+ /* Array of pointers to namespaces of size max_nsid indexed by nsid - 1 */
+ struct spdk_nvmf_ns **ns;
+ uint32_t max_nsid;
+ /* The maximum nsid allowed for this subsystem */
+ uint32_t max_allowed_nsid;
+
+ TAILQ_HEAD(, spdk_nvmf_ctrlr) ctrlrs;
+ TAILQ_HEAD(, spdk_nvmf_host) hosts;
+ TAILQ_HEAD(, spdk_nvmf_subsystem_listener) listeners;
+
+ TAILQ_ENTRY(spdk_nvmf_subsystem) entries;
+};
+
+int nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_transport *transport);
+int nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem);
+int nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg);
+void nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg);
+void nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg);
+void nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group,
+ struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg);
+
+void nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, const char *hostnqn,
+ struct iovec *iov,
+ uint32_t iovcnt, uint64_t offset, uint32_t length);
+
+void nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr);
+int nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req);
+int nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req);
+int nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req);
+bool nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr);
+bool nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr);
+void nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid);
+
+void nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata,
+ bool dif_insert_or_strip);
+int nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req);
+int nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req);
+int nvmf_bdev_ctrlr_compare_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req);
+int nvmf_bdev_ctrlr_compare_and_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *cmp_req, struct spdk_nvmf_request *write_req);
+int nvmf_bdev_ctrlr_write_zeroes_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req);
+int nvmf_bdev_ctrlr_flush_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req);
+int nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req);
+int nvmf_bdev_ctrlr_nvme_passthru_io(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc,
+ struct spdk_io_channel *ch, struct spdk_nvmf_request *req);
+bool nvmf_bdev_ctrlr_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_nvme_cmd *cmd,
+ struct spdk_dif_ctx *dif_ctx);
+
+int nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_ctrlr *ctrlr);
+void nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_ctrlr *ctrlr);
+void nvmf_subsystem_remove_all_listeners(struct spdk_nvmf_subsystem *subsystem,
+ bool stop);
+struct spdk_nvmf_ctrlr *nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem,
+ uint16_t cntlid);
+struct spdk_nvmf_subsystem_listener *nvmf_subsystem_find_listener(
+ struct spdk_nvmf_subsystem *subsystem,
+ const struct spdk_nvme_transport_id *trid);
+struct spdk_nvmf_listener *nvmf_transport_find_listener(
+ struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid);
+
+int nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr);
+void nvmf_ctrlr_async_event_reservation_notification(struct spdk_nvmf_ctrlr *ctrlr);
+void nvmf_ns_reservation_request(void *ctx);
+void nvmf_ctrlr_reservation_notice_log(struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvmf_ns *ns,
+ enum spdk_nvme_reservation_notification_log_page_type type);
+
+/*
+ * Aborting an AER is done on a per-controller basis and sends a completion for the AER
+ * to the host. Call this function when recovering in error paths where it is acceptable
+ * for the host to send a subsequent AER.
+ */
+void nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr);
+
+/*
+ * Freeing an AER simply releases the RDMA resources for the AER without informing the host.
+ * Call this function when deleting a qpair to make sure the qpair is completely empty before
+ * freeing the request. The AER is freed without sending a completion in order to prevent the
+ * host from sending another AER.
+ */
+void nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair);
+
+int nvmf_ctrlr_abort_request(struct spdk_nvmf_request *req);
+
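+/*
+ * Bounds-check sketch: with max_nsid == 4, nsid == 0 wraps to UINT32_MAX in the
+ * "nsid - 1 >= max_nsid" test below and returns NULL, while nsid 1..4 map to
+ * ns[0]..ns[3].
+ */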
+static inline struct spdk_nvmf_ns *
+_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid)
+{
+ /* NOTE: This implicitly also checks for 0, since 0 - 1 wraps around to UINT32_MAX. */
+ if (spdk_unlikely(nsid - 1 >= subsystem->max_nsid)) {
+ return NULL;
+ }
+
+ return subsystem->ns[nsid - 1];
+}
+
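+/* In NVMe-oF, the admin queue is always the qpair with QID 0. */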
+static inline bool
+nvmf_qpair_is_admin_queue(struct spdk_nvmf_qpair *qpair)
+{
+ return qpair->qid == 0;
+}
+
+#endif /* __NVMF_INTERNAL_H__ */
diff --git a/src/spdk/lib/nvmf/nvmf_rpc.c b/src/spdk/lib/nvmf/nvmf_rpc.c
new file mode 100644
index 000000000..5dc9f42f0
--- /dev/null
+++ b/src/spdk/lib/nvmf/nvmf_rpc.c
@@ -0,0 +1,2012 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2018-2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/bdev.h"
+#include "spdk/log.h"
+#include "spdk/rpc.h"
+#include "spdk/env.h"
+#include "spdk/nvme.h"
+#include "spdk/nvmf.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+#include "spdk_internal/assert.h"
+
+#include "nvmf_internal.h"
+
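+/*
+ * Writes a buffer as an upper-case hex JSON string; for example, an 8-byte EUI-64 of
+ * { 0xAB, 0xCD, 0xEF, 0x01, 0x23, 0x45, 0x67, 0x89 } is written as "ABCDEF0123456789".
+ */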
+static int
+json_write_hex_str(struct spdk_json_write_ctx *w, const void *data, size_t size)
+{
+ static const char hex_char[16] = "0123456789ABCDEF";
+ const uint8_t *buf = data;
+ char *str, *out;
+ int rc;
+
+ str = malloc(size * 2 + 1);
+ if (str == NULL) {
+ return -1;
+ }
+
+ out = str;
+ while (size--) {
+ unsigned byte = *buf++;
+
+ out[0] = hex_char[(byte >> 4) & 0xF];
+ out[1] = hex_char[byte & 0xF];
+
+ out += 2;
+ }
+ *out = '\0';
+
+ rc = spdk_json_write_string(w, str);
+ free(str);
+
+ return rc;
+}
+
+static int
+hex_nybble_to_num(char c)
+{
+ if (c >= '0' && c <= '9') {
+ return c - '0';
+ }
+
+ if (c >= 'a' && c <= 'f') {
+ return c - 'a' + 0xA;
+ }
+
+ if (c >= 'A' && c <= 'F') {
+ return c - 'A' + 0xA;
+ }
+
+ return -1;
+}
+
+static int
+hex_byte_to_num(const char *str)
+{
+ int hi, lo;
+
+ hi = hex_nybble_to_num(str[0]);
+ if (hi < 0) {
+ return hi;
+ }
+
+ lo = hex_nybble_to_num(str[1]);
+ if (lo < 0) {
+ return lo;
+ }
+
+ return hi * 16 + lo;
+}
+
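+/*
+ * For example, decode_hex_string_be("0011AABB", out, 4) fills out with
+ * { 0x00, 0x11, 0xAA, 0xBB }; both upper- and lower-case hex digits are accepted.
+ */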
+static int
+decode_hex_string_be(const char *str, uint8_t *out, size_t size)
+{
+ size_t i;
+
+ /* Decode a string in "ABCDEF012345" format to its binary representation */
+ for (i = 0; i < size; i++) {
+ int num = hex_byte_to_num(str);
+
+ if (num < 0) {
+ /* Invalid hex byte or end of string */
+ return -1;
+ }
+
+ out[i] = (uint8_t)num;
+ str += 2;
+ }
+
+ if (i != size || *str != '\0') {
+ /* Length mismatch */
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+decode_ns_nguid(const struct spdk_json_val *val, void *out)
+{
+ char *str = NULL;
+ int rc;
+
+ rc = spdk_json_decode_string(val, &str);
+ if (rc == 0) {
+ /* 16-byte NGUID */
+ rc = decode_hex_string_be(str, out, 16);
+ }
+
+ free(str);
+ return rc;
+}
+
+static int
+decode_ns_eui64(const struct spdk_json_val *val, void *out)
+{
+ char *str = NULL;
+ int rc;
+
+ rc = spdk_json_decode_string(val, &str);
+ if (rc == 0) {
+ /* 8-byte EUI-64 */
+ rc = decode_hex_string_be(str, out, 8);
+ }
+
+ free(str);
+ return rc;
+}
+
+static int
+decode_ns_uuid(const struct spdk_json_val *val, void *out)
+{
+ char *str = NULL;
+ int rc;
+
+ rc = spdk_json_decode_string(val, &str);
+ if (rc == 0) {
+ rc = spdk_uuid_parse(out, str);
+ }
+
+ free(str);
+ return rc;
+}
+
+struct rpc_get_subsystem {
+ char *tgt_name;
+};
+
+static const struct spdk_json_object_decoder rpc_get_subsystem_decoders[] = {
+ {"tgt_name", offsetof(struct rpc_get_subsystem, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+dump_nvmf_subsystem(struct spdk_json_write_ctx *w, struct spdk_nvmf_subsystem *subsystem)
+{
+ struct spdk_nvmf_host *host;
+ struct spdk_nvmf_subsystem_listener *listener;
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem));
+ spdk_json_write_name(w, "subtype");
+ if (spdk_nvmf_subsystem_get_type(subsystem) == SPDK_NVMF_SUBTYPE_NVME) {
+ spdk_json_write_string(w, "NVMe");
+ } else {
+ spdk_json_write_string(w, "Discovery");
+ }
+
+ spdk_json_write_named_array_begin(w, "listen_addresses");
+
+ for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL;
+ listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) {
+ const struct spdk_nvme_transport_id *trid;
+ const char *adrfam;
+
+ trid = spdk_nvmf_subsystem_listener_get_trid(listener);
+
+ spdk_json_write_object_begin(w);
+ adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
+ if (adrfam == NULL) {
+ adrfam = "unknown";
+ }
+ /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */
+ spdk_json_write_named_string(w, "transport", trid->trstring);
+ spdk_json_write_named_string(w, "trtype", trid->trstring);
+ spdk_json_write_named_string(w, "adrfam", adrfam);
+ spdk_json_write_named_string(w, "traddr", trid->traddr);
+ spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_json_write_named_bool(w, "allow_any_host",
+ spdk_nvmf_subsystem_get_allow_any_host(subsystem));
+
+ spdk_json_write_named_array_begin(w, "hosts");
+
+ for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL;
+ host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "nqn", spdk_nvmf_host_get_nqn(host));
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+
+ if (spdk_nvmf_subsystem_get_type(subsystem) == SPDK_NVMF_SUBTYPE_NVME) {
+ struct spdk_nvmf_ns *ns;
+ struct spdk_nvmf_ns_opts ns_opts;
+ uint32_t max_namespaces;
+
+ spdk_json_write_named_string(w, "serial_number", spdk_nvmf_subsystem_get_sn(subsystem));
+
+ spdk_json_write_named_string(w, "model_number", spdk_nvmf_subsystem_get_mn(subsystem));
+
+ max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem);
+ if (max_namespaces != 0) {
+ spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces);
+ }
+
+ spdk_json_write_named_array_begin(w, "namespaces");
+ for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL;
+ ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) {
+ spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts));
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_int32(w, "nsid", spdk_nvmf_ns_get_id(ns));
+ spdk_json_write_named_string(w, "bdev_name",
+ spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns)));
+ /* NOTE: "name" is kept for compatibility only; new code should use bdev_name. */
+ spdk_json_write_named_string(w, "name",
+ spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns)));
+
+ if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) {
+ spdk_json_write_name(w, "nguid");
+ json_write_hex_str(w, ns_opts.nguid, sizeof(ns_opts.nguid));
+ }
+
+ if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) {
+ spdk_json_write_name(w, "eui64");
+ json_write_hex_str(w, ns_opts.eui64, sizeof(ns_opts.eui64));
+ }
+
+ if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) {
+ char uuid_str[SPDK_UUID_STRING_LEN];
+
+ spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid);
+ spdk_json_write_named_string(w, "uuid", uuid_str);
+ }
+
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+ }
+ spdk_json_write_object_end(w);
+}
+
+static void
+rpc_nvmf_get_subsystems(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_get_subsystem req = { 0 };
+ struct spdk_json_write_ctx *w;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ if (params) {
+ if (spdk_json_decode_object(params, rpc_get_subsystem_decoders,
+ SPDK_COUNTOF(rpc_get_subsystem_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ return;
+ }
+ }
+
+ tgt = spdk_nvmf_get_tgt(req.tgt_name);
+ if (!tgt) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ free(req.tgt_name);
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ subsystem = spdk_nvmf_subsystem_get_first(tgt);
+ while (subsystem) {
+ dump_nvmf_subsystem(w, subsystem);
+ subsystem = spdk_nvmf_subsystem_get_next(subsystem);
+ }
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+ free(req.tgt_name);
+}
+SPDK_RPC_REGISTER("nvmf_get_subsystems", rpc_nvmf_get_subsystems, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_get_subsystems, get_nvmf_subsystems)
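+
+/*
+ * Illustrative nvmf_get_subsystems reply entry (field names follow dump_nvmf_subsystem()
+ * above; the values here are made up):
+ *
+ * { "nqn": "nqn.2016-06.io.spdk:cnode1", "subtype": "NVMe",
+ *   "listen_addresses": [ { "transport": "TCP", "trtype": "TCP", "adrfam": "IPv4",
+ *                           "traddr": "127.0.0.1", "trsvcid": "4420" } ],
+ *   "allow_any_host": true, "hosts": [],
+ *   "serial_number": "SPDK001", "model_number": "SPDK Controller",
+ *   "namespaces": [ { "nsid": 1, "bdev_name": "Malloc0", "name": "Malloc0" } ] }
+ */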
+
+struct rpc_subsystem_create {
+ char *nqn;
+ char *serial_number;
+ char *model_number;
+ char *tgt_name;
+ uint32_t max_namespaces;
+ bool allow_any_host;
+};
+
+static const struct spdk_json_object_decoder rpc_subsystem_create_decoders[] = {
+ {"nqn", offsetof(struct rpc_subsystem_create, nqn), spdk_json_decode_string},
+ {"serial_number", offsetof(struct rpc_subsystem_create, serial_number), spdk_json_decode_string, true},
+ {"model_number", offsetof(struct rpc_subsystem_create, model_number), spdk_json_decode_string, true},
+ {"tgt_name", offsetof(struct rpc_subsystem_create, tgt_name), spdk_json_decode_string, true},
+ {"max_namespaces", offsetof(struct rpc_subsystem_create, max_namespaces), spdk_json_decode_uint32, true},
+ {"allow_any_host", offsetof(struct rpc_subsystem_create, allow_any_host), spdk_json_decode_bool, true},
+};
+
+static void
+rpc_nvmf_subsystem_started(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+
+ if (!status) {
+ struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ } else {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Subsystem %s start failed",
+ subsystem->subnqn);
+ spdk_nvmf_subsystem_destroy(subsystem);
+ }
+}
+
+static void
+rpc_nvmf_create_subsystem(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_subsystem_create *req;
+ struct spdk_nvmf_subsystem *subsystem = NULL;
+ struct spdk_nvmf_tgt *tgt;
+ int rc = -1;
+
+ req = calloc(1, sizeof(*req));
+ if (!req) {
+ SPDK_ERRLOG("Memory allocation failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation failed");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, rpc_subsystem_create_decoders,
+ SPDK_COUNTOF(rpc_subsystem_create_decoders),
+ req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ goto cleanup;
+ }
+
+ tgt = spdk_nvmf_get_tgt(req->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find target %s\n", req->tgt_name);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find target %s", req->tgt_name);
+ goto cleanup;
+ }
+
+ subsystem = spdk_nvmf_subsystem_create(tgt, req->nqn, SPDK_NVMF_SUBTYPE_NVME,
+ req->max_namespaces);
+ if (!subsystem) {
+ SPDK_ERRLOG("Unable to create subsystem %s\n", req->nqn);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to create subsystem %s", req->nqn);
+ goto cleanup;
+ }
+
+ if (req->serial_number) {
+ if (spdk_nvmf_subsystem_set_sn(subsystem, req->serial_number)) {
+ SPDK_ERRLOG("Subsystem %s: invalid serial number '%s'\n", req->nqn, req->serial_number);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid SN %s", req->serial_number);
+ goto cleanup;
+ }
+ }
+
+ if (req->model_number) {
+ if (spdk_nvmf_subsystem_set_mn(subsystem, req->model_number)) {
+ SPDK_ERRLOG("Subsystem %s: invalid model number '%s'\n", req->nqn, req->model_number);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid MN %s", req->model_number);
+ goto cleanup;
+ }
+ }
+
+ spdk_nvmf_subsystem_set_allow_any_host(subsystem, req->allow_any_host);
+
+ rc = spdk_nvmf_subsystem_start(subsystem,
+ rpc_nvmf_subsystem_started,
+ request);
+
+cleanup:
+ free(req->nqn);
+ free(req->tgt_name);
+ free(req->serial_number);
+ free(req->model_number);
+ free(req);
+
+ if (rc && subsystem) {
+ spdk_nvmf_subsystem_destroy(subsystem);
+ }
+}
+SPDK_RPC_REGISTER("nvmf_create_subsystem", rpc_nvmf_create_subsystem, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_create_subsystem, nvmf_subsystem_create)
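+
+/*
+ * Example nvmf_create_subsystem request parameters (keys match
+ * rpc_subsystem_create_decoders above; the values are illustrative only):
+ *
+ * { "nqn": "nqn.2016-06.io.spdk:cnode1", "serial_number": "SPDK001",
+ *   "model_number": "SPDK Controller", "max_namespaces": 32, "allow_any_host": false }
+ */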
+
+struct rpc_delete_subsystem {
+ char *nqn;
+ char *tgt_name;
+};
+
+static void
+free_rpc_delete_subsystem(struct rpc_delete_subsystem *r)
+{
+ free(r->nqn);
+ free(r->tgt_name);
+}
+
+static void
+rpc_nvmf_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct spdk_jsonrpc_request *request = cb_arg;
+ struct spdk_json_write_ctx *w;
+
+ nvmf_subsystem_remove_all_listeners(subsystem, true);
+ spdk_nvmf_subsystem_destroy(subsystem);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static const struct spdk_json_object_decoder rpc_delete_subsystem_decoders[] = {
+ {"nqn", offsetof(struct rpc_delete_subsystem, nqn), spdk_json_decode_string},
+ {"tgt_name", offsetof(struct rpc_delete_subsystem, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_nvmf_delete_subsystem(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_subsystem req = { 0 };
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ if (spdk_json_decode_object(params, rpc_delete_subsystem_decoders,
+ SPDK_COUNTOF(rpc_delete_subsystem_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ if (req.nqn == NULL) {
+ SPDK_ERRLOG("missing name param\n");
+ goto invalid;
+ }
+
+ tgt = spdk_nvmf_get_tgt(req.tgt_name);
+ if (!tgt) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ goto invalid_custom_response;
+ }
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, req.nqn);
+ if (!subsystem) {
+ goto invalid;
+ }
+
+ free_rpc_delete_subsystem(&req);
+
+ spdk_nvmf_subsystem_stop(subsystem,
+ rpc_nvmf_subsystem_stopped,
+ request);
+
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+invalid_custom_response:
+ free_rpc_delete_subsystem(&req);
+}
+SPDK_RPC_REGISTER("nvmf_delete_subsystem", rpc_nvmf_delete_subsystem, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_delete_subsystem, delete_nvmf_subsystem)
+
+struct rpc_listen_address {
+ char *transport;
+ char *adrfam;
+ char *traddr;
+ char *trsvcid;
+};
+
+#define RPC_MAX_LISTEN_ADDRESSES 255
+#define RPC_MAX_NAMESPACES 255
+
+struct rpc_listen_addresses {
+ size_t num_listen_address;
+ struct rpc_listen_address addresses[RPC_MAX_LISTEN_ADDRESSES];
+};
+
+static const struct spdk_json_object_decoder rpc_listen_address_decoders[] = {
+ /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */
+ {"transport", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true},
+ {"trtype", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true},
+ {"adrfam", offsetof(struct rpc_listen_address, adrfam), spdk_json_decode_string, true},
+ {"traddr", offsetof(struct rpc_listen_address, traddr), spdk_json_decode_string},
+ {"trsvcid", offsetof(struct rpc_listen_address, trsvcid), spdk_json_decode_string},
+};
+
+static int
+decode_rpc_listen_address(const struct spdk_json_val *val, void *out)
+{
+ struct rpc_listen_address *req = (struct rpc_listen_address *)out;
+ if (spdk_json_decode_object(val, rpc_listen_address_decoders,
+ SPDK_COUNTOF(rpc_listen_address_decoders),
+ req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ return -1;
+ }
+ return 0;
+}
+
+static void
+free_rpc_listen_address(struct rpc_listen_address *r)
+{
+ free(r->transport);
+ free(r->adrfam);
+ free(r->traddr);
+ free(r->trsvcid);
+}
+
+enum nvmf_rpc_listen_op {
+ NVMF_RPC_LISTEN_ADD,
+ NVMF_RPC_LISTEN_REMOVE,
+};
+
+struct nvmf_rpc_listener_ctx {
+ char *nqn;
+ char *tgt_name;
+ struct spdk_nvmf_tgt *tgt;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct rpc_listen_address address;
+
+ struct spdk_jsonrpc_request *request;
+ struct spdk_nvme_transport_id trid;
+ enum nvmf_rpc_listen_op op;
+ bool response_sent;
+};
+
+static const struct spdk_json_object_decoder nvmf_rpc_listener_decoder[] = {
+ {"nqn", offsetof(struct nvmf_rpc_listener_ctx, nqn), spdk_json_decode_string},
+ {"listen_address", offsetof(struct nvmf_rpc_listener_ctx, address), decode_rpc_listen_address},
+ {"tgt_name", offsetof(struct nvmf_rpc_listener_ctx, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+nvmf_rpc_listener_ctx_free(struct nvmf_rpc_listener_ctx *ctx)
+{
+ free(ctx->nqn);
+ free(ctx->tgt_name);
+ free_rpc_listen_address(&ctx->address);
+ free(ctx);
+}
+
+static void
+nvmf_rpc_listen_resumed(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct nvmf_rpc_listener_ctx *ctx = cb_arg;
+ struct spdk_jsonrpc_request *request;
+ struct spdk_json_write_ctx *w;
+
+ request = ctx->request;
+ if (ctx->response_sent) {
+ /* If an error occurred, the response has already been sent. */
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+
+ nvmf_rpc_listener_ctx_free(ctx);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+nvmf_rpc_subsystem_listen(void *cb_arg, int status)
+{
+ struct nvmf_rpc_listener_ctx *ctx = cb_arg;
+
+ if (status) {
+ /* Destroy the listener that we just created. Ignore the error code because
+ * the RPC is failing already anyway. */
+ spdk_nvmf_tgt_stop_listen(ctx->tgt, &ctx->trid);
+
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ ctx->response_sent = true;
+ }
+
+ if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_rpc_listen_resumed, ctx)) {
+ if (!ctx->response_sent) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ }
+ nvmf_rpc_listener_ctx_free(ctx);
+ /* Can't really do anything to recover here - subsystem will remain paused. */
+ }
+}
+
+static void
+nvmf_rpc_listen_paused(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct nvmf_rpc_listener_ctx *ctx = cb_arg;
+ int rc;
+
+ if (ctx->op == NVMF_RPC_LISTEN_ADD) {
+ if (!nvmf_subsystem_find_listener(subsystem, &ctx->trid)) {
+ rc = spdk_nvmf_tgt_listen(ctx->tgt, &ctx->trid);
+ if (rc == 0) {
+ spdk_nvmf_subsystem_add_listener(ctx->subsystem, &ctx->trid, nvmf_rpc_subsystem_listen, ctx);
+ return;
+ }
+
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ ctx->response_sent = true;
+ }
+ } else if (ctx->op == NVMF_RPC_LISTEN_REMOVE) {
+ if (spdk_nvmf_subsystem_remove_listener(subsystem, &ctx->trid)) {
+ SPDK_ERRLOG("Unable to remove listener.\n");
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ ctx->response_sent = true;
+ }
+ spdk_nvmf_tgt_stop_listen(ctx->tgt, &ctx->trid);
+ } else {
+ SPDK_UNREACHABLE();
+ }
+
+ if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_listen_resumed, ctx)) {
+ if (!ctx->response_sent) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ }
+ nvmf_rpc_listener_ctx_free(ctx);
+ /* Can't really do anything to recover here - subsystem will remain paused. */
+ }
+}
+
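+/*
+ * Sketch of the mapping performed below: a listen_address of
+ * { "trtype": "TCP", "adrfam": "IPv4", "traddr": "127.0.0.1", "trsvcid": "4420" }
+ * becomes a trid with trtype TCP, adrfam IPv4, traddr "127.0.0.1" and trsvcid "4420".
+ * When "adrfam" is omitted it defaults to IPv4.
+ */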
+static int
+rpc_listen_address_to_trid(const struct rpc_listen_address *address,
+ struct spdk_nvme_transport_id *trid)
+{
+ size_t len;
+
+ memset(trid, 0, sizeof(*trid));
+
+ if (spdk_nvme_transport_id_populate_trstring(trid, address->transport)) {
+ SPDK_ERRLOG("Invalid transport string: %s\n", address->transport);
+ return -EINVAL;
+ }
+
+ if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, address->transport)) {
+ SPDK_ERRLOG("Invalid transport type: %s\n", address->transport);
+ return -EINVAL;
+ }
+
+ if (address->adrfam) {
+ if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, address->adrfam)) {
+ SPDK_ERRLOG("Invalid adrfam: %s\n", address->adrfam);
+ return -EINVAL;
+ }
+ } else {
+ trid->adrfam = SPDK_NVMF_ADRFAM_IPV4;
+ }
+
+ len = strlen(address->traddr);
+ if (len > sizeof(trid->traddr) - 1) {
+ SPDK_ERRLOG("Transport address longer than %zu characters: %s\n",
+ sizeof(trid->traddr) - 1, address->traddr);
+ return -EINVAL;
+ }
+ memcpy(trid->traddr, address->traddr, len + 1);
+
+ len = strlen(address->trsvcid);
+ if (len > sizeof(trid->trsvcid) - 1) {
+ SPDK_ERRLOG("Transport service id longer than %zu characters: %s\n",
+ sizeof(trid->trsvcid) - 1, address->trsvcid);
+ return -EINVAL;
+ }
+ memcpy(trid->trsvcid, address->trsvcid, len + 1);
+
+ return 0;
+}
+
+static void
+rpc_nvmf_subsystem_add_listener(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_listener_ctx *ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ ctx->request = request;
+
+ if (spdk_json_decode_object(params, nvmf_rpc_listener_decoder,
+ SPDK_COUNTOF(nvmf_rpc_listener_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+
+ tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find a target object.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+ ctx->tgt = tgt;
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn);
+ if (!subsystem) {
+ SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+
+ ctx->subsystem = subsystem;
+
+ if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+
+ ctx->op = NVMF_RPC_LISTEN_ADD;
+
+ if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ nvmf_rpc_listener_ctx_free(ctx);
+ }
+}
+SPDK_RPC_REGISTER("nvmf_subsystem_add_listener", rpc_nvmf_subsystem_add_listener,
+ SPDK_RPC_RUNTIME);
+
+static void
+rpc_nvmf_subsystem_remove_listener(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_listener_ctx *ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ ctx->request = request;
+
+ if (spdk_json_decode_object(params, nvmf_rpc_listener_decoder,
+ SPDK_COUNTOF(nvmf_rpc_listener_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+
+ tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find a target object.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+ ctx->tgt = tgt;
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn);
+ if (!subsystem) {
+ SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+
+ ctx->subsystem = subsystem;
+
+ if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ nvmf_rpc_listener_ctx_free(ctx);
+ return;
+ }
+
+ ctx->op = NVMF_RPC_LISTEN_REMOVE;
+
+ if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ nvmf_rpc_listener_ctx_free(ctx);
+ }
+}
+SPDK_RPC_REGISTER("nvmf_subsystem_remove_listener", rpc_nvmf_subsystem_remove_listener,
+ SPDK_RPC_RUNTIME);
+
+struct spdk_nvmf_ns_params {
+ char *bdev_name;
+ char *ptpl_file;
+ uint32_t nsid;
+ char nguid[16];
+ char eui64[8];
+ struct spdk_uuid uuid;
+};
+
+struct rpc_namespaces {
+ size_t num_ns;
+ struct spdk_nvmf_ns_params ns_params[RPC_MAX_NAMESPACES];
+};
+
+
+static const struct spdk_json_object_decoder rpc_ns_params_decoders[] = {
+ {"nsid", offsetof(struct spdk_nvmf_ns_params, nsid), spdk_json_decode_uint32, true},
+ {"bdev_name", offsetof(struct spdk_nvmf_ns_params, bdev_name), spdk_json_decode_string},
+ {"ptpl_file", offsetof(struct spdk_nvmf_ns_params, ptpl_file), spdk_json_decode_string, true},
+ {"nguid", offsetof(struct spdk_nvmf_ns_params, nguid), decode_ns_nguid, true},
+ {"eui64", offsetof(struct spdk_nvmf_ns_params, eui64), decode_ns_eui64, true},
+ {"uuid", offsetof(struct spdk_nvmf_ns_params, uuid), decode_ns_uuid, true},
+};
+
+static int
+decode_rpc_ns_params(const struct spdk_json_val *val, void *out)
+{
+ struct spdk_nvmf_ns_params *ns_params = out;
+
+ return spdk_json_decode_object(val, rpc_ns_params_decoders,
+ SPDK_COUNTOF(rpc_ns_params_decoders),
+ ns_params);
+}
+
+struct nvmf_rpc_ns_ctx {
+ char *nqn;
+ char *tgt_name;
+ struct spdk_nvmf_ns_params ns_params;
+
+ struct spdk_jsonrpc_request *request;
+ bool response_sent;
+};
+
+static const struct spdk_json_object_decoder nvmf_rpc_subsystem_ns_decoder[] = {
+ {"nqn", offsetof(struct nvmf_rpc_ns_ctx, nqn), spdk_json_decode_string},
+ {"namespace", offsetof(struct nvmf_rpc_ns_ctx, ns_params), decode_rpc_ns_params},
+ {"tgt_name", offsetof(struct nvmf_rpc_ns_ctx, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+nvmf_rpc_ns_ctx_free(struct nvmf_rpc_ns_ctx *ctx)
+{
+ free(ctx->nqn);
+ free(ctx->tgt_name);
+ free(ctx->ns_params.bdev_name);
+ free(ctx->ns_params.ptpl_file);
+ free(ctx);
+}
+
+static void
+nvmf_rpc_ns_resumed(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct nvmf_rpc_ns_ctx *ctx = cb_arg;
+ struct spdk_jsonrpc_request *request = ctx->request;
+ uint32_t nsid = ctx->ns_params.nsid;
+ bool response_sent = ctx->response_sent;
+ struct spdk_json_write_ctx *w;
+
+ nvmf_rpc_ns_ctx_free(ctx);
+
+ if (response_sent) {
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_uint32(w, nsid);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+nvmf_rpc_ns_paused(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct nvmf_rpc_ns_ctx *ctx = cb_arg;
+ struct spdk_nvmf_ns_opts ns_opts;
+ struct spdk_bdev *bdev;
+
+ bdev = spdk_bdev_get_by_name(ctx->ns_params.bdev_name);
+ if (!bdev) {
+ SPDK_ERRLOG("No bdev with name %s\n", ctx->ns_params.bdev_name);
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ ctx->response_sent = true;
+ goto resume;
+ }
+
+ spdk_nvmf_ns_opts_get_defaults(&ns_opts, sizeof(ns_opts));
+ ns_opts.nsid = ctx->ns_params.nsid;
+
+ SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(ctx->ns_params.nguid), "size mismatch");
+ memcpy(ns_opts.nguid, ctx->ns_params.nguid, sizeof(ns_opts.nguid));
+
+ SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(ctx->ns_params.eui64), "size mismatch");
+ memcpy(ns_opts.eui64, ctx->ns_params.eui64, sizeof(ns_opts.eui64));
+
+ if (!spdk_mem_all_zero(&ctx->ns_params.uuid, sizeof(ctx->ns_params.uuid))) {
+ ns_opts.uuid = ctx->ns_params.uuid;
+ }
+
+ ctx->ns_params.nsid = spdk_nvmf_subsystem_add_ns(subsystem, bdev, &ns_opts, sizeof(ns_opts),
+ ctx->ns_params.ptpl_file);
+ if (ctx->ns_params.nsid == 0) {
+ SPDK_ERRLOG("Unable to add namespace\n");
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ ctx->response_sent = true;
+ goto resume;
+ }
+
+resume:
+ if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_ns_resumed, ctx)) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ nvmf_rpc_ns_ctx_free(ctx);
+ }
+}
+
+static void
+rpc_nvmf_subsystem_add_ns(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_ns_ctx *ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_ns_decoder,
+ SPDK_COUNTOF(nvmf_rpc_subsystem_ns_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_ns_ctx_free(ctx);
+ return;
+ }
+
+ ctx->request = request;
+ ctx->response_sent = false;
+
+ tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find a target object.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ nvmf_rpc_ns_ctx_free(ctx);
+ return;
+ }
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn);
+ if (!subsystem) {
+ SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_ns_ctx_free(ctx);
+ return;
+ }
+
+ if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_ns_paused, ctx)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ nvmf_rpc_ns_ctx_free(ctx);
+ }
+}
+SPDK_RPC_REGISTER("nvmf_subsystem_add_ns", rpc_nvmf_subsystem_add_ns, SPDK_RPC_RUNTIME)
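+
+/*
+ * Example nvmf_subsystem_add_ns request parameters (keys match
+ * nvmf_rpc_subsystem_ns_decoder and rpc_ns_params_decoders above; values are illustrative):
+ *
+ * { "nqn": "nqn.2016-06.io.spdk:cnode1",
+ *   "namespace": { "bdev_name": "Malloc0", "nsid": 1 } }
+ */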
+
+struct nvmf_rpc_remove_ns_ctx {
+ char *nqn;
+ char *tgt_name;
+ uint32_t nsid;
+
+ struct spdk_jsonrpc_request *request;
+ bool response_sent;
+};
+
+static const struct spdk_json_object_decoder nvmf_rpc_subsystem_remove_ns_decoder[] = {
+ {"nqn", offsetof(struct nvmf_rpc_remove_ns_ctx, nqn), spdk_json_decode_string},
+ {"nsid", offsetof(struct nvmf_rpc_remove_ns_ctx, nsid), spdk_json_decode_uint32},
+ {"tgt_name", offsetof(struct nvmf_rpc_remove_ns_ctx, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+nvmf_rpc_remove_ns_ctx_free(struct nvmf_rpc_remove_ns_ctx *ctx)
+{
+ free(ctx->nqn);
+ free(ctx->tgt_name);
+ free(ctx);
+}
+
+static void
+nvmf_rpc_remove_ns_resumed(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct nvmf_rpc_remove_ns_ctx *ctx = cb_arg;
+ struct spdk_jsonrpc_request *request = ctx->request;
+ bool response_sent = ctx->response_sent;
+ struct spdk_json_write_ctx *w;
+
+ nvmf_rpc_remove_ns_ctx_free(ctx);
+
+ if (response_sent) {
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+nvmf_rpc_remove_ns_paused(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct nvmf_rpc_remove_ns_ctx *ctx = cb_arg;
+ int ret;
+
+ ret = spdk_nvmf_subsystem_remove_ns(subsystem, ctx->nsid);
+ if (ret < 0) {
+ SPDK_ERRLOG("Unable to remove namespace ID %u\n", ctx->nsid);
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ ctx->response_sent = true;
+ }
+
+ if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_remove_ns_resumed, ctx)) {
+ if (!ctx->response_sent) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ }
+ nvmf_rpc_remove_ns_ctx_free(ctx);
+ }
+}
+
+static void
+rpc_nvmf_subsystem_remove_ns(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_remove_ns_ctx *ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_remove_ns_decoder,
+ SPDK_COUNTOF(nvmf_rpc_subsystem_remove_ns_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_remove_ns_ctx_free(ctx);
+ return;
+ }
+
+ tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find a target object.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ nvmf_rpc_remove_ns_ctx_free(ctx);
+ return;
+ }
+
+ ctx->request = request;
+ ctx->response_sent = false;
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn);
+ if (!subsystem) {
+ SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_remove_ns_ctx_free(ctx);
+ return;
+ }
+
+ if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_remove_ns_paused, ctx)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ nvmf_rpc_remove_ns_ctx_free(ctx);
+ }
+}
+SPDK_RPC_REGISTER("nvmf_subsystem_remove_ns", rpc_nvmf_subsystem_remove_ns, SPDK_RPC_RUNTIME)
+
+enum nvmf_rpc_host_op {
+ NVMF_RPC_HOST_ADD,
+ NVMF_RPC_HOST_REMOVE,
+ NVMF_RPC_HOST_ALLOW_ANY,
+};
+
+struct nvmf_rpc_host_ctx {
+ struct spdk_jsonrpc_request *request;
+
+ char *nqn;
+ char *host;
+ char *tgt_name;
+
+ enum nvmf_rpc_host_op op;
+
+ bool allow_any_host;
+
+ bool response_sent;
+};
+
+static const struct spdk_json_object_decoder nvmf_rpc_subsystem_host_decoder[] = {
+ {"nqn", offsetof(struct nvmf_rpc_host_ctx, nqn), spdk_json_decode_string},
+ {"host", offsetof(struct nvmf_rpc_host_ctx, host), spdk_json_decode_string},
+ {"tgt_name", offsetof(struct nvmf_rpc_host_ctx, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+nvmf_rpc_host_ctx_free(struct nvmf_rpc_host_ctx *ctx)
+{
+ free(ctx->nqn);
+ free(ctx->host);
+ free(ctx->tgt_name);
+ free(ctx);
+}
+
+static void
+nvmf_rpc_host_resumed(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct nvmf_rpc_host_ctx *ctx = cb_arg;
+ struct spdk_jsonrpc_request *request;
+ struct spdk_json_write_ctx *w;
+ bool response_sent = ctx->response_sent;
+
+ request = ctx->request;
+ nvmf_rpc_host_ctx_free(ctx);
+
+ if (response_sent) {
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+nvmf_rpc_host_paused(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct nvmf_rpc_host_ctx *ctx = cb_arg;
+ int rc = -1;
+
+ switch (ctx->op) {
+ case NVMF_RPC_HOST_ADD:
+ rc = spdk_nvmf_subsystem_add_host(subsystem, ctx->host);
+ break;
+ case NVMF_RPC_HOST_REMOVE:
+ rc = spdk_nvmf_subsystem_remove_host(subsystem, ctx->host);
+ break;
+ case NVMF_RPC_HOST_ALLOW_ANY:
+ rc = spdk_nvmf_subsystem_set_allow_any_host(subsystem, ctx->allow_any_host);
+ break;
+ }
+
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ ctx->response_sent = true;
+ }
+
+ if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_host_resumed, ctx)) {
+ if (!ctx->response_sent) {
+ spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ }
+ nvmf_rpc_host_ctx_free(ctx);
+ }
+}
+
+static void
+rpc_nvmf_subsystem_add_host(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_host_ctx *ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_host_decoder,
+ SPDK_COUNTOF(nvmf_rpc_subsystem_host_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find a target object.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ ctx->request = request;
+ ctx->op = NVMF_RPC_HOST_ADD;
+ ctx->response_sent = false;
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn);
+ if (!subsystem) {
+ SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ nvmf_rpc_host_ctx_free(ctx);
+ }
+}
+SPDK_RPC_REGISTER("nvmf_subsystem_add_host", rpc_nvmf_subsystem_add_host, SPDK_RPC_RUNTIME)
+
+static void
+rpc_nvmf_subsystem_remove_host(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_host_ctx *ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_host_decoder,
+ SPDK_COUNTOF(nvmf_rpc_subsystem_host_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find a target object.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ ctx->request = request;
+ ctx->op = NVMF_RPC_HOST_REMOVE;
+ ctx->response_sent = false;
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn);
+ if (!subsystem) {
+ SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ nvmf_rpc_host_ctx_free(ctx);
+ }
+}
+SPDK_RPC_REGISTER("nvmf_subsystem_remove_host", rpc_nvmf_subsystem_remove_host,
+ SPDK_RPC_RUNTIME)
+
+
+static const struct spdk_json_object_decoder nvmf_rpc_subsystem_any_host_decoder[] = {
+ {"nqn", offsetof(struct nvmf_rpc_host_ctx, nqn), spdk_json_decode_string},
+ {"allow_any_host", offsetof(struct nvmf_rpc_host_ctx, allow_any_host), spdk_json_decode_bool},
+ {"tgt_name", offsetof(struct nvmf_rpc_host_ctx, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_nvmf_subsystem_allow_any_host(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_host_ctx *ctx;
+ struct spdk_nvmf_subsystem *subsystem;
+ struct spdk_nvmf_tgt *tgt;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_any_host_decoder,
+ SPDK_COUNTOF(nvmf_rpc_subsystem_any_host_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find a target object.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ ctx->request = request;
+ ctx->op = NVMF_RPC_HOST_ALLOW_ANY;
+ ctx->response_sent = false;
+
+ subsystem = spdk_nvmf_tgt_find_subsystem(tgt, ctx->nqn);
+ if (!subsystem) {
+ SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_host_ctx_free(ctx);
+ return;
+ }
+
+ if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error");
+ nvmf_rpc_host_ctx_free(ctx);
+ }
+}
+SPDK_RPC_REGISTER("nvmf_subsystem_allow_any_host", rpc_nvmf_subsystem_allow_any_host,
+ SPDK_RPC_RUNTIME)
+
+struct nvmf_rpc_target_ctx {
+ char *name;
+ uint32_t max_subsystems;
+};
+
+static const struct spdk_json_object_decoder nvmf_rpc_create_target_decoder[] = {
+ {"name", offsetof(struct nvmf_rpc_target_ctx, name), spdk_json_decode_string},
+ {"max_subsystems", offsetof(struct nvmf_rpc_target_ctx, max_subsystems), spdk_json_decode_uint32, true},
+};
+
+static void
+rpc_nvmf_create_target(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_nvmf_target_opts opts;
+ struct nvmf_rpc_target_ctx ctx = {0};
+ struct spdk_nvmf_tgt *tgt;
+ struct spdk_json_write_ctx *w;
+
+ /* Decode the target creation parameters */
+ if (spdk_json_decode_object(params, nvmf_rpc_create_target_decoder,
+ SPDK_COUNTOF(nvmf_rpc_create_target_decoder),
+ &ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free(ctx.name);
+ return;
+ }
+
+ snprintf(opts.name, NVMF_TGT_NAME_MAX_LENGTH, "%s", ctx.name);
+ opts.max_subsystems = ctx.max_subsystems;
+
+ if (spdk_nvmf_get_tgt(opts.name) != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Target already exists.");
+ free(ctx.name);
+ return;
+ }
+
+ tgt = spdk_nvmf_tgt_create(&opts);
+
+ if (tgt == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to create the requested target.");
+ free(ctx.name);
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_string(w, spdk_nvmf_tgt_get_name(tgt));
+ spdk_jsonrpc_end_result(request, w);
+ free(ctx.name);
+}
+SPDK_RPC_REGISTER("nvmf_create_target", rpc_nvmf_create_target, SPDK_RPC_RUNTIME);
+
+static const struct spdk_json_object_decoder nvmf_rpc_destroy_target_decoder[] = {
+ {"name", offsetof(struct nvmf_rpc_target_ctx, name), spdk_json_decode_string},
+};
+
+static void
+nvmf_rpc_destroy_target_done(void *ctx, int status)
+{
+ struct spdk_jsonrpc_request *request = ctx;
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_nvmf_delete_target(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_target_ctx ctx = {0};
+ struct spdk_nvmf_tgt *tgt;
+
+ /* Decode the name of the target to delete */
+ if (spdk_json_decode_object(params, nvmf_rpc_destroy_target_decoder,
+ SPDK_COUNTOF(nvmf_rpc_destroy_target_decoder),
+ &ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free(ctx.name);
+ return;
+ }
+
+ tgt = spdk_nvmf_get_tgt(ctx.name);
+
+ if (tgt == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "The specified target doesn't exist, cannot delete it.");
+ free(ctx.name);
+ return;
+ }
+
+ spdk_nvmf_tgt_destroy(tgt, nvmf_rpc_destroy_target_done, request);
+ free(ctx.name);
+}
+SPDK_RPC_REGISTER("nvmf_delete_target", rpc_nvmf_delete_target, SPDK_RPC_RUNTIME);
+
+static void
+rpc_nvmf_get_targets(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_nvmf_tgt *tgt;
+ const char *name;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "nvmf_get_targets has no parameters.");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ tgt = spdk_nvmf_get_first_tgt();
+
+ while (tgt != NULL) {
+ name = spdk_nvmf_tgt_get_name(tgt);
+ spdk_json_write_string(w, name);
+ tgt = spdk_nvmf_get_next_tgt(tgt);
+ }
+
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("nvmf_get_targets", rpc_nvmf_get_targets, SPDK_RPC_RUNTIME);
+
+struct nvmf_rpc_create_transport_ctx {
+ char *trtype;
+ char *tgt_name;
+ struct spdk_nvmf_transport_opts opts;
+ struct spdk_jsonrpc_request *request;
+};
+
+/**
+ * `max_qpairs_per_ctrlr` counts both the admin and IO qpairs, which confuses
+ * users when they configure a transport over RPC. The `max_qpairs_per_ctrlr`
+ * RPC parameter is therefore deprecated in favor of `max_io_qpairs_per_ctrlr`.
+ * The internal logic remains unchanged: SPDK still expects
+ * spdk_nvmf_transport_opts::max_qpairs_per_ctrlr to include the admin qpair.
+ * This decoder parses the number of IO qpairs and adds 1 for the admin qpair.
+ */
+static int
+nvmf_rpc_decode_max_io_qpairs(const struct spdk_json_val *val, void *out)
+{
+ uint16_t *i = out;
+ int rc;
+
+ rc = spdk_json_number_to_uint16(val, i);
+ if (rc == 0) {
+ (*i)++;
+ }
+
+ return rc;
+}
+
+/**
+ * This decoder parses the deprecated `max_qpairs_per_ctrlr` parameter and
+ * warns the user to use the new `max_io_qpairs_per_ctrlr` parameter instead.
+ */
+static int
+nvmf_rpc_decode_max_qpairs(const struct spdk_json_val *val, void *out)
+{
+ uint16_t *i = out;
+ int rc;
+
+ rc = spdk_json_number_to_uint16(val, i);
+ if (rc == 0) {
+ SPDK_WARNLOG("Parameter max_qpairs_per_ctrlr is deprecated, use max_io_qpairs_per_ctrlr instead.\n");
+ }
+
+ return rc;
+}
+
+static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[] = {
+ { "trtype", offsetof(struct nvmf_rpc_create_transport_ctx, trtype), spdk_json_decode_string},
+ {
+ "max_queue_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_queue_depth),
+ spdk_json_decode_uint16, true
+ },
+ {
+ "max_qpairs_per_ctrlr", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_qpairs_per_ctrlr),
+ nvmf_rpc_decode_max_qpairs, true
+ },
+ {
+ "max_io_qpairs_per_ctrlr", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_qpairs_per_ctrlr),
+ nvmf_rpc_decode_max_io_qpairs, true
+ },
+ {
+ "in_capsule_data_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.in_capsule_data_size),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "max_io_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_io_size),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "io_unit_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.io_unit_size),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "max_aq_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_aq_depth),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "num_shared_buffers", offsetof(struct nvmf_rpc_create_transport_ctx, opts.num_shared_buffers),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "buf_cache_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.buf_cache_size),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "max_srq_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_srq_depth),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "no_srq", offsetof(struct nvmf_rpc_create_transport_ctx, opts.no_srq),
+ spdk_json_decode_bool, true
+ },
+ {
+ "c2h_success", offsetof(struct nvmf_rpc_create_transport_ctx, opts.c2h_success),
+ spdk_json_decode_bool, true
+ },
+ {
+ "dif_insert_or_strip", offsetof(struct nvmf_rpc_create_transport_ctx, opts.dif_insert_or_strip),
+ spdk_json_decode_bool, true
+ },
+ {
+ "sock_priority", offsetof(struct nvmf_rpc_create_transport_ctx, opts.sock_priority),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "acceptor_backlog", offsetof(struct nvmf_rpc_create_transport_ctx, opts.acceptor_backlog),
+ spdk_json_decode_int32, true
+ },
+ {
+ "abort_timeout_sec", offsetof(struct nvmf_rpc_create_transport_ctx, opts.abort_timeout_sec),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "tgt_name", offsetof(struct nvmf_rpc_create_transport_ctx, tgt_name),
+ spdk_json_decode_string, true
+ },
+};
+
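+/* Free a create-transport context, including the strings duplicated by the
+ * JSON decoder.
+ */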
+static void
+nvmf_rpc_create_transport_ctx_free(struct nvmf_rpc_create_transport_ctx *ctx)
+{
+ free(ctx->trtype);
+ free(ctx->tgt_name);
+ free(ctx);
+}
+
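+/* Completion callback for spdk_nvmf_tgt_add_transport(): send the JSON-RPC
+ * response, or an error if the transport could not be attached to the target.
+ */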
+static void
+nvmf_rpc_tgt_add_transport_done(void *cb_arg, int status)
+{
+ struct nvmf_rpc_create_transport_ctx *ctx = cb_arg;
+ struct spdk_jsonrpc_request *request;
+ struct spdk_json_write_ctx *w;
+
+ request = ctx->request;
+ nvmf_rpc_create_transport_ctx_free(ctx);
+
+ if (status) {
+ SPDK_ERRLOG("Failed to add transport to tgt.(%d)\n", status);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Failed to add transport to tgt.(%d)\n",
+ status);
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+}
+
+static void
+rpc_nvmf_create_transport(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct nvmf_rpc_create_transport_ctx *ctx;
+ enum spdk_nvme_transport_type trtype;
+ struct spdk_nvmf_transport *transport;
+ struct spdk_nvmf_tgt *tgt;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory");
+ return;
+ }
+
+ /* Decode parameters the first time to get the transport type */
+ if (spdk_json_decode_object(params, nvmf_rpc_create_transport_decoder,
+ SPDK_COUNTOF(nvmf_rpc_create_transport_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_create_transport_ctx_free(ctx);
+ return;
+ }
+
+ tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!tgt) {
+ SPDK_ERRLOG("Unable to find a target object.\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ nvmf_rpc_create_transport_ctx_free(ctx);
+ return;
+ }
+
+ if (spdk_nvme_transport_id_parse_trtype(&trtype, ctx->trtype)) {
+ SPDK_ERRLOG("Invalid transport type '%s'\n", ctx->trtype);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid transport type '%s'\n", ctx->trtype);
+ nvmf_rpc_create_transport_ctx_free(ctx);
+ return;
+ }
+
+ /* Initialize all the transport options (based on the transport type) and decode the
+ * parameters again to apply any options passed in the nvmf_create_transport RPC call.
+ */
+ if (!spdk_nvmf_transport_opts_init(ctx->trtype, &ctx->opts)) {
+ /* This can happen if the user specifies the PCIe transport type, which isn't
+ * valid for NVMe-oF.
+ */
+ SPDK_ERRLOG("Invalid transport type '%s'\n", ctx->trtype);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid transport type '%s'\n", ctx->trtype);
+ nvmf_rpc_create_transport_ctx_free(ctx);
+ return;
+ }
+
+ if (spdk_json_decode_object(params, nvmf_rpc_create_transport_decoder,
+ SPDK_COUNTOF(nvmf_rpc_create_transport_decoder),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ nvmf_rpc_create_transport_ctx_free(ctx);
+ return;
+ }
+
+ if (spdk_nvmf_tgt_get_transport(tgt, ctx->trtype)) {
+ SPDK_ERRLOG("Transport type '%s' already exists\n", ctx->trtype);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Transport type '%s' already exists\n", ctx->trtype);
+ nvmf_rpc_create_transport_ctx_free(ctx);
+ return;
+ }
+
+ transport = spdk_nvmf_transport_create(ctx->trtype, &ctx->opts);
+
+ if (!transport) {
+ SPDK_ERRLOG("Transport type '%s' create failed\n", ctx->trtype);
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Transport type '%s' create failed\n", ctx->trtype);
+ nvmf_rpc_create_transport_ctx_free(ctx);
+ return;
+ }
+
+ /* add transport to target */
+ ctx->request = request;
+ spdk_nvmf_tgt_add_transport(tgt, transport, nvmf_rpc_tgt_add_transport_done, ctx);
+}
+SPDK_RPC_REGISTER("nvmf_create_transport", rpc_nvmf_create_transport, SPDK_RPC_RUNTIME)
+
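+/* Write a single transport's options as a JSON object. RDMA- and TCP-specific
+ * options are emitted only for the matching transport type.
+ */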
+static void
+dump_nvmf_transport(struct spdk_json_write_ctx *w, struct spdk_nvmf_transport *transport)
+{
+ const struct spdk_nvmf_transport_opts *opts = spdk_nvmf_get_transport_opts(transport);
+ spdk_nvme_transport_type_t type = spdk_nvmf_get_transport_type(transport);
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "trtype", spdk_nvmf_get_transport_name(transport));
+ spdk_json_write_named_uint32(w, "max_queue_depth", opts->max_queue_depth);
+ spdk_json_write_named_uint32(w, "max_io_qpairs_per_ctrlr", opts->max_qpairs_per_ctrlr - 1);
+ spdk_json_write_named_uint32(w, "in_capsule_data_size", opts->in_capsule_data_size);
+ spdk_json_write_named_uint32(w, "max_io_size", opts->max_io_size);
+ spdk_json_write_named_uint32(w, "io_unit_size", opts->io_unit_size);
+ spdk_json_write_named_uint32(w, "max_aq_depth", opts->max_aq_depth);
+ spdk_json_write_named_uint32(w, "num_shared_buffers", opts->num_shared_buffers);
+ spdk_json_write_named_uint32(w, "buf_cache_size", opts->buf_cache_size);
+ spdk_json_write_named_bool(w, "dif_insert_or_strip", opts->dif_insert_or_strip);
+ if (type == SPDK_NVME_TRANSPORT_RDMA) {
+ spdk_json_write_named_uint32(w, "max_srq_depth", opts->max_srq_depth);
+ spdk_json_write_named_bool(w, "no_srq", opts->no_srq);
+ spdk_json_write_named_int32(w, "acceptor_backlog", opts->acceptor_backlog);
+ } else if (type == SPDK_NVME_TRANSPORT_TCP) {
+ spdk_json_write_named_bool(w, "c2h_success", opts->c2h_success);
+ spdk_json_write_named_uint32(w, "sock_priority", opts->sock_priority);
+ }
+ spdk_json_write_named_uint32(w, "abort_timeout_sec", opts->abort_timeout_sec);
+
+ spdk_json_write_object_end(w);
+}
+
+struct rpc_get_transport {
+ char *tgt_name;
+};
+
+static const struct spdk_json_object_decoder rpc_get_transport_decoders[] = {
+ {"tgt_name", offsetof(struct rpc_get_transport, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+rpc_nvmf_get_transports(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_get_transport req = { 0 };
+ struct spdk_json_write_ctx *w;
+ struct spdk_nvmf_transport *transport;
+ struct spdk_nvmf_tgt *tgt;
+
+ if (params) {
+ if (spdk_json_decode_object(params, rpc_get_transport_decoders,
+ SPDK_COUNTOF(rpc_get_transport_decoders),
+ &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ return;
+ }
+ }
+
+ tgt = spdk_nvmf_get_tgt(req.tgt_name);
+ if (!tgt) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ free(req.tgt_name);
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ transport = spdk_nvmf_transport_get_first(tgt);
+ while (transport) {
+ dump_nvmf_transport(w, transport);
+ transport = spdk_nvmf_transport_get_next(transport);
+ }
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+ free(req.tgt_name);
+}
+SPDK_RPC_REGISTER("nvmf_get_transports", rpc_nvmf_get_transports, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(nvmf_get_transports, get_nvmf_transports)
+
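+/* Context carried across the spdk_for_each_channel() iteration used by the
+ * nvmf_get_stats RPC.
+ */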
+struct rpc_nvmf_get_stats_ctx {
+ char *tgt_name;
+ struct spdk_nvmf_tgt *tgt;
+ struct spdk_jsonrpc_request *request;
+ struct spdk_json_write_ctx *w;
+};
+
+static const struct spdk_json_object_decoder rpc_get_stats_decoders[] = {
+ {"tgt_name", offsetof(struct rpc_nvmf_get_stats_ctx, tgt_name), spdk_json_decode_string, true},
+};
+
+static void
+free_get_stats_ctx(struct rpc_nvmf_get_stats_ctx *ctx)
+{
+ free(ctx->tgt_name);
+ free(ctx);
+}
+
+static void
+rpc_nvmf_get_stats_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct rpc_nvmf_get_stats_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ spdk_json_write_array_end(ctx->w);
+ spdk_json_write_object_end(ctx->w);
+ spdk_jsonrpc_end_result(ctx->request, ctx->w);
+ free_get_stats_ctx(ctx);
+}
+
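+/* Write per-transport poll group statistics. Only RDMA currently exposes
+ * transport-specific counters; other transport types produce an object
+ * containing just the trtype string.
+ */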
+static void
+write_nvmf_transport_stats(struct spdk_json_write_ctx *w,
+ struct spdk_nvmf_transport_poll_group_stat *stat)
+{
+ uint64_t i;
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "trtype",
+ spdk_nvme_transport_id_trtype_str(stat->trtype));
+ switch (stat->trtype) {
+ case SPDK_NVME_TRANSPORT_RDMA:
+ spdk_json_write_named_uint64(w, "pending_data_buffer", stat->rdma.pending_data_buffer);
+ spdk_json_write_named_array_begin(w, "devices");
+ for (i = 0; i < stat->rdma.num_devices; ++i) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "name", stat->rdma.devices[i].name);
+ spdk_json_write_named_uint64(w, "polls", stat->rdma.devices[i].polls);
+ spdk_json_write_named_uint64(w, "completions", stat->rdma.devices[i].completions);
+ spdk_json_write_named_uint64(w, "requests",
+ stat->rdma.devices[i].requests);
+ spdk_json_write_named_uint64(w, "request_latency",
+ stat->rdma.devices[i].request_latency);
+ spdk_json_write_named_uint64(w, "pending_free_request",
+ stat->rdma.devices[i].pending_free_request);
+ spdk_json_write_named_uint64(w, "pending_rdma_read",
+ stat->rdma.devices[i].pending_rdma_read);
+ spdk_json_write_named_uint64(w, "pending_rdma_write",
+ stat->rdma.devices[i].pending_rdma_write);
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+ break;
+ default:
+ break;
+ }
+ spdk_json_write_object_end(w);
+}
+
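+/* Runs on each poll group's thread via spdk_for_each_channel(): collect and
+ * write that poll group's statistics, then continue the iteration.
+ */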
+static void
+_rpc_nvmf_get_stats(struct spdk_io_channel_iter *i)
+{
+ struct rpc_nvmf_get_stats_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+ struct spdk_nvmf_transport *transport;
+ struct spdk_nvmf_poll_group_stat stat;
+ struct spdk_nvmf_transport_poll_group_stat *trstat;
+ int rc;
+
+ if (0 == spdk_nvmf_poll_group_get_stat(ctx->tgt, &stat)) {
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_string(ctx->w, "name", spdk_thread_get_name(spdk_get_thread()));
+ spdk_json_write_named_uint32(ctx->w, "admin_qpairs", stat.admin_qpairs);
+ spdk_json_write_named_uint32(ctx->w, "io_qpairs", stat.io_qpairs);
+ spdk_json_write_named_uint64(ctx->w, "pending_bdev_io", stat.pending_bdev_io);
+
+ spdk_json_write_named_array_begin(ctx->w, "transports");
+ transport = spdk_nvmf_transport_get_first(ctx->tgt);
+ while (transport) {
+ rc = spdk_nvmf_transport_poll_group_get_stat(ctx->tgt, transport, &trstat);
+ if (0 == rc) {
+ write_nvmf_transport_stats(ctx->w, trstat);
+ spdk_nvmf_transport_poll_group_free_stat(transport, trstat);
+ } else if (-ENOTSUP != rc) {
+ SPDK_ERRLOG("Failed to get poll group statistics for transport %s, errno %d\n",
+ spdk_nvme_transport_id_trtype_str(spdk_nvmf_get_transport_type(transport)),
+ rc);
+ }
+ transport = spdk_nvmf_transport_get_next(transport);
+ }
+ spdk_json_write_array_end(ctx->w);
+ spdk_json_write_object_end(ctx->w);
+ }
+
+ spdk_for_each_channel_continue(i, 0);
+}
+
+
+static void
+rpc_nvmf_get_stats(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_nvmf_get_stats_ctx *ctx;
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Memory allocation error");
+ return;
+ }
+ ctx->request = request;
+
+ if (params) {
+ if (spdk_json_decode_object(params, rpc_get_stats_decoders,
+ SPDK_COUNTOF(rpc_get_stats_decoders),
+ ctx)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_get_stats_ctx(ctx);
+ return;
+ }
+ }
+
+ ctx->tgt = spdk_nvmf_get_tgt(ctx->tgt_name);
+ if (!ctx->tgt) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ "Unable to find a target.");
+ free_get_stats_ctx(ctx);
+ return;
+ }
+
+ ctx->w = spdk_jsonrpc_begin_result(ctx->request);
+ spdk_json_write_object_begin(ctx->w);
+ spdk_json_write_named_uint64(ctx->w, "tick_rate", spdk_get_ticks_hz());
+ spdk_json_write_named_array_begin(ctx->w, "poll_groups");
+
+ spdk_for_each_channel(ctx->tgt,
+ _rpc_nvmf_get_stats,
+ ctx,
+ rpc_nvmf_get_stats_done);
+}
+
+SPDK_RPC_REGISTER("nvmf_get_stats", rpc_nvmf_get_stats, SPDK_RPC_RUNTIME)
diff --git a/src/spdk/lib/nvmf/rdma.c b/src/spdk/lib/nvmf/rdma.c
new file mode 100644
index 000000000..4a4de4374
--- /dev/null
+++ b/src/spdk/lib/nvmf/rdma.c
@@ -0,0 +1,4313 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/config.h"
+#include "spdk/thread.h"
+#include "spdk/likely.h"
+#include "spdk/nvmf_transport.h"
+#include "spdk/string.h"
+#include "spdk/trace.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/assert.h"
+#include "spdk_internal/log.h"
+#include "spdk_internal/rdma.h"
+
+#include "nvmf_internal.h"
+
+struct spdk_nvme_rdma_hooks g_nvmf_hooks = {};
+const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma;
+
+/*
+ * RDMA Connection Resource Defaults
+ */
+#define NVMF_DEFAULT_TX_SGE SPDK_NVMF_MAX_SGL_ENTRIES
+#define NVMF_DEFAULT_RSP_SGE 1
+#define NVMF_DEFAULT_RX_SGE 2
+
+/* The RDMA completion queue size */
+#define DEFAULT_NVMF_RDMA_CQ_SIZE 4096
+#define MAX_WR_PER_QP(queue_depth) (queue_depth * 3 + 2)
+
+/* Timeout for destroying defunct rqpairs */
+#define NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US 4000000
+
+static int g_spdk_nvmf_ibv_query_mask =
+ IBV_QP_STATE |
+ IBV_QP_PKEY_INDEX |
+ IBV_QP_PORT |
+ IBV_QP_ACCESS_FLAGS |
+ IBV_QP_AV |
+ IBV_QP_PATH_MTU |
+ IBV_QP_DEST_QPN |
+ IBV_QP_RQ_PSN |
+ IBV_QP_MAX_DEST_RD_ATOMIC |
+ IBV_QP_MIN_RNR_TIMER |
+ IBV_QP_SQ_PSN |
+ IBV_QP_TIMEOUT |
+ IBV_QP_RETRY_CNT |
+ IBV_QP_RNR_RETRY |
+ IBV_QP_MAX_QP_RD_ATOMIC;
+
+enum spdk_nvmf_rdma_request_state {
+ /* The request is not currently in use */
+ RDMA_REQUEST_STATE_FREE = 0,
+
+ /* Initial state when request first received */
+ RDMA_REQUEST_STATE_NEW,
+
+ /* The request is queued until a data buffer is available. */
+ RDMA_REQUEST_STATE_NEED_BUFFER,
+
+ /* The request is waiting on RDMA queue depth availability
+ * to transfer data from the host to the controller.
+ */
+ RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
+
+ /* The request is currently transferring data from the host to the controller. */
+ RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
+
+ /* The request is ready to execute at the block device */
+ RDMA_REQUEST_STATE_READY_TO_EXECUTE,
+
+ /* The request is currently executing at the block device */
+ RDMA_REQUEST_STATE_EXECUTING,
+
+ /* The request finished executing at the block device */
+ RDMA_REQUEST_STATE_EXECUTED,
+
+ /* The request is waiting on RDMA queue depth availability
+ * to transfer data from the controller to the host.
+ */
+ RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
+
+ /* The request is ready to send a completion */
+ RDMA_REQUEST_STATE_READY_TO_COMPLETE,
+
+ /* The request is currently transferring data from the controller to the host. */
+ RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
+
+ /* The request currently has an outstanding completion without an
+ * associated data transfer.
+ */
+ RDMA_REQUEST_STATE_COMPLETING,
+
+ /* The request completed and can be marked free. */
+ RDMA_REQUEST_STATE_COMPLETED,
+
+ /* Terminator */
+ RDMA_REQUEST_NUM_STATES,
+};
+
+#define OBJECT_NVMF_RDMA_IO 0x40
+
+#define TRACE_GROUP_NVMF_RDMA 0x4
+#define TRACE_RDMA_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0)
+#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1)
+#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2)
+#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3)
+#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4)
+#define TRACE_RDMA_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5)
+#define TRACE_RDMA_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6)
+#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7)
+#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8)
+#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9)
+#define TRACE_RDMA_REQUEST_STATE_COMPLETING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA)
+#define TRACE_RDMA_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB)
+#define TRACE_RDMA_QP_CREATE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC)
+#define TRACE_RDMA_IBV_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD)
+#define TRACE_RDMA_CM_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE)
+#define TRACE_RDMA_QP_STATE_CHANGE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF)
+#define TRACE_RDMA_QP_DISCONNECT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10)
+#define TRACE_RDMA_QP_DESTROY SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x11)
+
+SPDK_TRACE_REGISTER_FN(nvmf_trace, "nvmf_rdma", TRACE_GROUP_NVMF_RDMA)
+{
+ spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r');
+ spdk_trace_register_description("RDMA_REQ_NEW", TRACE_RDMA_REQUEST_STATE_NEW,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", TRACE_RDMA_REQUEST_STATE_NEED_BUFFER,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_TX_PENDING_C2H",
+ TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_TX_PENDING_H2C",
+ TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_TX_H2C",
+ TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE",
+ TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_EXECUTING",
+ TRACE_RDMA_REQUEST_STATE_EXECUTING,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_EXECUTED",
+ TRACE_RDMA_REQUEST_STATE_EXECUTED,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPL",
+ TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_COMPLETING_C2H",
+ TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_COMPLETING",
+ TRACE_RDMA_REQUEST_STATE_COMPLETING,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+ spdk_trace_register_description("RDMA_REQ_COMPLETED",
+ TRACE_RDMA_REQUEST_STATE_COMPLETED,
+ OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: ");
+
+ spdk_trace_register_description("RDMA_QP_CREATE", TRACE_RDMA_QP_CREATE,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", TRACE_RDMA_IBV_ASYNC_EVENT,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
+ spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", TRACE_RDMA_CM_ASYNC_EVENT,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "type: ");
+ spdk_trace_register_description("RDMA_QP_STATE_CHANGE", TRACE_RDMA_QP_STATE_CHANGE,
+ OWNER_NONE, OBJECT_NONE, 0, 1, "state: ");
+ spdk_trace_register_description("RDMA_QP_DISCONNECT", TRACE_RDMA_QP_DISCONNECT,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("RDMA_QP_DESTROY", TRACE_RDMA_QP_DESTROY,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "");
+}
+
+enum spdk_nvmf_rdma_wr_type {
+ RDMA_WR_TYPE_RECV,
+ RDMA_WR_TYPE_SEND,
+ RDMA_WR_TYPE_DATA,
+};
+
+struct spdk_nvmf_rdma_wr {
+ enum spdk_nvmf_rdma_wr_type type;
+};
+
+/* This structure holds commands as they are received off the wire.
+ * It must be dynamically paired with a full request object
+ * (spdk_nvmf_rdma_request) to service a request. It is separate
+ * from the request because RDMA does not appear to order
+ * completions, so occasionally we'll get a new incoming
+ * command when there aren't any free request objects.
+ */
+struct spdk_nvmf_rdma_recv {
+ struct ibv_recv_wr wr;
+ struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE];
+
+ struct spdk_nvmf_rdma_qpair *qpair;
+
+ /* In-capsule data buffer */
+ uint8_t *buf;
+
+ struct spdk_nvmf_rdma_wr rdma_wr;
+ uint64_t receive_tsc;
+
+ STAILQ_ENTRY(spdk_nvmf_rdma_recv) link;
+};
+
+struct spdk_nvmf_rdma_request_data {
+ struct spdk_nvmf_rdma_wr rdma_wr;
+ struct ibv_send_wr wr;
+ struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES];
+};
+
+struct spdk_nvmf_rdma_request {
+ struct spdk_nvmf_request req;
+
+ enum spdk_nvmf_rdma_request_state state;
+
+ struct spdk_nvmf_rdma_recv *recv;
+
+ struct {
+ struct spdk_nvmf_rdma_wr rdma_wr;
+ struct ibv_send_wr wr;
+ struct ibv_sge sgl[NVMF_DEFAULT_RSP_SGE];
+ } rsp;
+
+ struct spdk_nvmf_rdma_request_data data;
+
+ uint32_t iovpos;
+
+ uint32_t num_outstanding_data_wr;
+ uint64_t receive_tsc;
+
+ STAILQ_ENTRY(spdk_nvmf_rdma_request) state_link;
+};
+
+enum spdk_nvmf_rdma_qpair_disconnect_flags {
+ RDMA_QP_DISCONNECTING = 1,
+ RDMA_QP_RECV_DRAINED = 1 << 1,
+ RDMA_QP_SEND_DRAINED = 1 << 2
+};
+
+struct spdk_nvmf_rdma_resource_opts {
+ struct spdk_nvmf_rdma_qpair *qpair;
+ /* qp points either to an ibv_qp object or an ibv_srq object depending on the value of shared. */
+ void *qp;
+ struct ibv_pd *pd;
+ uint32_t max_queue_depth;
+ uint32_t in_capsule_data_size;
+ bool shared;
+};
+
+struct spdk_nvmf_send_wr_list {
+ struct ibv_send_wr *first;
+ struct ibv_send_wr *last;
+};
+
+struct spdk_nvmf_recv_wr_list {
+ struct ibv_recv_wr *first;
+ struct ibv_recv_wr *last;
+};
+
+struct spdk_nvmf_rdma_resources {
+ /* Array of size "max_queue_depth" containing RDMA requests. */
+ struct spdk_nvmf_rdma_request *reqs;
+
+ /* Array of size "max_queue_depth" containing RDMA recvs. */
+ struct spdk_nvmf_rdma_recv *recvs;
+
+ /* Array of size "max_queue_depth" containing 64 byte capsules
+ * used for receive.
+ */
+ union nvmf_h2c_msg *cmds;
+ struct ibv_mr *cmds_mr;
+
+ /* Array of size "max_queue_depth" containing 16 byte completions
+ * to be sent back to the user.
+ */
+ union nvmf_c2h_msg *cpls;
+ struct ibv_mr *cpls_mr;
+
+ /* Array of size "max_queue_depth * InCapsuleDataSize" containing
+ * buffers to be used for in capsule data.
+ */
+ void *bufs;
+ struct ibv_mr *bufs_mr;
+
+ /* The list of pending recvs to transfer */
+ struct spdk_nvmf_recv_wr_list recvs_to_post;
+
+ /* Receives that are waiting for a request object */
+ STAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue;
+
+ /* Queue to track free requests */
+ STAILQ_HEAD(, spdk_nvmf_rdma_request) free_queue;
+};
+
+typedef void (*spdk_nvmf_rdma_qpair_ibv_event)(struct spdk_nvmf_rdma_qpair *rqpair);
+
+struct spdk_nvmf_rdma_ibv_event_ctx {
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ spdk_nvmf_rdma_qpair_ibv_event cb_fn;
+ /* Link to other ibv events associated with this qpair */
+ STAILQ_ENTRY(spdk_nvmf_rdma_ibv_event_ctx) link;
+};
+
+struct spdk_nvmf_rdma_qpair {
+ struct spdk_nvmf_qpair qpair;
+
+ struct spdk_nvmf_rdma_device *device;
+ struct spdk_nvmf_rdma_poller *poller;
+
+ struct spdk_rdma_qp *rdma_qp;
+ struct rdma_cm_id *cm_id;
+ struct ibv_srq *srq;
+ struct rdma_cm_id *listen_id;
+
+ /* The maximum number of I/O outstanding on this connection at one time */
+ uint16_t max_queue_depth;
+
+ /* The maximum number of active RDMA READ and ATOMIC operations at one time */
+ uint16_t max_read_depth;
+
+ /* The maximum number of RDMA SEND operations at one time */
+ uint32_t max_send_depth;
+
+ /* The current number of outstanding WRs from this qpair's
+ * recv queue. Should not exceed the qpair's max_queue_depth.
+ */
+ uint16_t current_recv_depth;
+
+ /* The current number of active RDMA READ operations */
+ uint16_t current_read_depth;
+
+ /* The current number of posted WRs from this qpair's
+ * send queue. Should not exceed max_send_depth.
+ */
+ uint32_t current_send_depth;
+
+ /* The maximum number of SGEs per WR on the send queue */
+ uint32_t max_send_sge;
+
+ /* The maximum number of SGEs per WR on the recv queue */
+ uint32_t max_recv_sge;
+
+ struct spdk_nvmf_rdma_resources *resources;
+
+ STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_read_queue;
+
+ STAILQ_HEAD(, spdk_nvmf_rdma_request) pending_rdma_write_queue;
+
+ /* Number of requests not in the free state */
+ uint32_t qd;
+
+ TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link;
+
+ STAILQ_ENTRY(spdk_nvmf_rdma_qpair) recv_link;
+
+ STAILQ_ENTRY(spdk_nvmf_rdma_qpair) send_link;
+
+ /* IBV queue pair attributes: they are used to manage
+ * qp state and recover from errors.
+ */
+ enum ibv_qp_state ibv_state;
+
+ uint32_t disconnect_flags;
+
+ /* Poller registered in case the qpair doesn't properly
+ * complete the qpair destruct process and becomes defunct.
+ */
+
+ struct spdk_poller *destruct_poller;
+
+ /*
+ * io_channel which is used to destroy qpair when it is removed from poll group
+ */
+ struct spdk_io_channel *destruct_channel;
+
+ /* List of ibv async events */
+ STAILQ_HEAD(, spdk_nvmf_rdma_ibv_event_ctx) ibv_events;
+
+ /* There are several ways a disconnect can start on a qpair
+ * and they are not all mutually exclusive. It is important
+ * that we only initiate one of these paths.
+ */
+ bool disconnect_started;
+ /* Lets us know that we have received the last_wqe event. */
+ bool last_wqe_reached;
+};
+
+struct spdk_nvmf_rdma_poller_stat {
+ uint64_t completions;
+ uint64_t polls;
+ uint64_t requests;
+ uint64_t request_latency;
+ uint64_t pending_free_request;
+ uint64_t pending_rdma_read;
+ uint64_t pending_rdma_write;
+};
+
+struct spdk_nvmf_rdma_poller {
+ struct spdk_nvmf_rdma_device *device;
+ struct spdk_nvmf_rdma_poll_group *group;
+
+ int num_cqe;
+ int required_num_wr;
+ struct ibv_cq *cq;
+
+ /* The maximum number of I/O outstanding on the shared receive queue at one time */
+ uint16_t max_srq_depth;
+
+ /* Shared receive queue */
+ struct ibv_srq *srq;
+
+ struct spdk_nvmf_rdma_resources *resources;
+ struct spdk_nvmf_rdma_poller_stat stat;
+
+ TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs;
+
+ STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_recv;
+
+ STAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs_pending_send;
+
+ TAILQ_ENTRY(spdk_nvmf_rdma_poller) link;
+};
+
+struct spdk_nvmf_rdma_poll_group_stat {
+ uint64_t pending_data_buffer;
+};
+
+struct spdk_nvmf_rdma_poll_group {
+ struct spdk_nvmf_transport_poll_group group;
+ struct spdk_nvmf_rdma_poll_group_stat stat;
+ TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers;
+ TAILQ_ENTRY(spdk_nvmf_rdma_poll_group) link;
+ /*
+ * Buffers that are split across multiple RDMA memory regions
+ * cannot be used by this transport; such buffers are tracked
+ * on this list.
+ */
+ STAILQ_HEAD(, spdk_nvmf_transport_pg_cache_buf) retired_bufs;
+};
+
+struct spdk_nvmf_rdma_conn_sched {
+ struct spdk_nvmf_rdma_poll_group *next_admin_pg;
+ struct spdk_nvmf_rdma_poll_group *next_io_pg;
+};
+
+/* Assuming rdma_cm uses just one protection domain per ibv_context. */
+struct spdk_nvmf_rdma_device {
+ struct ibv_device_attr attr;
+ struct ibv_context *context;
+
+ struct spdk_mem_map *map;
+ struct ibv_pd *pd;
+
+ int num_srq;
+
+ TAILQ_ENTRY(spdk_nvmf_rdma_device) link;
+};
+
+struct spdk_nvmf_rdma_port {
+ const struct spdk_nvme_transport_id *trid;
+ struct rdma_cm_id *id;
+ struct spdk_nvmf_rdma_device *device;
+ TAILQ_ENTRY(spdk_nvmf_rdma_port) link;
+};
+
+struct spdk_nvmf_rdma_transport {
+ struct spdk_nvmf_transport transport;
+
+ struct spdk_nvmf_rdma_conn_sched conn_sched;
+
+ struct rdma_event_channel *event_channel;
+
+ struct spdk_mempool *data_wr_pool;
+
+ pthread_mutex_t lock;
+
+ /* fields used to poll RDMA/IB events */
+ nfds_t npoll_fds;
+ struct pollfd *poll_fds;
+
+ TAILQ_HEAD(, spdk_nvmf_rdma_device) devices;
+ TAILQ_HEAD(, spdk_nvmf_rdma_port) ports;
+ TAILQ_HEAD(, spdk_nvmf_rdma_poll_group) poll_groups;
+};
+
+static inline void
+nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair);
+
+static bool
+nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_request *rdma_req);
+
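+/* Return 0 if the queried ibv_qp_state is one we know how to handle,
+ * -1 otherwise.
+ */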
+static inline int
+nvmf_rdma_check_ibv_state(enum ibv_qp_state state)
+{
+ switch (state) {
+ case IBV_QPS_RESET:
+ case IBV_QPS_INIT:
+ case IBV_QPS_RTR:
+ case IBV_QPS_RTS:
+ case IBV_QPS_SQD:
+ case IBV_QPS_SQE:
+ case IBV_QPS_ERR:
+ return 0;
+ default:
+ return -1;
+ }
+}
+
+static inline enum spdk_nvme_media_error_status_code
+nvmf_rdma_dif_error_to_compl_status(uint8_t err_type) {
+ enum spdk_nvme_media_error_status_code result;
+ switch (err_type)
+ {
+ case SPDK_DIF_REFTAG_ERROR:
+ result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR;
+ break;
+ case SPDK_DIF_APPTAG_ERROR:
+ result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR;
+ break;
+ case SPDK_DIF_GUARD_ERROR:
+ result = SPDK_NVME_SC_GUARD_CHECK_ERROR;
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ }
+
+ return result;
+}
+
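+/* Query the current ibv_qp state, cache it on the qpair and emit a trace
+ * record when the state changes. Returns IBV_QPS_ERR + 1 if the query fails
+ * or the state is unexpected.
+ */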
+static enum ibv_qp_state
+nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) {
+ enum ibv_qp_state old_state, new_state;
+ struct ibv_qp_attr qp_attr;
+ struct ibv_qp_init_attr init_attr;
+ int rc;
+
+ old_state = rqpair->ibv_state;
+ rc = ibv_query_qp(rqpair->rdma_qp->qp, &qp_attr,
+ g_spdk_nvmf_ibv_query_mask, &init_attr);
+
+ if (rc)
+ {
+ SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n");
+ return IBV_QPS_ERR + 1;
+ }
+
+ new_state = qp_attr.qp_state;
+ rqpair->ibv_state = new_state;
+ qp_attr.ah_attr.port_num = qp_attr.port_num;
+
+ rc = nvmf_rdma_check_ibv_state(new_state);
+ if (rc)
+ {
+ SPDK_ERRLOG("QP#%d: bad state updated: %u, maybe hardware issue\n", rqpair->qpair.qid, new_state);
+ /*
+ * IBV_QPS_UNKNOWN undefined if lib version smaller than libibverbs-1.1.8
+ * IBV_QPS_UNKNOWN is the enum element after IBV_QPS_ERR
+ */
+ return IBV_QPS_ERR + 1;
+ }
+
+ if (old_state != new_state)
+ {
+ spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0,
+ (uintptr_t)rqpair->cm_id, new_state);
+ }
+ return new_state;
+}
+
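+/* Walk the chain of data WRs belonging to this request (they share the
+ * request's wr_id) and return any extra WRs to the transport's mempool.
+ */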
+static void
+nvmf_rdma_request_free_data(struct spdk_nvmf_rdma_request *rdma_req,
+ struct spdk_nvmf_rdma_transport *rtransport)
+{
+ struct spdk_nvmf_rdma_request_data *data_wr;
+ struct ibv_send_wr *next_send_wr;
+ uint64_t req_wrid;
+
+ rdma_req->num_outstanding_data_wr = 0;
+ data_wr = &rdma_req->data;
+ req_wrid = data_wr->wr.wr_id;
+ while (data_wr && data_wr->wr.wr_id == req_wrid) {
+ memset(data_wr->sgl, 0, sizeof(data_wr->wr.sg_list[0]) * data_wr->wr.num_sge);
+ data_wr->wr.num_sge = 0;
+ next_send_wr = data_wr->wr.next;
+ if (data_wr != &rdma_req->data) {
+ spdk_mempool_put(rtransport->data_wr_pool, data_wr);
+ }
+ data_wr = (!next_send_wr || next_send_wr == &rdma_req->rsp.wr) ? NULL :
+ SPDK_CONTAINEROF(next_send_wr, struct spdk_nvmf_rdma_request_data, wr);
+ }
+}
+
+static void
+nvmf_rdma_dump_request(struct spdk_nvmf_rdma_request *req)
+{
+ SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", req->req.data_from_pool);
+ if (req->req.cmd) {
+ SPDK_ERRLOG("\t\tRequest opcode: %d\n", req->req.cmd->nvmf_cmd.opcode);
+ }
+ if (req->recv) {
+ SPDK_ERRLOG("\t\tRequest recv wr_id%lu\n", req->recv->wr.wr_id);
+ }
+}
+
+static void
+nvmf_rdma_dump_qpair_contents(struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ int i;
+
+ SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", rqpair->qpair.qid);
+ for (i = 0; i < rqpair->max_queue_depth; i++) {
+ if (rqpair->resources->reqs[i].state != RDMA_REQUEST_STATE_FREE) {
+ nvmf_rdma_dump_request(&rqpair->resources->reqs[i]);
+ }
+ }
+}
+
+static void
+nvmf_rdma_resources_destroy(struct spdk_nvmf_rdma_resources *resources)
+{
+ if (resources->cmds_mr) {
+ ibv_dereg_mr(resources->cmds_mr);
+ }
+
+ if (resources->cpls_mr) {
+ ibv_dereg_mr(resources->cpls_mr);
+ }
+
+ if (resources->bufs_mr) {
+ ibv_dereg_mr(resources->bufs_mr);
+ }
+
+ spdk_free(resources->cmds);
+ spdk_free(resources->cpls);
+ spdk_free(resources->bufs);
+ free(resources->reqs);
+ free(resources->recvs);
+ free(resources);
+}
+
+
+static struct spdk_nvmf_rdma_resources *
+nvmf_rdma_resources_create(struct spdk_nvmf_rdma_resource_opts *opts)
+{
+ struct spdk_nvmf_rdma_resources *resources;
+ struct spdk_nvmf_rdma_request *rdma_req;
+ struct spdk_nvmf_rdma_recv *rdma_recv;
+ struct ibv_qp *qp;
+ struct ibv_srq *srq;
+ uint32_t i;
+ int rc;
+
+ resources = calloc(1, sizeof(struct spdk_nvmf_rdma_resources));
+ if (!resources) {
+ SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
+ return NULL;
+ }
+
+ resources->reqs = calloc(opts->max_queue_depth, sizeof(*resources->reqs));
+ resources->recvs = calloc(opts->max_queue_depth, sizeof(*resources->recvs));
+ resources->cmds = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cmds),
+ 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ resources->cpls = spdk_zmalloc(opts->max_queue_depth * sizeof(*resources->cpls),
+ 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+
+ if (opts->in_capsule_data_size > 0) {
+ resources->bufs = spdk_zmalloc(opts->max_queue_depth * opts->in_capsule_data_size,
+ 0x1000, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+ }
+
+ if (!resources->reqs || !resources->recvs || !resources->cmds ||
+ !resources->cpls || (opts->in_capsule_data_size && !resources->bufs)) {
+ SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n");
+ goto cleanup;
+ }
+
+ resources->cmds_mr = ibv_reg_mr(opts->pd, resources->cmds,
+ opts->max_queue_depth * sizeof(*resources->cmds),
+ IBV_ACCESS_LOCAL_WRITE);
+ resources->cpls_mr = ibv_reg_mr(opts->pd, resources->cpls,
+ opts->max_queue_depth * sizeof(*resources->cpls),
+ 0);
+
+ if (opts->in_capsule_data_size) {
+ resources->bufs_mr = ibv_reg_mr(opts->pd, resources->bufs,
+ opts->max_queue_depth *
+ opts->in_capsule_data_size,
+ IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+ }
+
+ if (!resources->cmds_mr || !resources->cpls_mr ||
+ (opts->in_capsule_data_size &&
+ !resources->bufs_mr)) {
+ goto cleanup;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n",
+ resources->cmds, opts->max_queue_depth * sizeof(*resources->cmds),
+ resources->cmds_mr->lkey);
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n",
+ resources->cpls, opts->max_queue_depth * sizeof(*resources->cpls),
+ resources->cpls_mr->lkey);
+ if (resources->bufs && resources->bufs_mr) {
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n",
+ resources->bufs, opts->max_queue_depth *
+ opts->in_capsule_data_size, resources->bufs_mr->lkey);
+ }
+
+ /* Initialize queues */
+ STAILQ_INIT(&resources->incoming_queue);
+ STAILQ_INIT(&resources->free_queue);
+
+ for (i = 0; i < opts->max_queue_depth; i++) {
+ struct ibv_recv_wr *bad_wr = NULL;
+
+ rdma_recv = &resources->recvs[i];
+ rdma_recv->qpair = opts->qpair;
+
+ /* Set up memory to receive commands */
+ if (resources->bufs) {
+ rdma_recv->buf = (void *)((uintptr_t)resources->bufs + (i *
+ opts->in_capsule_data_size));
+ }
+
+ rdma_recv->rdma_wr.type = RDMA_WR_TYPE_RECV;
+
+ rdma_recv->sgl[0].addr = (uintptr_t)&resources->cmds[i];
+ rdma_recv->sgl[0].length = sizeof(resources->cmds[i]);
+ rdma_recv->sgl[0].lkey = resources->cmds_mr->lkey;
+ rdma_recv->wr.num_sge = 1;
+
+ if (rdma_recv->buf && resources->bufs_mr) {
+ rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf;
+ rdma_recv->sgl[1].length = opts->in_capsule_data_size;
+ rdma_recv->sgl[1].lkey = resources->bufs_mr->lkey;
+ rdma_recv->wr.num_sge++;
+ }
+
+ rdma_recv->wr.wr_id = (uintptr_t)&rdma_recv->rdma_wr;
+ rdma_recv->wr.sg_list = rdma_recv->sgl;
+ if (opts->shared) {
+ srq = (struct ibv_srq *)opts->qp;
+ rc = ibv_post_srq_recv(srq, &rdma_recv->wr, &bad_wr);
+ } else {
+ qp = (struct ibv_qp *)opts->qp;
+ rc = ibv_post_recv(qp, &rdma_recv->wr, &bad_wr);
+ }
+ if (rc) {
+ goto cleanup;
+ }
+ }
+
+ for (i = 0; i < opts->max_queue_depth; i++) {
+ rdma_req = &resources->reqs[i];
+
+ if (opts->qpair != NULL) {
+ rdma_req->req.qpair = &opts->qpair->qpair;
+ } else {
+ rdma_req->req.qpair = NULL;
+ }
+ rdma_req->req.cmd = NULL;
+
+ /* Set up memory to send responses */
+ rdma_req->req.rsp = &resources->cpls[i];
+
+ rdma_req->rsp.sgl[0].addr = (uintptr_t)&resources->cpls[i];
+ rdma_req->rsp.sgl[0].length = sizeof(resources->cpls[i]);
+ rdma_req->rsp.sgl[0].lkey = resources->cpls_mr->lkey;
+
+ rdma_req->rsp.rdma_wr.type = RDMA_WR_TYPE_SEND;
+ rdma_req->rsp.wr.wr_id = (uintptr_t)&rdma_req->rsp.rdma_wr;
+ rdma_req->rsp.wr.next = NULL;
+ rdma_req->rsp.wr.opcode = IBV_WR_SEND;
+ rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED;
+ rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl;
+ rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl);
+
+ /* Set up memory for data buffers */
+ rdma_req->data.rdma_wr.type = RDMA_WR_TYPE_DATA;
+ rdma_req->data.wr.wr_id = (uintptr_t)&rdma_req->data.rdma_wr;
+ rdma_req->data.wr.next = NULL;
+ rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED;
+ rdma_req->data.wr.sg_list = rdma_req->data.sgl;
+ rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl);
+
+ /* Initialize request state to FREE */
+ rdma_req->state = RDMA_REQUEST_STATE_FREE;
+ STAILQ_INSERT_TAIL(&resources->free_queue, rdma_req, state_link);
+ }
+
+ return resources;
+
+cleanup:
+ nvmf_rdma_resources_destroy(resources);
+ return NULL;
+}
+
+static void
+nvmf_rdma_qpair_clean_ibv_events(struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ struct spdk_nvmf_rdma_ibv_event_ctx *ctx, *tctx;
+ STAILQ_FOREACH_SAFE(ctx, &rqpair->ibv_events, link, tctx) {
+ ctx->rqpair = NULL;
+ /* Memory allocated for ctx is freed in nvmf_rdma_qpair_process_ibv_event */
+ STAILQ_REMOVE(&rqpair->ibv_events, ctx, spdk_nvmf_rdma_ibv_event_ctx, link);
+ }
+}
+
+static void
+nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp;
+ struct ibv_recv_wr *bad_recv_wr = NULL;
+ int rc;
+
+ spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0);
+
+ spdk_poller_unregister(&rqpair->destruct_poller);
+
+ if (rqpair->qd != 0) {
+ struct spdk_nvmf_qpair *qpair = &rqpair->qpair;
+ struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(qpair->transport,
+ struct spdk_nvmf_rdma_transport, transport);
+ struct spdk_nvmf_rdma_request *req;
+ uint32_t i, max_req_count = 0;
+
+ SPDK_WARNLOG("Destroying qpair when queue depth is %d\n", rqpair->qd);
+
+ if (rqpair->srq == NULL) {
+ nvmf_rdma_dump_qpair_contents(rqpair);
+ max_req_count = rqpair->max_queue_depth;
+ } else if (rqpair->poller && rqpair->resources) {
+ max_req_count = rqpair->poller->max_srq_depth;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Release incomplete requests\n");
+ for (i = 0; i < max_req_count; i++) {
+ req = &rqpair->resources->reqs[i];
+ if (req->req.qpair == qpair && req->state != RDMA_REQUEST_STATE_FREE) {
+ /* nvmf_rdma_request_process checks qpair ibv and internal state
+ * and completes a request */
+ nvmf_rdma_request_process(rtransport, req);
+ }
+ }
+ assert(rqpair->qd == 0);
+ }
+
+ if (rqpair->poller) {
+ TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link);
+
+ if (rqpair->srq != NULL && rqpair->resources != NULL) {
+ /* Drop all received but unprocessed commands for this queue and return them to SRQ */
+ STAILQ_FOREACH_SAFE(rdma_recv, &rqpair->resources->incoming_queue, link, recv_tmp) {
+ if (rqpair == rdma_recv->qpair) {
+ STAILQ_REMOVE(&rqpair->resources->incoming_queue, rdma_recv, spdk_nvmf_rdma_recv, link);
+ rc = ibv_post_srq_recv(rqpair->srq, &rdma_recv->wr, &bad_recv_wr);
+ if (rc) {
+ SPDK_ERRLOG("Unable to re-post rx descriptor\n");
+ }
+ }
+ }
+ }
+ }
+
+ if (rqpair->cm_id) {
+ if (rqpair->rdma_qp != NULL) {
+ spdk_rdma_qp_destroy(rqpair->rdma_qp);
+ rqpair->rdma_qp = NULL;
+ }
+ rdma_destroy_id(rqpair->cm_id);
+
+ if (rqpair->poller != NULL && rqpair->srq == NULL) {
+ rqpair->poller->required_num_wr -= MAX_WR_PER_QP(rqpair->max_queue_depth);
+ }
+ }
+
+ if (rqpair->srq == NULL && rqpair->resources != NULL) {
+ nvmf_rdma_resources_destroy(rqpair->resources);
+ }
+
+ nvmf_rdma_qpair_clean_ibv_events(rqpair);
+
+ if (rqpair->destruct_channel) {
+ spdk_put_io_channel(rqpair->destruct_channel);
+ rqpair->destruct_channel = NULL;
+ }
+
+ free(rqpair);
+}
+
+static int
+nvmf_rdma_resize_cq(struct spdk_nvmf_rdma_qpair *rqpair, struct spdk_nvmf_rdma_device *device)
+{
+ struct spdk_nvmf_rdma_poller *rpoller;
+ int rc, num_cqe, required_num_wr;
+
+ /* Enlarge CQ size dynamically */
+ rpoller = rqpair->poller;
+ required_num_wr = rpoller->required_num_wr + MAX_WR_PER_QP(rqpair->max_queue_depth);
+ num_cqe = rpoller->num_cqe;
+ if (num_cqe < required_num_wr) {
+ num_cqe = spdk_max(num_cqe * 2, required_num_wr);
+ num_cqe = spdk_min(num_cqe, device->attr.max_cqe);
+ }
+
+ if (rpoller->num_cqe != num_cqe) {
+ if (required_num_wr > device->attr.max_cqe) {
+ SPDK_ERRLOG("RDMA CQE requirement (%d) exceeds device max_cqe limitation (%d)\n",
+ required_num_wr, device->attr.max_cqe);
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Resize RDMA CQ from %d to %d\n", rpoller->num_cqe, num_cqe);
+ rc = ibv_resize_cq(rpoller->cq, num_cqe);
+ if (rc) {
+ SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno));
+ return -1;
+ }
+
+ rpoller->num_cqe = num_cqe;
+ }
+
+ rpoller->required_num_wr = required_num_wr;
+ return 0;
+}
+
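+/* Create the RDMA queue pair for a newly accepted connection and allocate
+ * per-qpair receive resources, or attach the poller's shared SRQ resources.
+ */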
+static int
+nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_transport *transport;
+ struct spdk_nvmf_rdma_resource_opts opts;
+ struct spdk_nvmf_rdma_device *device;
+ struct spdk_rdma_qp_init_attr qp_init_attr = {};
+
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+ device = rqpair->device;
+
+ qp_init_attr.qp_context = rqpair;
+ qp_init_attr.pd = device->pd;
+ qp_init_attr.send_cq = rqpair->poller->cq;
+ qp_init_attr.recv_cq = rqpair->poller->cq;
+
+ if (rqpair->srq) {
+ qp_init_attr.srq = rqpair->srq;
+ } else {
+ qp_init_attr.cap.max_recv_wr = rqpair->max_queue_depth;
+ }
+
+ /* SEND, READ, and WRITE operations */
+ qp_init_attr.cap.max_send_wr = (uint32_t)rqpair->max_queue_depth * 2;
+ qp_init_attr.cap.max_send_sge = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_TX_SGE);
+ qp_init_attr.cap.max_recv_sge = spdk_min((uint32_t)device->attr.max_sge, NVMF_DEFAULT_RX_SGE);
+
+ if (rqpair->srq == NULL && nvmf_rdma_resize_cq(rqpair, device) < 0) {
+ SPDK_ERRLOG("Failed to resize the completion queue. Cannot initialize qpair.\n");
+ goto error;
+ }
+
+ rqpair->rdma_qp = spdk_rdma_qp_create(rqpair->cm_id, &qp_init_attr);
+ if (!rqpair->rdma_qp) {
+ goto error;
+ }
+
+ rqpair->max_send_depth = spdk_min((uint32_t)(rqpair->max_queue_depth * 2),
+ qp_init_attr.cap.max_send_wr);
+ rqpair->max_send_sge = spdk_min(NVMF_DEFAULT_TX_SGE, qp_init_attr.cap.max_send_sge);
+ rqpair->max_recv_sge = spdk_min(NVMF_DEFAULT_RX_SGE, qp_init_attr.cap.max_recv_sge);
+ spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0);
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair);
+
+ if (rqpair->poller->srq == NULL) {
+ rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
+ transport = &rtransport->transport;
+
+ opts.qp = rqpair->rdma_qp->qp;
+ opts.pd = rqpair->cm_id->pd;
+ opts.qpair = rqpair;
+ opts.shared = false;
+ opts.max_queue_depth = rqpair->max_queue_depth;
+ opts.in_capsule_data_size = transport->opts.in_capsule_data_size;
+
+ rqpair->resources = nvmf_rdma_resources_create(&opts);
+
+ if (!rqpair->resources) {
+ SPDK_ERRLOG("Unable to allocate resources for receive queue.\n");
+ rdma_destroy_qp(rqpair->cm_id);
+ goto error;
+ }
+ } else {
+ rqpair->resources = rqpair->poller->resources;
+ }
+
+ rqpair->current_recv_depth = 0;
+ STAILQ_INIT(&rqpair->pending_rdma_read_queue);
+ STAILQ_INIT(&rqpair->pending_rdma_write_queue);
+
+ return 0;
+
+error:
+ rdma_destroy_id(rqpair->cm_id);
+ rqpair->cm_id = NULL;
+ return -1;
+}
+
+/* Append the given recv wr structure to the resource struct's outstanding recvs list.
+ * This function accepts either a single wr or the first wr in a linked list. */
+static void
+nvmf_rdma_qpair_queue_recv_wrs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *first)
+{
+ struct ibv_recv_wr *last;
+
+ last = first;
+ while (last->next != NULL) {
+ last = last->next;
+ }
+
+ if (rqpair->resources->recvs_to_post.first == NULL) {
+ rqpair->resources->recvs_to_post.first = first;
+ rqpair->resources->recvs_to_post.last = last;
+ if (rqpair->srq == NULL) {
+ STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_recv, rqpair, recv_link);
+ }
+ } else {
+ rqpair->resources->recvs_to_post.last->next = first;
+ rqpair->resources->recvs_to_post.last = last;
+ }
+}
+
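+/* Queue the RDMA READ work requests needed to pull host-to-controller data
+ * into the target's buffers.
+ */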
+static int
+request_transfer_in(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_rdma_request *rdma_req;
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_rdma_qpair *rqpair;
+
+ qpair = req->qpair;
+ rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER);
+ assert(rdma_req != NULL);
+
+ if (spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, &rdma_req->data.wr)) {
+ STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
+ }
+
+ rqpair->current_read_depth += rdma_req->num_outstanding_data_wr;
+ rqpair->current_send_depth += rdma_req->num_outstanding_data_wr;
+ return 0;
+}
+
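+/* Queue the completion SEND for a request, preceded by RDMA WRITE work
+ * requests when controller-to-host data must be transferred, and re-post the
+ * receive buffer that carried the command.
+ */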
+static int
+request_transfer_out(struct spdk_nvmf_request *req, int *data_posted)
+{
+ int num_outstanding_data_wr = 0;
+ struct spdk_nvmf_rdma_request *rdma_req;
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct spdk_nvme_cpl *rsp;
+ struct ibv_send_wr *first = NULL;
+
+ *data_posted = 0;
+ qpair = req->qpair;
+ rsp = &req->rsp->nvme_cpl;
+ rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ /* Advance our sq_head pointer */
+ if (qpair->sq_head == qpair->sq_head_max) {
+ qpair->sq_head = 0;
+ } else {
+ qpair->sq_head++;
+ }
+ rsp->sqhd = qpair->sq_head;
+
+ /* Queue the recv buffer to be re-posted so it can receive a new capsule */
+ assert(rdma_req->recv != NULL);
+
+ nvmf_rdma_qpair_queue_recv_wrs(rqpair, &rdma_req->recv->wr);
+
+ rdma_req->recv = NULL;
+ assert(rqpair->current_recv_depth > 0);
+ rqpair->current_recv_depth--;
+
+ /* Build the response which consists of optional
+ * RDMA WRITEs to transfer data, plus an RDMA SEND
+ * containing the response.
+ */
+ first = &rdma_req->rsp.wr;
+
+ if (rsp->status.sc != SPDK_NVME_SC_SUCCESS) {
+ /* On failure, data was not read from the controller. So clear the
+ * number of outstanding data WRs to zero.
+ */
+ rdma_req->num_outstanding_data_wr = 0;
+ } else if (req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ first = &rdma_req->data.wr;
+ *data_posted = 1;
+ num_outstanding_data_wr = rdma_req->num_outstanding_data_wr;
+ }
+ if (spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, first)) {
+ STAILQ_INSERT_TAIL(&rqpair->poller->qpairs_pending_send, rqpair, send_link);
+ }
+
+ /* +1 for the rsp wr */
+ rqpair->current_send_depth += num_outstanding_data_wr + 1;
+
+ return 0;
+}
+
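+/* Accept an incoming RDMA CM connection, advertising the negotiated queue
+ * depth in the NVMe-oF accept private data.
+ */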
+static int
+nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ struct spdk_nvmf_rdma_accept_private_data accept_data;
+ struct rdma_conn_param ctrlr_event_data = {};
+ int rc;
+
+ accept_data.recfmt = 0;
+ accept_data.crqsize = rqpair->max_queue_depth;
+
+ ctrlr_event_data.private_data = &accept_data;
+ ctrlr_event_data.private_data_len = sizeof(accept_data);
+ if (id->ps == RDMA_PS_TCP) {
+ ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */
+ ctrlr_event_data.initiator_depth = rqpair->max_read_depth;
+ }
+
+ /* Configure infinite retries for the initiator side qpair.
+ * When using a shared receive queue on the target side,
+ * we need to pass this value to the initiator to prevent the
+ * initiator side NIC from completing SEND requests back to the
+ * initiator with status rnr_retry_count_exceeded. */
+ if (rqpair->srq != NULL) {
+ ctrlr_event_data.rnr_retry_count = 0x7;
+ }
+
+ /* When the qpair is created without the rdma cm API, additional
+ * information must be provided to the initiator in the connection response:
+ * whether the qpair uses an SRQ, and its qp_num.
+ * The fields below are ignored by rdma cm if the qpair was
+ * created through the rdma cm API. */
+ ctrlr_event_data.srq = rqpair->srq ? 1 : 0;
+ ctrlr_event_data.qp_num = rqpair->rdma_qp->qp->qp_num;
+
+ rc = spdk_rdma_qp_accept(rqpair->rdma_qp, &ctrlr_event_data);
+ if (rc) {
+ SPDK_ERRLOG("Error %d on spdk_rdma_qp_accept\n", errno);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n");
+ }
+
+ return rc;
+}
+
+static void
+nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error)
+{
+ struct spdk_nvmf_rdma_reject_private_data rej_data;
+
+ rej_data.recfmt = 0;
+ rej_data.sts = error;
+
+ rdma_reject(id, &rej_data, sizeof(rej_data));
+}
+
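+/* Handle an RDMA CM connect request: validate the NVMe-oF private data,
+ * negotiate queue depths against hardware and host limits, and hand a new
+ * qpair to the target layer.
+ */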
+static int
+nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event)
+{
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_qpair *rqpair = NULL;
+ struct spdk_nvmf_rdma_port *port;
+ struct rdma_conn_param *rdma_param = NULL;
+ const struct spdk_nvmf_rdma_request_private_data *private_data = NULL;
+ uint16_t max_queue_depth;
+ uint16_t max_read_depth;
+
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+
+ assert(event->id != NULL); /* Impossible. Can't even reject the connection. */
+ assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */
+
+ rdma_param = &event->param.conn;
+ if (rdma_param->private_data == NULL ||
+ rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) {
+ SPDK_ERRLOG("connect request: no private data provided\n");
+ nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH);
+ return -1;
+ }
+
+ private_data = rdma_param->private_data;
+ if (private_data->recfmt != 0) {
+ SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n");
+ nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT);
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n",
+ event->id->verbs->device->name, event->id->verbs->device->dev_name);
+
+ port = event->listen_id->context;
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n",
+ event->listen_id, event->listen_id->verbs, port);
+
+ /* Figure out the supported queue depth. This is a multi-step process
+ * that takes into account hardware maximums, host-provided values,
+ * and our target's internal memory limits. */
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n");
+
+ /* Start with the maximum queue depth allowed by the target */
+ max_queue_depth = rtransport->transport.opts.max_queue_depth;
+ max_read_depth = rtransport->transport.opts.max_queue_depth;
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n",
+ rtransport->transport.opts.max_queue_depth);
+
+ /* Next check the local NIC's hardware limitations */
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA,
+ "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n",
+ port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom);
+ max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr);
+ max_read_depth = spdk_min(max_read_depth, port->device->attr.max_qp_init_rd_atom);
+
+ /* Next check the remote NIC's hardware limitations */
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA,
+ "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n",
+ rdma_param->initiator_depth, rdma_param->responder_resources);
+ if (rdma_param->initiator_depth > 0) {
+ max_read_depth = spdk_min(max_read_depth, rdma_param->initiator_depth);
+ }
+
+ /* Finally check for the host software requested values, which are
+ * optional. */
+ if (rdma_param->private_data != NULL &&
+ rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) {
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize);
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize);
+ max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize);
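+ /* hsqsize is a 0's based value, so add 1 to compare it against the 1's based queue depth. */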
+ max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n",
+ max_queue_depth, max_read_depth);
+
+ rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair));
+ if (rqpair == NULL) {
+ SPDK_ERRLOG("Could not allocate new connection.\n");
+ nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
+ return -1;
+ }
+
+ rqpair->device = port->device;
+ rqpair->max_queue_depth = max_queue_depth;
+ rqpair->max_read_depth = max_read_depth;
+ rqpair->cm_id = event->id;
+ rqpair->listen_id = event->listen_id;
+ rqpair->qpair.transport = transport;
+ STAILQ_INIT(&rqpair->ibv_events);
+ /* Use the qid from the private data to determine the qpair type.
+ * The qid will be set to the appropriate value when the controller is created. */
+ rqpair->qpair.qid = private_data->qid;
+
+ event->id->context = &rqpair->qpair;
+
+ spdk_nvmf_tgt_new_qpair(transport->tgt, &rqpair->qpair);
+
+ return 0;
+}
+
+static int
+nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t size)
+{
+ struct ibv_pd *pd = cb_ctx;
+ struct ibv_mr *mr;
+ int rc;
+
+ switch (action) {
+ case SPDK_MEM_MAP_NOTIFY_REGISTER:
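+ /* Register the region with the RDMA device unless an external hook supplies the memory keys. */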
+ if (!g_nvmf_hooks.get_rkey) {
+ mr = ibv_reg_mr(pd, vaddr, size,
+ IBV_ACCESS_LOCAL_WRITE |
+ IBV_ACCESS_REMOTE_READ |
+ IBV_ACCESS_REMOTE_WRITE);
+ if (mr == NULL) {
+ SPDK_ERRLOG("ibv_reg_mr() failed\n");
+ return -1;
+ } else {
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
+ }
+ } else {
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
+ g_nvmf_hooks.get_rkey(pd, vaddr, size));
+ }
+ break;
+ case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
+ if (!g_nvmf_hooks.get_rkey) {
+ mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
+ if (mr) {
+ ibv_dereg_mr(mr);
+ }
+ }
+ rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ }
+
+ return rc;
+}
+
+static int
+nvmf_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
+{
+ /* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
+ return addr_1 == addr_2;
+}
+
+static inline void
+nvmf_rdma_setup_wr(struct ibv_send_wr *wr, struct ibv_send_wr *next,
+ enum spdk_nvme_data_transfer xfer)
+{
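+ /* RDMA WRITEs are left unsignaled here; their completion is implied by the signaled
+ * response SEND that follows. RDMA READs are signaled and terminate the chain so we
+ * know when the host data has arrived. */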
+ if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ wr->opcode = IBV_WR_RDMA_WRITE;
+ wr->send_flags = 0;
+ wr->next = next;
+ } else if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+ wr->opcode = IBV_WR_RDMA_READ;
+ wr->send_flags = IBV_SEND_SIGNALED;
+ wr->next = NULL;
+ } else {
+ assert(0);
+ }
+}
+
+static int
+nvmf_request_alloc_wrs(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_request *rdma_req,
+ uint32_t num_sgl_descriptors)
+{
+ struct spdk_nvmf_rdma_request_data *work_requests[SPDK_NVMF_MAX_SGL_ENTRIES];
+ struct spdk_nvmf_rdma_request_data *current_data_wr;
+ uint32_t i;
+
+ if (num_sgl_descriptors > SPDK_NVMF_MAX_SGL_ENTRIES) {
+ SPDK_ERRLOG("Requested too much entries (%u), the limit is %u\n",
+ num_sgl_descriptors, SPDK_NVMF_MAX_SGL_ENTRIES);
+ return -EINVAL;
+ }
+
+ if (spdk_mempool_get_bulk(rtransport->data_wr_pool, (void **)work_requests, num_sgl_descriptors)) {
+ return -ENOMEM;
+ }
+
+ current_data_wr = &rdma_req->data;
+
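+ /* Chain the newly allocated data WRs onto the request's embedded data WR,
+ * reusing its wr_id so completions map back to this request. */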
+ for (i = 0; i < num_sgl_descriptors; i++) {
+ nvmf_rdma_setup_wr(&current_data_wr->wr, &work_requests[i]->wr, rdma_req->req.xfer);
+ current_data_wr->wr.next = &work_requests[i]->wr;
+ current_data_wr = work_requests[i];
+ current_data_wr->wr.sg_list = current_data_wr->sgl;
+ current_data_wr->wr.wr_id = rdma_req->data.wr.wr_id;
+ }
+
+ nvmf_rdma_setup_wr(&current_data_wr->wr, &rdma_req->rsp.wr, rdma_req->req.xfer);
+
+ return 0;
+}
+
+static inline void
+nvmf_rdma_setup_request(struct spdk_nvmf_rdma_request *rdma_req)
+{
+ struct ibv_send_wr *wr = &rdma_req->data.wr;
+ struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1;
+
+ wr->wr.rdma.rkey = sgl->keyed.key;
+ wr->wr.rdma.remote_addr = sgl->address;
+ nvmf_rdma_setup_wr(wr, &rdma_req->rsp.wr, rdma_req->req.xfer);
+}
+
+static inline void
+nvmf_rdma_update_remote_addr(struct spdk_nvmf_rdma_request *rdma_req, uint32_t num_wrs)
+{
+ struct ibv_send_wr *wr = &rdma_req->data.wr;
+ struct spdk_nvme_sgl_descriptor *sgl = &rdma_req->req.cmd->nvme_cmd.dptr.sgl1;
+ uint32_t i;
+ int j;
+ uint64_t remote_addr_offset = 0;
+
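+ /* Walk the chained WRs, advancing the remote address by the number of bytes covered by each WR's SGEs. */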
+ for (i = 0; i < num_wrs; ++i) {
+ wr->wr.rdma.rkey = sgl->keyed.key;
+ wr->wr.rdma.remote_addr = sgl->address + remote_addr_offset;
+ for (j = 0; j < wr->num_sge; ++j) {
+ remote_addr_offset += wr->sg_list[j].length;
+ }
+ wr = wr->next;
+ }
+}
+
+/* This function is used in the rare case that we have a buffer split over multiple memory regions. */
+static int
+nvmf_rdma_replace_buffer(struct spdk_nvmf_rdma_poll_group *rgroup, void **buf)
+{
+ struct spdk_nvmf_transport_poll_group *group = &rgroup->group;
+ struct spdk_nvmf_transport *transport = group->transport;
+ struct spdk_nvmf_transport_pg_cache_buf *old_buf;
+ void *new_buf;
+
+ if (!(STAILQ_EMPTY(&group->buf_cache))) {
+ group->buf_cache_count--;
+ new_buf = STAILQ_FIRST(&group->buf_cache);
+ STAILQ_REMOVE_HEAD(&group->buf_cache, link);
+ assert(*buf != NULL);
+ } else {
+ new_buf = spdk_mempool_get(transport->data_buf_pool);
+ }
+
+ if (new_buf == NULL) {
+ return -ENOMEM;
+ }
+
+ old_buf = *buf;
+ STAILQ_INSERT_HEAD(&rgroup->retired_bufs, old_buf, link);
+ *buf = new_buf;
+ return 0;
+}
+
+static bool
+nvmf_rdma_get_lkey(struct spdk_nvmf_rdma_device *device, struct iovec *iov,
+ uint32_t *_lkey)
+{
+ uint64_t translation_len;
+ uint32_t lkey;
+
+ translation_len = iov->iov_len;
+
+ if (!g_nvmf_hooks.get_rkey) {
+ lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map,
+ (uint64_t)iov->iov_base, &translation_len))->lkey;
+ } else {
+ lkey = spdk_mem_map_translate(device->map,
+ (uint64_t)iov->iov_base, &translation_len);
+ }
+
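+ /* A translation shorter than the iov means the buffer straddles two memory regions. */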
+ if (spdk_unlikely(translation_len < iov->iov_len)) {
+ return false;
+ }
+
+ *_lkey = lkey;
+ return true;
+}
+
+static bool
+nvmf_rdma_fill_wr_sge(struct spdk_nvmf_rdma_device *device,
+ struct iovec *iov, struct ibv_send_wr **_wr,
+ uint32_t *_remaining_data_block, uint32_t *_offset,
+ uint32_t *_num_extra_wrs,
+ const struct spdk_dif_ctx *dif_ctx)
+{
+ struct ibv_send_wr *wr = *_wr;
+ struct ibv_sge *sg_ele = &wr->sg_list[wr->num_sge];
+ uint32_t lkey = 0;
+ uint32_t remaining, data_block_size, md_size, sge_len;
+
+ if (spdk_unlikely(!nvmf_rdma_get_lkey(device, iov, &lkey))) {
+ /* This is a very rare case that can occur when using DPDK version < 19.05 */
+ SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions. Removing it from circulation.\n");
+ return false;
+ }
+
+ if (spdk_likely(!dif_ctx)) {
+ sg_ele->lkey = lkey;
+ sg_ele->addr = (uintptr_t)(iov->iov_base);
+ sg_ele->length = iov->iov_len;
+ wr->num_sge++;
+ } else {
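+ /* DIF path: split the iov into per-data-block SGEs so the interleaved metadata can be skipped. */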
+ remaining = iov->iov_len - *_offset;
+ data_block_size = dif_ctx->block_size - dif_ctx->md_size;
+ md_size = dif_ctx->md_size;
+
+ while (remaining) {
+ if (wr->num_sge >= SPDK_NVMF_MAX_SGL_ENTRIES) {
+ if (*_num_extra_wrs > 0 && wr->next) {
+ *_wr = wr->next;
+ wr = *_wr;
+ wr->num_sge = 0;
+ sg_ele = &wr->sg_list[wr->num_sge];
+ (*_num_extra_wrs)--;
+ } else {
+ break;
+ }
+ }
+ sg_ele->lkey = lkey;
+ sg_ele->addr = (uintptr_t)((char *)iov->iov_base + *_offset);
+ sge_len = spdk_min(remaining, *_remaining_data_block);
+ sg_ele->length = sge_len;
+ remaining -= sge_len;
+ *_remaining_data_block -= sge_len;
+ *_offset += sge_len;
+
+ sg_ele++;
+ wr->num_sge++;
+
+ if (*_remaining_data_block == 0) {
+ /* skip metadata */
+ *_offset += md_size;
+ /* Metadata that does not fit in this IO buffer will be included in the next IO buffer */
+ remaining -= spdk_min(remaining, md_size);
+ *_remaining_data_block = data_block_size;
+ }
+
+ if (remaining == 0) {
+ /* By subtracting the size of the last IOV from the offset, we ensure that we skip
+ the remaining metadata bits at the beginning of the next buffer */
+ *_offset -= iov->iov_len;
+ }
+ }
+ }
+
+ return true;
+}
+
+static int
+nvmf_rdma_fill_wr_sgl(struct spdk_nvmf_rdma_poll_group *rgroup,
+ struct spdk_nvmf_rdma_device *device,
+ struct spdk_nvmf_rdma_request *rdma_req,
+ struct ibv_send_wr *wr,
+ uint32_t length,
+ uint32_t num_extra_wrs)
+{
+ struct spdk_nvmf_request *req = &rdma_req->req;
+ struct spdk_dif_ctx *dif_ctx = NULL;
+ uint32_t remaining_data_block = 0;
+ uint32_t offset = 0;
+
+ if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) {
+ dif_ctx = &rdma_req->req.dif.dif_ctx;
+ remaining_data_block = dif_ctx->block_size - dif_ctx->md_size;
+ }
+
+ wr->num_sge = 0;
+
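+ /* Walk the request's iovs, filling SGEs; if a buffer straddles two memory regions,
+ * swap it for a fresh one and retry. */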
+ while (length && (num_extra_wrs || wr->num_sge < SPDK_NVMF_MAX_SGL_ENTRIES)) {
+ while (spdk_unlikely(!nvmf_rdma_fill_wr_sge(device, &req->iov[rdma_req->iovpos], &wr,
+ &remaining_data_block, &offset, &num_extra_wrs, dif_ctx))) {
+ if (nvmf_rdma_replace_buffer(rgroup, &req->buffers[rdma_req->iovpos]) == -ENOMEM) {
+ return -ENOMEM;
+ }
+ req->iov[rdma_req->iovpos].iov_base = (void *)((uintptr_t)(req->buffers[rdma_req->iovpos] +
+ NVMF_DATA_BUFFER_MASK) &
+ ~NVMF_DATA_BUFFER_MASK);
+ }
+
+ length -= req->iov[rdma_req->iovpos].iov_len;
+ rdma_req->iovpos++;
+ }
+
+ if (length) {
+ SPDK_ERRLOG("Not enough SG entries to hold data buffer\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline uint32_t
+nvmf_rdma_calc_num_wrs(uint32_t length, uint32_t io_unit_size, uint32_t block_size)
+{
+ /* estimate the number of SG entries and WRs needed to process the request */
+ uint32_t num_sge = 0;
+ uint32_t i;
+ uint32_t num_buffers = SPDK_CEIL_DIV(length, io_unit_size);
+
+ for (i = 0; i < num_buffers && length > 0; i++) {
+ uint32_t buffer_len = spdk_min(length, io_unit_size);
+ uint32_t num_sge_in_block = SPDK_CEIL_DIV(buffer_len, block_size);
+
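+ /* If the buffer does not end on a block boundary, the trailing partial block needs an extra SGE. */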
+ if (num_sge_in_block * block_size > buffer_len) {
+ ++num_sge_in_block;
+ }
+ num_sge += num_sge_in_block;
+ length -= buffer_len;
+ }
+ return SPDK_CEIL_DIV(num_sge, SPDK_NVMF_MAX_SGL_ENTRIES);
+}
+
+static int
+nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_device *device,
+ struct spdk_nvmf_rdma_request *rdma_req,
+ uint32_t length)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+ struct spdk_nvmf_request *req = &rdma_req->req;
+ struct ibv_send_wr *wr = &rdma_req->data.wr;
+ int rc;
+ uint32_t num_wrs = 1;
+
+ rqpair = SPDK_CONTAINEROF(req->qpair, struct spdk_nvmf_rdma_qpair, qpair);
+ rgroup = rqpair->poller->group;
+
+ /* rdma wr specifics */
+ nvmf_rdma_setup_request(rdma_req);
+
+ rc = spdk_nvmf_request_get_buffers(req, &rgroup->group, &rtransport->transport,
+ length);
+ if (rc != 0) {
+ return rc;
+ }
+
+ assert(req->iovcnt <= rqpair->max_send_sge);
+
+ rdma_req->iovpos = 0;
+
+ if (spdk_unlikely(req->dif.dif_insert_or_strip)) {
+ num_wrs = nvmf_rdma_calc_num_wrs(length, rtransport->transport.opts.io_unit_size,
+ req->dif.dif_ctx.block_size);
+ if (num_wrs > 1) {
+ rc = nvmf_request_alloc_wrs(rtransport, rdma_req, num_wrs - 1);
+ if (rc != 0) {
+ goto err_exit;
+ }
+ }
+ }
+
+ rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, wr, length, num_wrs - 1);
+ if (spdk_unlikely(rc != 0)) {
+ goto err_exit;
+ }
+
+ if (spdk_unlikely(num_wrs > 1)) {
+ nvmf_rdma_update_remote_addr(rdma_req, num_wrs);
+ }
+
+ /* set the number of outstanding data WRs for this request. */
+ rdma_req->num_outstanding_data_wr = num_wrs;
+
+ return rc;
+
+err_exit:
+ spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport);
+ nvmf_rdma_request_free_data(rdma_req, rtransport);
+ req->iovcnt = 0;
+ return rc;
+}
+
+static int
+nvmf_rdma_request_fill_iovs_multi_sgl(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_device *device,
+ struct spdk_nvmf_rdma_request *rdma_req)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+ struct ibv_send_wr *current_wr;
+ struct spdk_nvmf_request *req = &rdma_req->req;
+ struct spdk_nvme_sgl_descriptor *inline_segment, *desc;
+ uint32_t num_sgl_descriptors;
+ uint32_t lengths[SPDK_NVMF_MAX_SGL_ENTRIES];
+ uint32_t i;
+ int rc;
+
+ rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
+ rgroup = rqpair->poller->group;
+
+ inline_segment = &req->cmd->nvme_cmd.dptr.sgl1;
+ assert(inline_segment->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT);
+ assert(inline_segment->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET);
+
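+ /* The last-segment descriptor's length gives the size of the in-capsule descriptor list. */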
+ num_sgl_descriptors = inline_segment->unkeyed.length / sizeof(struct spdk_nvme_sgl_descriptor);
+ assert(num_sgl_descriptors <= SPDK_NVMF_MAX_SGL_ENTRIES);
+
+ if (nvmf_request_alloc_wrs(rtransport, rdma_req, num_sgl_descriptors - 1) != 0) {
+ return -ENOMEM;
+ }
+
+ desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address;
+ for (i = 0; i < num_sgl_descriptors; i++) {
+ if (spdk_likely(!req->dif.dif_insert_or_strip)) {
+ lengths[i] = desc->keyed.length;
+ } else {
+ req->dif.orig_length += desc->keyed.length;
+ lengths[i] = spdk_dif_get_length_with_md(desc->keyed.length, &req->dif.dif_ctx);
+ req->dif.elba_length += lengths[i];
+ }
+ desc++;
+ }
+
+ rc = spdk_nvmf_request_get_buffers_multi(req, &rgroup->group, &rtransport->transport,
+ lengths, num_sgl_descriptors);
+ if (rc != 0) {
+ nvmf_rdma_request_free_data(rdma_req, rtransport);
+ return rc;
+ }
+
+ /* The first WR must always be the embedded data WR. This is how we unwind them later. */
+ current_wr = &rdma_req->data.wr;
+ assert(current_wr != NULL);
+
+ req->length = 0;
+ rdma_req->iovpos = 0;
+ desc = (struct spdk_nvme_sgl_descriptor *)rdma_req->recv->buf + inline_segment->address;
+ for (i = 0; i < num_sgl_descriptors; i++) {
+ /* The descriptors must be keyed data block descriptors with an address, not an offset. */
+ if (spdk_unlikely(desc->generic.type != SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK ||
+ desc->keyed.subtype != SPDK_NVME_SGL_SUBTYPE_ADDRESS)) {
+ rc = -EINVAL;
+ goto err_exit;
+ }
+
+ current_wr->num_sge = 0;
+
+ rc = nvmf_rdma_fill_wr_sgl(rgroup, device, rdma_req, current_wr, lengths[i], 0);
+ if (rc != 0) {
+ rc = -ENOMEM;
+ goto err_exit;
+ }
+
+ req->length += desc->keyed.length;
+ current_wr->wr.rdma.rkey = desc->keyed.key;
+ current_wr->wr.rdma.remote_addr = desc->address;
+ current_wr = current_wr->next;
+ desc++;
+ }
+
+#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
+ /* Go back to the last descriptor in the list. */
+ desc--;
+ if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
+ if (desc->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
+ rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
+ rdma_req->rsp.wr.imm_data = desc->keyed.key;
+ }
+ }
+#endif
+
+ rdma_req->num_outstanding_data_wr = num_sgl_descriptors;
+
+ return 0;
+
+err_exit:
+ spdk_nvmf_request_free_buffers(req, &rgroup->group, &rtransport->transport);
+ nvmf_rdma_request_free_data(rdma_req, rtransport);
+ return rc;
+}
+
+static int
+nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_device *device,
+ struct spdk_nvmf_rdma_request *rdma_req)
+{
+ struct spdk_nvmf_request *req = &rdma_req->req;
+ struct spdk_nvme_cpl *rsp;
+ struct spdk_nvme_sgl_descriptor *sgl;
+ int rc;
+ uint32_t length;
+
+ rsp = &req->rsp->nvme_cpl;
+ sgl = &req->cmd->nvme_cmd.dptr.sgl1;
+
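+ /* Three SGL layouts are handled below: a single keyed data block (RDMA read/write),
+ * in-capsule data (an offset into the capsule), and a last-segment descriptor that
+ * points to a list of keyed data blocks carried in the capsule. */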
+ if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK &&
+ (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS ||
+ sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) {
+
+ length = sgl->keyed.length;
+ if (length > rtransport->transport.opts.max_io_size) {
+ SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
+ length, rtransport->transport.opts.max_io_size);
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return -1;
+ }
+#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
+ if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) {
+ if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) {
+ rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV;
+ rdma_req->rsp.wr.imm_data = sgl->keyed.key;
+ }
+ }
+#endif
+
+ /* fill request length and populate iovs */
+ req->length = length;
+
+ if (spdk_unlikely(req->dif.dif_insert_or_strip)) {
+ req->dif.orig_length = length;
+ length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx);
+ req->dif.elba_length = length;
+ }
+
+ rc = nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req, length);
+ if (spdk_unlikely(rc < 0)) {
+ if (rc == -EINVAL) {
+ SPDK_ERRLOG("SGL length exceeds the max I/O size\n");
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return -1;
+ }
+ /* No available buffers. Queue this request up. */
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
+ return 0;
+ }
+
+ /* backward compatible */
+ req->data = req->iov[0].iov_base;
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req,
+ req->iovcnt);
+
+ return 0;
+ } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
+ sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
+ uint64_t offset = sgl->address;
+ uint32_t max_len = rtransport->transport.opts.in_capsule_data_size;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
+ offset, sgl->unkeyed.length);
+
+ if (offset > max_len) {
+ SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
+ offset, max_len);
+ rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
+ return -1;
+ }
+ max_len -= (uint32_t)offset;
+
+ if (sgl->unkeyed.length > max_len) {
+ SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
+ sgl->unkeyed.length, max_len);
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return -1;
+ }
+
+ rdma_req->num_outstanding_data_wr = 0;
+ req->data = rdma_req->recv->buf + offset;
+ req->data_from_pool = false;
+ req->length = sgl->unkeyed.length;
+
+ req->iov[0].iov_base = req->data;
+ req->iov[0].iov_len = req->length;
+ req->iovcnt = 1;
+
+ return 0;
+ } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_LAST_SEGMENT &&
+ sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
+
+ rc = nvmf_rdma_request_fill_iovs_multi_sgl(rtransport, device, rdma_req);
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req);
+ return 0;
+ } else if (rc == -EINVAL) {
+ SPDK_ERRLOG("Multi SGL element request length exceeds the max I/O size\n");
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return -1;
+ }
+
+ /* backward compatible */
+ req->data = req->iov[0].iov_base;
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req,
+ req->iovcnt);
+
+ return 0;
+ }
+
+ SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
+ sgl->generic.type, sgl->generic.subtype);
+ rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
+ return -1;
+}
+
+static void
+_nvmf_rdma_request_free(struct spdk_nvmf_rdma_request *rdma_req,
+ struct spdk_nvmf_rdma_transport *rtransport)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+
+ rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
+ if (rdma_req->req.data_from_pool) {
+ rgroup = rqpair->poller->group;
+
+ spdk_nvmf_request_free_buffers(&rdma_req->req, &rgroup->group, &rtransport->transport);
+ }
+ nvmf_rdma_request_free_data(rdma_req, rtransport);
+ rdma_req->req.length = 0;
+ rdma_req->req.iovcnt = 0;
+ rdma_req->req.data = NULL;
+ rdma_req->rsp.wr.next = NULL;
+ rdma_req->data.wr.next = NULL;
+ memset(&rdma_req->req.dif, 0, sizeof(rdma_req->req.dif));
+ rqpair->qd--;
+
+ STAILQ_INSERT_HEAD(&rqpair->resources->free_queue, rdma_req, state_link);
+ rdma_req->state = RDMA_REQUEST_STATE_FREE;
+}
+
+bool
+nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_request *rdma_req)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct spdk_nvmf_rdma_device *device;
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+ struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl;
+ int rc;
+ struct spdk_nvmf_rdma_recv *rdma_recv;
+ enum spdk_nvmf_rdma_request_state prev_state;
+ bool progress = false;
+ int data_posted;
+ uint32_t num_blocks;
+
+ rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
+ device = rqpair->device;
+ rgroup = rqpair->poller->group;
+
+ assert(rdma_req->state != RDMA_REQUEST_STATE_FREE);
+
+ /* If the queue pair is in an error state, force the request to the completed state
+ * to release resources. */
+ if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) {
+ if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) {
+ STAILQ_REMOVE(&rgroup->group.pending_buf_queue, &rdma_req->req, spdk_nvmf_request, buf_link);
+ } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING) {
+ STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
+ } else if (rdma_req->state == RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING) {
+ STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
+ }
+ rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+ }
+
+ /* The loop here is to allow for several back-to-back state changes. */
+ do {
+ prev_state = rdma_req->state;
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state);
+
+ switch (rdma_req->state) {
+ case RDMA_REQUEST_STATE_FREE:
+ /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW
+ * to escape this state. */
+ break;
+ case RDMA_REQUEST_STATE_NEW:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+ rdma_recv = rdma_req->recv;
+
+ /* The first element of the SGL is the NVMe command */
+ rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr;
+ memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp));
+
+ if (rqpair->ibv_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) {
+ rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+ break;
+ }
+
+ if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&rdma_req->req, &rdma_req->req.dif.dif_ctx))) {
+ rdma_req->req.dif.dif_insert_or_strip = true;
+ }
+
+#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
+ rdma_req->rsp.wr.opcode = IBV_WR_SEND;
+ rdma_req->rsp.wr.imm_data = 0;
+#endif
+
+ /* The next state transition depends on the data transfer needs of this request. */
+ rdma_req->req.xfer = spdk_nvmf_req_get_xfer(&rdma_req->req);
+
+ /* If no data to transfer, ready to execute. */
+ if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) {
+ rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
+ break;
+ }
+
+ rdma_req->state = RDMA_REQUEST_STATE_NEED_BUFFER;
+ STAILQ_INSERT_TAIL(&rgroup->group.pending_buf_queue, &rdma_req->req, buf_link);
+ break;
+ case RDMA_REQUEST_STATE_NEED_BUFFER:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+
+ assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE);
+
+ if (&rdma_req->req != STAILQ_FIRST(&rgroup->group.pending_buf_queue)) {
+ /* This request needs to wait in line to obtain a buffer */
+ break;
+ }
+
+ /* Try to get a data buffer */
+ rc = nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req);
+ if (rc < 0) {
+ STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link);
+ rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
+ break;
+ }
+
+ if (!rdma_req->req.data) {
+ /* No buffers available. */
+ rgroup->stat.pending_data_buffer++;
+ break;
+ }
+
+ STAILQ_REMOVE_HEAD(&rgroup->group.pending_buf_queue, buf_link);
+
+ /* If data is transferring from host to controller and the data didn't
+ * arrive using in capsule data, we need to do a transfer from the host.
+ */
+ if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER &&
+ rdma_req->req.data_from_pool) {
+ STAILQ_INSERT_TAIL(&rqpair->pending_rdma_read_queue, rdma_req, state_link);
+ rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING;
+ break;
+ }
+
+ rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
+ break;
+ case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+
+ if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_read_queue)) {
+ /* This request needs to wait in line to perform RDMA */
+ break;
+ }
+ if (rqpair->current_send_depth + rdma_req->num_outstanding_data_wr > rqpair->max_send_depth
+ || rqpair->current_read_depth + rdma_req->num_outstanding_data_wr > rqpair->max_read_depth) {
+ /* We can only have so many WRs outstanding; we have to wait until some finish. */
+ rqpair->poller->stat.pending_rdma_read++;
+ break;
+ }
+
+ /* We have already verified that this request is the head of the queue. */
+ STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_read_queue, state_link);
+
+ rc = request_transfer_in(&rdma_req->req);
+ if (!rc) {
+ rdma_req->state = RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER;
+ } else {
+ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
+ }
+ break;
+ case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+ /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE
+ * to escape this state. */
+ break;
+ case RDMA_REQUEST_STATE_READY_TO_EXECUTE:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+
+ if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) {
+ if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+ /* generate DIF for write operation */
+ num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size);
+ assert(num_blocks > 0);
+
+ rc = spdk_dif_generate(rdma_req->req.iov, rdma_req->req.iovcnt,
+ num_blocks, &rdma_req->req.dif.dif_ctx);
+ if (rc != 0) {
+ SPDK_ERRLOG("DIF generation failed\n");
+ rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+ nvmf_rdma_start_disconnect(rqpair);
+ break;
+ }
+ }
+
+ assert(rdma_req->req.dif.elba_length >= rdma_req->req.length);
+ /* set extended length before IO operation */
+ rdma_req->req.length = rdma_req->req.dif.elba_length;
+ }
+
+ rdma_req->state = RDMA_REQUEST_STATE_EXECUTING;
+ spdk_nvmf_request_exec(&rdma_req->req);
+ break;
+ case RDMA_REQUEST_STATE_EXECUTING:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+ /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED
+ * to escape this state. */
+ break;
+ case RDMA_REQUEST_STATE_EXECUTED:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+ if (rsp->status.sc == SPDK_NVME_SC_SUCCESS &&
+ rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ STAILQ_INSERT_TAIL(&rqpair->pending_rdma_write_queue, rdma_req, state_link);
+ rdma_req->state = RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING;
+ } else {
+ rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
+ }
+ if (spdk_unlikely(rdma_req->req.dif.dif_insert_or_strip)) {
+ /* restore the original length */
+ rdma_req->req.length = rdma_req->req.dif.orig_length;
+
+ if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ struct spdk_dif_error error_blk;
+
+ num_blocks = SPDK_CEIL_DIV(rdma_req->req.dif.elba_length, rdma_req->req.dif.dif_ctx.block_size);
+
+ rc = spdk_dif_verify(rdma_req->req.iov, rdma_req->req.iovcnt, num_blocks,
+ &rdma_req->req.dif.dif_ctx, &error_blk);
+ if (rc) {
+ struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl;
+
+ SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", error_blk.err_type,
+ error_blk.err_offset);
+ rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR;
+ rsp->status.sc = nvmf_rdma_dif_error_to_compl_status(error_blk.err_type);
+ rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
+ STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req, spdk_nvmf_rdma_request, state_link);
+ }
+ }
+ }
+ break;
+ case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+
+ if (rdma_req != STAILQ_FIRST(&rqpair->pending_rdma_write_queue)) {
+ /* This request needs to wait in line to perform RDMA */
+ break;
+ }
+ if ((rqpair->current_send_depth + rdma_req->num_outstanding_data_wr + 1) >
+ rqpair->max_send_depth) {
+ /* We can only have so many WRs outstanding; we have to wait until some finish.
+ * +1 since each request has an additional wr in the resp. */
+ rqpair->poller->stat.pending_rdma_write++;
+ break;
+ }
+
+ /* We have already verified that this request is the head of the queue. */
+ STAILQ_REMOVE_HEAD(&rqpair->pending_rdma_write_queue, state_link);
+
+ /* The data transfer will be kicked off from
+ * RDMA_REQUEST_STATE_READY_TO_COMPLETE state.
+ */
+ rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
+ break;
+ case RDMA_REQUEST_STATE_READY_TO_COMPLETE:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+ rc = request_transfer_out(&rdma_req->req, &data_posted);
+ assert(rc == 0); /* No good way to handle this currently */
+ if (rc) {
+ rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+ } else {
+ rdma_req->state = data_posted ? RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST :
+ RDMA_REQUEST_STATE_COMPLETING;
+ }
+ break;
+ case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+ /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
+ * to escape this state. */
+ break;
+ case RDMA_REQUEST_STATE_COMPLETING:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+ /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED
+ * to escape this state. */
+ break;
+ case RDMA_REQUEST_STATE_COMPLETED:
+ spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0,
+ (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id);
+
+ rqpair->poller->stat.request_latency += spdk_get_ticks() - rdma_req->receive_tsc;
+ _nvmf_rdma_request_free(rdma_req, rtransport);
+ break;
+ case RDMA_REQUEST_NUM_STATES:
+ default:
+ assert(0);
+ break;
+ }
+
+ if (rdma_req->state != prev_state) {
+ progress = true;
+ }
+ } while (rdma_req->state != prev_state);
+
+ return progress;
+}
+
+/* Public API callbacks begin here */
+
+#define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128
+#define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128
+#define SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH 4096
+#define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 128
+#define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096
+#define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072
+#define SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE (SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE / SPDK_NVMF_MAX_SGL_ENTRIES)
+#define SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS 4095
+#define SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE 32
+#define SPDK_NVMF_RDMA_DEFAULT_NO_SRQ false
+#define SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP false
+#define SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG 100
+#define SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC 1
+
+static void
+nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts)
+{
+ opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH;
+ opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR;
+ opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE;
+ opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE;
+ opts->io_unit_size = SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE;
+ opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH;
+ opts->num_shared_buffers = SPDK_NVMF_RDMA_DEFAULT_NUM_SHARED_BUFFERS;
+ opts->buf_cache_size = SPDK_NVMF_RDMA_DEFAULT_BUFFER_CACHE_SIZE;
+ opts->max_srq_depth = SPDK_NVMF_RDMA_DEFAULT_SRQ_DEPTH;
+ opts->no_srq = SPDK_NVMF_RDMA_DEFAULT_NO_SRQ;
+ opts->dif_insert_or_strip = SPDK_NVMF_RDMA_DIF_INSERT_OR_STRIP;
+ opts->acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG;
+ opts->abort_timeout_sec = SPDK_NVMF_RDMA_DEFAULT_ABORT_TIMEOUT_SEC;
+}
+
+const struct spdk_mem_map_ops g_nvmf_rdma_map_ops = {
+ .notify_cb = nvmf_rdma_mem_notify,
+ .are_contiguous = nvmf_rdma_check_contiguous_entries
+};
+
+static int nvmf_rdma_destroy(struct spdk_nvmf_transport *transport);
+
+static struct spdk_nvmf_transport *
+nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts)
+{
+ int rc;
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_device *device, *tmp;
+ struct ibv_context **contexts;
+ uint32_t i;
+ int flag;
+ uint32_t sge_count;
+ uint32_t min_shared_buffers;
+ int max_device_sge = SPDK_NVMF_MAX_SGL_ENTRIES;
+ pthread_mutexattr_t attr;
+
+ rtransport = calloc(1, sizeof(*rtransport));
+ if (!rtransport) {
+ return NULL;
+ }
+
+ if (pthread_mutexattr_init(&attr)) {
+ SPDK_ERRLOG("pthread_mutexattr_init() failed\n");
+ free(rtransport);
+ return NULL;
+ }
+
+ if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) {
+ SPDK_ERRLOG("pthread_mutexattr_settype() failed\n");
+ pthread_mutexattr_destroy(&attr);
+ free(rtransport);
+ return NULL;
+ }
+
+ if (pthread_mutex_init(&rtransport->lock, &attr)) {
+ SPDK_ERRLOG("pthread_mutex_init() failed\n");
+ pthread_mutexattr_destroy(&attr);
+ free(rtransport);
+ return NULL;
+ }
+
+ pthread_mutexattr_destroy(&attr);
+
+ TAILQ_INIT(&rtransport->devices);
+ TAILQ_INIT(&rtransport->ports);
+ TAILQ_INIT(&rtransport->poll_groups);
+
+ rtransport->transport.ops = &spdk_nvmf_transport_rdma;
+
+ SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n"
+ " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
+ " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
+ " in_capsule_data_size=%d, max_aq_depth=%d,\n"
+ " num_shared_buffers=%d, max_srq_depth=%d, no_srq=%d,"
+ " acceptor_backlog=%d, abort_timeout_sec=%d\n",
+ opts->max_queue_depth,
+ opts->max_io_size,
+ opts->max_qpairs_per_ctrlr - 1,
+ opts->io_unit_size,
+ opts->in_capsule_data_size,
+ opts->max_aq_depth,
+ opts->num_shared_buffers,
+ opts->max_srq_depth,
+ opts->no_srq,
+ opts->acceptor_backlog,
+ opts->abort_timeout_sec);
+
+ /* I/O unit size cannot be larger than max I/O size */
+ if (opts->io_unit_size > opts->max_io_size) {
+ opts->io_unit_size = opts->max_io_size;
+ }
+
+ if (opts->acceptor_backlog <= 0) {
+ SPDK_ERRLOG("The acceptor backlog cannot be less than 1, setting to the default value of (%d).\n",
+ SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG);
+ opts->acceptor_backlog = SPDK_NVMF_RDMA_ACCEPTOR_BACKLOG;
+ }
+
+ if (opts->num_shared_buffers < (SPDK_NVMF_MAX_SGL_ENTRIES * 2)) {
+ SPDK_ERRLOG("The number of shared data buffers (%d) is less than"
+ "the minimum number required to guarantee that forward progress can be made (%d)\n",
+ opts->num_shared_buffers, (SPDK_NVMF_MAX_SGL_ENTRIES * 2));
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size;
+ if (min_shared_buffers > opts->num_shared_buffers) {
+ SPDK_ERRLOG("There are not enough buffers to satisfy"
+ "per-poll group caches for each thread. (%" PRIu32 ")"
+ "supplied. (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers);
+ SPDK_ERRLOG("Please specify a larger number of shared buffers\n");
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ sge_count = opts->max_io_size / opts->io_unit_size;
+ if (sge_count > NVMF_DEFAULT_TX_SGE) {
+ SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size);
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ rtransport->event_channel = rdma_create_event_channel();
+ if (rtransport->event_channel == NULL) {
+ SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno));
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ flag = fcntl(rtransport->event_channel->fd, F_GETFL);
+ if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n",
+ rtransport->event_channel->fd, spdk_strerror(errno));
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ rtransport->data_wr_pool = spdk_mempool_create("spdk_nvmf_rdma_wr_data",
+ opts->max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES,
+ sizeof(struct spdk_nvmf_rdma_request_data),
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+ if (!rtransport->data_wr_pool) {
+ SPDK_ERRLOG("Unable to allocate work request pool for poll group\n");
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ contexts = rdma_get_devices(NULL);
+ if (contexts == NULL) {
+ SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno);
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ i = 0;
+ rc = 0;
+ while (contexts[i] != NULL) {
+ device = calloc(1, sizeof(*device));
+ if (!device) {
+ SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n");
+ rc = -ENOMEM;
+ break;
+ }
+ device->context = contexts[i];
+ rc = ibv_query_device(device->context, &device->attr);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
+ free(device);
+ break;
+
+ }
+
+ max_device_sge = spdk_min(max_device_sge, device->attr.max_sge);
+
+#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL
+ if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) {
+ SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,");
+ SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id);
+ }
+
+ /**
+ * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE.
+ * The Soft-RoCE RXE driver does not currently support send with invalidate,
+ * but incorrectly reports that it does. There are changes making their way
+ * through the kernel now that will enable this feature. When they are merged,
+ * we can conditionally enable this feature.
+ *
+ * TODO: enable this for versions of the kernel rxe driver that support it.
+ */
+ if (device->attr.vendor_id == 0) {
+ device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS);
+ }
+#endif
+
+ /* set up device context async ev fd as NON_BLOCKING */
+ flag = fcntl(device->context->async_fd, F_GETFL);
+ rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n");
+ free(device);
+ break;
+ }
+
+ TAILQ_INSERT_TAIL(&rtransport->devices, device, link);
+ i++;
+
+ if (g_nvmf_hooks.get_ibv_pd) {
+ device->pd = g_nvmf_hooks.get_ibv_pd(NULL, device->context);
+ } else {
+ device->pd = ibv_alloc_pd(device->context);
+ }
+
+ if (!device->pd) {
+ SPDK_ERRLOG("Unable to allocate protection domain.\n");
+ rc = -ENOMEM;
+ break;
+ }
+
+ assert(device->map == NULL);
+
+ device->map = spdk_mem_map_alloc(0, &g_nvmf_rdma_map_ops, device->pd);
+ if (!device->map) {
+ SPDK_ERRLOG("Unable to allocate memory map for listen address\n");
+ rc = -ENOMEM;
+ break;
+ }
+
+ assert(device->map != NULL);
+ assert(device->pd != NULL);
+ }
+ rdma_free_devices(contexts);
+
+ if (opts->io_unit_size * max_device_sge < opts->max_io_size) {
+ /* divide and round up. */
+ opts->io_unit_size = (opts->max_io_size + max_device_sge - 1) / max_device_sge;
+
+ /* round up to the nearest 4k. */
+ opts->io_unit_size = (opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT - 1) & ~NVMF_DATA_BUFFER_MASK;
+
+ opts->io_unit_size = spdk_max(opts->io_unit_size, SPDK_NVMF_RDMA_MIN_IO_BUFFER_SIZE);
+ SPDK_NOTICELOG("Adjusting the io unit size to fit the device's maximum I/O size. New I/O unit size %u\n",
+ opts->io_unit_size);
+ }
+
+ if (rc < 0) {
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ /* Set up poll descriptor array to monitor events from RDMA and IB
+ * in a single poll syscall
+ */
+ rtransport->npoll_fds = i + 1;
+ i = 0;
+ rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd));
+ if (rtransport->poll_fds == NULL) {
+ SPDK_ERRLOG("poll_fds allocation failed\n");
+ nvmf_rdma_destroy(&rtransport->transport);
+ return NULL;
+ }
+
+ rtransport->poll_fds[i].fd = rtransport->event_channel->fd;
+ rtransport->poll_fds[i++].events = POLLIN;
+
+ TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
+ rtransport->poll_fds[i].fd = device->context->async_fd;
+ rtransport->poll_fds[i++].events = POLLIN;
+ }
+
+ return &rtransport->transport;
+}
+
+static int
+nvmf_rdma_destroy(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_port *port, *port_tmp;
+ struct spdk_nvmf_rdma_device *device, *device_tmp;
+
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+
+ TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) {
+ TAILQ_REMOVE(&rtransport->ports, port, link);
+ rdma_destroy_id(port->id);
+ free(port);
+ }
+
+ if (rtransport->poll_fds != NULL) {
+ free(rtransport->poll_fds);
+ }
+
+ if (rtransport->event_channel != NULL) {
+ rdma_destroy_event_channel(rtransport->event_channel);
+ }
+
+ TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) {
+ TAILQ_REMOVE(&rtransport->devices, device, link);
+ if (device->map) {
+ spdk_mem_map_free(&device->map);
+ }
+ if (device->pd) {
+ if (!g_nvmf_hooks.get_ibv_pd) {
+ ibv_dealloc_pd(device->pd);
+ }
+ }
+ free(device);
+ }
+
+ if (rtransport->data_wr_pool != NULL) {
+ if (spdk_mempool_count(rtransport->data_wr_pool) !=
+ (transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES)) {
+ SPDK_ERRLOG("transport wr pool count is %zu but should be %u\n",
+ spdk_mempool_count(rtransport->data_wr_pool),
+ transport->opts.max_queue_depth * SPDK_NVMF_MAX_SGL_ENTRIES);
+ }
+ }
+
+ spdk_mempool_free(rtransport->data_wr_pool);
+
+ pthread_mutex_destroy(&rtransport->lock);
+ free(rtransport);
+
+ return 0;
+}
+
+static int
+nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id,
+ struct spdk_nvme_transport_id *trid,
+ bool peer);
+
+static int
+nvmf_rdma_listen(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_device *device;
+ struct spdk_nvmf_rdma_port *port;
+ struct addrinfo *res;
+ struct addrinfo hints;
+ int family;
+ int rc;
+
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+ assert(rtransport->event_channel != NULL);
+
+ pthread_mutex_lock(&rtransport->lock);
+ port = calloc(1, sizeof(*port));
+ if (!port) {
+ SPDK_ERRLOG("Port allocation failed\n");
+ pthread_mutex_unlock(&rtransport->lock);
+ return -ENOMEM;
+ }
+
+ port->trid = trid;
+
+ switch (trid->adrfam) {
+ case SPDK_NVMF_ADRFAM_IPV4:
+ family = AF_INET;
+ break;
+ case SPDK_NVMF_ADRFAM_IPV6:
+ family = AF_INET6;
+ break;
+ default:
+ SPDK_ERRLOG("Unhandled ADRFAM %d\n", trid->adrfam);
+ free(port);
+ pthread_mutex_unlock(&rtransport->lock);
+ return -EINVAL;
+ }
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = family;
+ hints.ai_flags = AI_NUMERICSERV;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_protocol = 0;
+
+ rc = getaddrinfo(trid->traddr, trid->trsvcid, &hints, &res);
+ if (rc) {
+ SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc);
+ free(port);
+ pthread_mutex_unlock(&rtransport->lock);
+ return -EINVAL;
+ }
+
+ rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP);
+ if (rc < 0) {
+ SPDK_ERRLOG("rdma_create_id() failed\n");
+ freeaddrinfo(res);
+ free(port);
+ pthread_mutex_unlock(&rtransport->lock);
+ return rc;
+ }
+
+ rc = rdma_bind_addr(port->id, res->ai_addr);
+ freeaddrinfo(res);
+
+ if (rc < 0) {
+ SPDK_ERRLOG("rdma_bind_addr() failed\n");
+ rdma_destroy_id(port->id);
+ free(port);
+ pthread_mutex_unlock(&rtransport->lock);
+ return rc;
+ }
+
+ if (!port->id->verbs) {
+ SPDK_ERRLOG("ibv_context is null\n");
+ rdma_destroy_id(port->id);
+ free(port);
+ pthread_mutex_unlock(&rtransport->lock);
+ return -1;
+ }
+
+ rc = rdma_listen(port->id, transport->opts.acceptor_backlog);
+ if (rc < 0) {
+ SPDK_ERRLOG("rdma_listen() failed\n");
+ rdma_destroy_id(port->id);
+ free(port);
+ pthread_mutex_unlock(&rtransport->lock);
+ return rc;
+ }
+
+ TAILQ_FOREACH(device, &rtransport->devices, link) {
+ if (device->context == port->id->verbs) {
+ port->device = device;
+ break;
+ }
+ }
+ if (!port->device) {
+ SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n",
+ port->id->verbs);
+ rdma_destroy_id(port->id);
+ free(port);
+ pthread_mutex_unlock(&rtransport->lock);
+ return -EINVAL;
+ }
+
+ SPDK_NOTICELOG("*** NVMe/RDMA Target Listening on %s port %s ***\n",
+ trid->traddr, trid->trsvcid);
+
+ TAILQ_INSERT_TAIL(&rtransport->ports, port, link);
+ pthread_mutex_unlock(&rtransport->lock);
+ return 0;
+}
+
+static void
+nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_port *port, *tmp;
+
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+
+ pthread_mutex_lock(&rtransport->lock);
+ TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) {
+ if (spdk_nvme_transport_id_compare(port->trid, trid) == 0) {
+ TAILQ_REMOVE(&rtransport->ports, port, link);
+ rdma_destroy_id(port->id);
+ free(port);
+ break;
+ }
+ }
+
+ pthread_mutex_unlock(&rtransport->lock);
+}
+
+static void
+nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_qpair *rqpair, bool drain)
+{
+ struct spdk_nvmf_request *req, *tmp;
+ struct spdk_nvmf_rdma_request *rdma_req, *req_tmp;
+ struct spdk_nvmf_rdma_resources *resources;
+
+ /* We process I/O in the data transfer pending queue at the highest priority. RDMA reads first */
+ STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_read_queue, state_link, req_tmp) {
+ if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
+ break;
+ }
+ }
+
+ /* Then RDMA writes since reads have stronger restrictions than writes */
+ STAILQ_FOREACH_SAFE(rdma_req, &rqpair->pending_rdma_write_queue, state_link, req_tmp) {
+ if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
+ break;
+ }
+ }
+
+ /* The second highest priority is I/O waiting on memory buffers. */
+ STAILQ_FOREACH_SAFE(req, &rqpair->poller->group->group.pending_buf_queue, buf_link, tmp) {
+ rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
+ if (nvmf_rdma_request_process(rtransport, rdma_req) == false && drain == false) {
+ break;
+ }
+ }
+
+ resources = rqpair->resources;
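+ /* Lowest priority: pair newly received commands from the incoming queue with free request slots. */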
+ while (!STAILQ_EMPTY(&resources->free_queue) && !STAILQ_EMPTY(&resources->incoming_queue)) {
+ rdma_req = STAILQ_FIRST(&resources->free_queue);
+ STAILQ_REMOVE_HEAD(&resources->free_queue, state_link);
+ rdma_req->recv = STAILQ_FIRST(&resources->incoming_queue);
+ STAILQ_REMOVE_HEAD(&resources->incoming_queue, link);
+
+ if (rqpair->srq != NULL) {
+ rdma_req->req.qpair = &rdma_req->recv->qpair->qpair;
+ rdma_req->recv->qpair->qd++;
+ } else {
+ rqpair->qd++;
+ }
+
+ rdma_req->receive_tsc = rdma_req->recv->receive_tsc;
+ rdma_req->state = RDMA_REQUEST_STATE_NEW;
+ if (nvmf_rdma_request_process(rtransport, rdma_req) == false) {
+ break;
+ }
+ }
+ if (!STAILQ_EMPTY(&resources->incoming_queue) && STAILQ_EMPTY(&resources->free_queue)) {
+ rqpair->poller->stat.pending_free_request++;
+ }
+}
+
+static void
+_nvmf_rdma_qpair_disconnect(void *ctx)
+{
+ struct spdk_nvmf_qpair *qpair = ctx;
+
+ spdk_nvmf_qpair_disconnect(qpair, NULL, NULL);
+}
+
+static void
+_nvmf_rdma_try_disconnect(void *ctx)
+{
+ struct spdk_nvmf_qpair *qpair = ctx;
+ struct spdk_nvmf_poll_group *group;
+
+ /* Read the group out of the qpair. This is normally set and accessed only from
+ * the thread that created the group. Here, we're not on that thread necessarily.
+ * The data member qpair->group begins its life as NULL and then is assigned to
+ * a pointer and never changes. So fortunately reading this and checking for
+ * non-NULL is thread safe in the x86_64 memory model. */
+ group = qpair->group;
+
+ if (group == NULL) {
+ /* The qpair hasn't been assigned to a group yet, so we can't
+ * process a disconnect. Send a message to ourself and try again. */
+ spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_try_disconnect, qpair);
+ return;
+ }
+
+ spdk_thread_send_msg(group->thread, _nvmf_rdma_qpair_disconnect, qpair);
+}
+
+static inline void
+nvmf_rdma_start_disconnect(struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ if (!__atomic_test_and_set(&rqpair->disconnect_started, __ATOMIC_RELAXED)) {
+ _nvmf_rdma_try_disconnect(&rqpair->qpair);
+ }
+}
+
+static void nvmf_rdma_destroy_drained_qpair(void *ctx)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair = ctx;
+ struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
+ struct spdk_nvmf_rdma_transport, transport);
+
+ /* In the non-SRQ path, we will reach rqpair->max_queue_depth. In the SRQ path, we will get the last_wqe event. */
+ if (rqpair->current_send_depth != 0) {
+ return;
+ }
+
+ if (rqpair->srq == NULL && rqpair->current_recv_depth != rqpair->max_queue_depth) {
+ return;
+ }
+
+ if (rqpair->srq != NULL && rqpair->last_wqe_reached == false) {
+ return;
+ }
+
+ nvmf_rdma_qpair_process_pending(rtransport, rqpair, true);
+
+ /* Qpair will be destroyed after nvmf layer closes this qpair */
+ if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ERROR) {
+ return;
+ }
+
+ nvmf_rdma_qpair_destroy(rqpair);
+}
+
+
+static int
+nvmf_rdma_disconnect(struct rdma_cm_event *evt)
+{
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_rdma_qpair *rqpair;
+
+ if (evt->id == NULL) {
+ SPDK_ERRLOG("disconnect request: missing cm_id\n");
+ return -1;
+ }
+
+ qpair = evt->id->context;
+ if (qpair == NULL) {
+ SPDK_ERRLOG("disconnect request: no active connection\n");
+ return -1;
+ }
+
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0);
+
+ nvmf_rdma_start_disconnect(rqpair);
+
+ return 0;
+}
+
+#ifdef DEBUG
+static const char *CM_EVENT_STR[] = {
+ "RDMA_CM_EVENT_ADDR_RESOLVED",
+ "RDMA_CM_EVENT_ADDR_ERROR",
+ "RDMA_CM_EVENT_ROUTE_RESOLVED",
+ "RDMA_CM_EVENT_ROUTE_ERROR",
+ "RDMA_CM_EVENT_CONNECT_REQUEST",
+ "RDMA_CM_EVENT_CONNECT_RESPONSE",
+ "RDMA_CM_EVENT_CONNECT_ERROR",
+ "RDMA_CM_EVENT_UNREACHABLE",
+ "RDMA_CM_EVENT_REJECTED",
+ "RDMA_CM_EVENT_ESTABLISHED",
+ "RDMA_CM_EVENT_DISCONNECTED",
+ "RDMA_CM_EVENT_DEVICE_REMOVAL",
+ "RDMA_CM_EVENT_MULTICAST_JOIN",
+ "RDMA_CM_EVENT_MULTICAST_ERROR",
+ "RDMA_CM_EVENT_ADDR_CHANGE",
+ "RDMA_CM_EVENT_TIMEWAIT_EXIT"
+};
+#endif /* DEBUG */
+
+static void
+nvmf_rdma_disconnect_qpairs_on_port(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_port *port)
+{
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+ struct spdk_nvmf_rdma_poller *rpoller;
+ struct spdk_nvmf_rdma_qpair *rqpair;
+
+ TAILQ_FOREACH(rgroup, &rtransport->poll_groups, link) {
+ TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
+ TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) {
+ if (rqpair->listen_id == port->id) {
+ nvmf_rdma_start_disconnect(rqpair);
+ }
+ }
+ }
+ }
+}
+
+static bool
+nvmf_rdma_handle_cm_event_addr_change(struct spdk_nvmf_transport *transport,
+ struct rdma_cm_event *event)
+{
+ const struct spdk_nvme_transport_id *trid;
+ struct spdk_nvmf_rdma_port *port;
+ struct spdk_nvmf_rdma_transport *rtransport;
+ bool event_acked = false;
+
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+ TAILQ_FOREACH(port, &rtransport->ports, link) {
+ if (port->id == event->id) {
+ SPDK_ERRLOG("ADDR_CHANGE: IP %s:%s migrated\n", port->trid->traddr, port->trid->trsvcid);
+ rdma_ack_cm_event(event);
+ event_acked = true;
+ trid = port->trid;
+ break;
+ }
+ }
+
+ if (event_acked) {
+ nvmf_rdma_disconnect_qpairs_on_port(rtransport, port);
+
+ nvmf_rdma_stop_listen(transport, trid);
+ nvmf_rdma_listen(transport, trid);
+ }
+
+ return event_acked;
+}
+
+static void
+nvmf_rdma_handle_cm_event_port_removal(struct spdk_nvmf_transport *transport,
+ struct rdma_cm_event *event)
+{
+ struct spdk_nvmf_rdma_port *port;
+ struct spdk_nvmf_rdma_transport *rtransport;
+
+ port = event->id->context;
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+
+ SPDK_NOTICELOG("Port %s:%s is being removed\n", port->trid->traddr, port->trid->trsvcid);
+
+ nvmf_rdma_disconnect_qpairs_on_port(rtransport, port);
+
+ rdma_ack_cm_event(event);
+
+ while (spdk_nvmf_transport_stop_listen(transport, port->trid) == 0) {
+ ;
+ }
+}
+
+static void
+nvmf_process_cm_event(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct rdma_cm_event *event;
+ int rc;
+ bool event_acked;
+
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+
+ if (rtransport->event_channel == NULL) {
+ return;
+ }
+
+ while (1) {
+ event_acked = false;
+ rc = rdma_get_cm_event(rtransport->event_channel, &event);
+ if (rc) {
+ if (errno != EAGAIN && errno != EWOULDBLOCK) {
+ SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno));
+ }
+ break;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]);
+
+ spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event);
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ /* No action required. The target never attempts to resolve routes. */
+ break;
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ rc = nvmf_rdma_connect(transport, event);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc);
+ break;
+ }
+ break;
+ case RDMA_CM_EVENT_CONNECT_RESPONSE:
+ /* The target never initiates a new connection. So this will not occur. */
+ break;
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ /* Can this happen? The docs say it can, but not sure what causes it. */
+ break;
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ /* These only occur on the client side. */
+ break;
+ case RDMA_CM_EVENT_ESTABLISHED:
+ /* TODO: Should we be waiting for this event anywhere? */
+ break;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ rc = nvmf_rdma_disconnect(event);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
+ break;
+ }
+ break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ /* In case of device removal, the kernel IB stack triggers IBV_EVENT_DEVICE_FATAL,
+ * which raises RDMA_CM_EVENT_DEVICE_REMOVAL on all cma_ids.
+ * Once these events are delivered to SPDK, we must release all IB resources and
+ * must not call any ibv_query/modify/create functions. We may only call
+ * ibv_destroy* functions to release the user space memory allocated by IB. All kernel
+ * resources have already been cleaned up. */
+ if (event->id->qp) {
+ /* If rdma_cm event has a valid `qp` pointer then the event refers to the
+ * corresponding qpair. Otherwise the event refers to a listening device */
+ rc = nvmf_rdma_disconnect(event);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc);
+ break;
+ }
+ } else {
+ nvmf_rdma_handle_cm_event_port_removal(transport, event);
+ event_acked = true;
+ }
+ break;
+ case RDMA_CM_EVENT_MULTICAST_JOIN:
+ case RDMA_CM_EVENT_MULTICAST_ERROR:
+ /* Multicast is not used */
+ break;
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ event_acked = nvmf_rdma_handle_cm_event_addr_change(transport, event);
+ break;
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ /* For now, do nothing. The target never re-uses queue pairs. */
+ break;
+ default:
+ SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event);
+ break;
+ }
+ if (!event_acked) {
+ rdma_ack_cm_event(event);
+ }
+ }
+}
+
+static void
+nvmf_rdma_handle_qp_fatal(struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ nvmf_rdma_update_ibv_state(rqpair);
+ nvmf_rdma_start_disconnect(rqpair);
+}
+
+static void
+nvmf_rdma_handle_last_wqe_reached(struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ rqpair->last_wqe_reached = true;
+ nvmf_rdma_destroy_drained_qpair(rqpair);
+}
+
+static void
+nvmf_rdma_handle_sq_drained(struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ nvmf_rdma_start_disconnect(rqpair);
+}
+
+static void
+nvmf_rdma_qpair_process_ibv_event(void *ctx)
+{
+ struct spdk_nvmf_rdma_ibv_event_ctx *event_ctx = ctx;
+
+ if (event_ctx->rqpair) {
+ STAILQ_REMOVE(&event_ctx->rqpair->ibv_events, event_ctx, spdk_nvmf_rdma_ibv_event_ctx, link);
+ if (event_ctx->cb_fn) {
+ event_ctx->cb_fn(event_ctx->rqpair);
+ }
+ }
+ free(event_ctx);
+}
+
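+/* Forward an IBV async event callback to the thread that owns the qpair: the
+ * poll group thread if the qpair has one, otherwise the thread of its destruct
+ * channel. The context is tracked on rqpair->ibv_events, which lets qpair
+ * teardown (not shown here) invalidate contexts that have not been processed yet. */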
+static int
+nvmf_rdma_send_qpair_async_event(struct spdk_nvmf_rdma_qpair *rqpair,
+ spdk_nvmf_rdma_qpair_ibv_event fn)
+{
+ struct spdk_nvmf_rdma_ibv_event_ctx *ctx;
+ struct spdk_thread *thr = NULL;
+ int rc;
+
+ if (rqpair->qpair.group) {
+ thr = rqpair->qpair.group->thread;
+ } else if (rqpair->destruct_channel) {
+ thr = spdk_io_channel_get_thread(rqpair->destruct_channel);
+ }
+
+ if (!thr) {
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "rqpair %p has no thread\n", rqpair);
+ return -EINVAL;
+ }
+
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ return -ENOMEM;
+ }
+
+ ctx->rqpair = rqpair;
+ ctx->cb_fn = fn;
+ STAILQ_INSERT_TAIL(&rqpair->ibv_events, ctx, link);
+
+ rc = spdk_thread_send_msg(thr, nvmf_rdma_qpair_process_ibv_event, ctx);
+ if (rc) {
+ STAILQ_REMOVE(&rqpair->ibv_events, ctx, spdk_nvmf_rdma_ibv_event_ctx, link);
+ free(ctx);
+ }
+
+ return rc;
+}
+
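+/* Handle one asynchronous event from an IB device. QP fatal, last-WQE-reached
+ * and SQ-drained events are forwarded to the thread owning the affected qpair;
+ * the remaining QP-scoped events only refresh the cached IBV state, and
+ * device-scoped events are just logged and traced. */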
+static void
+nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device)
+{
+ int rc;
+ struct spdk_nvmf_rdma_qpair *rqpair = NULL;
+ struct ibv_async_event event;
+
+ rc = ibv_get_async_event(device->context, &event);
+
+ if (rc) {
+ SPDK_ERRLOG("Failed to get async_event (%d): %s\n",
+ errno, spdk_strerror(errno));
+ return;
+ }
+
+ switch (event.event_type) {
+ case IBV_EVENT_QP_FATAL:
+ rqpair = event.element.qp->qp_context;
+ SPDK_ERRLOG("Fatal event received for rqpair %p\n", rqpair);
+ spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0,
+ (uintptr_t)rqpair->cm_id, event.event_type);
+ rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_qp_fatal);
+ if (rc) {
+ SPDK_WARNLOG("Failed to send QP_FATAL event. rqpair %p, err %d\n", rqpair, rc);
+ nvmf_rdma_handle_qp_fatal(rqpair);
+ }
+ break;
+ case IBV_EVENT_QP_LAST_WQE_REACHED:
+ /* This event only occurs for shared receive queues. */
+ rqpair = event.element.qp->qp_context;
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last WQE reached event received for rqpair %p\n", rqpair);
+ rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_last_wqe_reached);
+ if (rc) {
+ SPDK_WARNLOG("Failed to send LAST_WQE_REACHED event. rqpair %p, err %d\n", rqpair, rc);
+ rqpair->last_wqe_reached = true;
+ }
+ break;
+ case IBV_EVENT_SQ_DRAINED:
+ /* This event occurs frequently in both error and non-error states.
+ * Check if the qpair is in an error state before sending a message. */
+ rqpair = event.element.qp->qp_context;
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Last sq drained event received for rqpair %p\n", rqpair);
+ spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0,
+ (uintptr_t)rqpair->cm_id, event.event_type);
+ if (nvmf_rdma_update_ibv_state(rqpair) == IBV_QPS_ERR) {
+ rc = nvmf_rdma_send_qpair_async_event(rqpair, nvmf_rdma_handle_sq_drained);
+ if (rc) {
+ SPDK_WARNLOG("Failed to send SQ_DRAINED event. rqpair %p, err %d\n", rqpair, rc);
+ nvmf_rdma_handle_sq_drained(rqpair);
+ }
+ }
+ break;
+ case IBV_EVENT_QP_REQ_ERR:
+ case IBV_EVENT_QP_ACCESS_ERR:
+ case IBV_EVENT_COMM_EST:
+ case IBV_EVENT_PATH_MIG:
+ case IBV_EVENT_PATH_MIG_ERR:
+ SPDK_NOTICELOG("Async event: %s\n",
+ ibv_event_type_str(event.event_type));
+ rqpair = event.element.qp->qp_context;
+ spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0,
+ (uintptr_t)rqpair->cm_id, event.event_type);
+ nvmf_rdma_update_ibv_state(rqpair);
+ break;
+ case IBV_EVENT_CQ_ERR:
+ case IBV_EVENT_DEVICE_FATAL:
+ case IBV_EVENT_PORT_ACTIVE:
+ case IBV_EVENT_PORT_ERR:
+ case IBV_EVENT_LID_CHANGE:
+ case IBV_EVENT_PKEY_CHANGE:
+ case IBV_EVENT_SM_CHANGE:
+ case IBV_EVENT_SRQ_ERR:
+ case IBV_EVENT_SRQ_LIMIT_REACHED:
+ case IBV_EVENT_CLIENT_REREGISTER:
+ case IBV_EVENT_GID_CHANGE:
+ default:
+ SPDK_NOTICELOG("Async event: %s\n",
+ ibv_event_type_str(event.event_type));
+ spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type);
+ break;
+ }
+ ibv_ack_async_event(&event);
+}
+
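+/* Accept-path poller for the transport: poll_fds[0] is the RDMA CM event
+ * channel, and the following descriptors are the async event fds of each IB
+ * device in rtransport->devices order. Returns the poll() result, i.e. the
+ * number of descriptors with pending events, or 0 when nothing is ready. */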
+static uint32_t
+nvmf_rdma_accept(struct spdk_nvmf_transport *transport)
+{
+ int nfds, i = 0;
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_device *device, *tmp;
+ uint32_t count;
+
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+ count = nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0);
+
+ if (nfds <= 0) {
+ return 0;
+ }
+
+ /* The first poll descriptor is RDMA CM event */
+ if (rtransport->poll_fds[i++].revents & POLLIN) {
+ nvmf_process_cm_event(transport);
+ nfds--;
+ }
+
+ if (nfds == 0) {
+ return count;
+ }
+
+ /* Second and subsequent poll descriptors are IB async events */
+ TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) {
+ if (rtransport->poll_fds[i++].revents & POLLIN) {
+ nvmf_process_ib_event(device);
+ nfds--;
+ }
+ }
+ /* Check that all flagged fds have been served */
+ assert(nfds == 0);
+
+ return count;
+}
+
+static void
+nvmf_rdma_cdata_init(struct spdk_nvmf_transport *transport, struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_ctrlr_data *cdata)
+{
+ cdata->nvmf_specific.msdbd = SPDK_NVMF_MAX_SGL_ENTRIES;
+
+ /* Disable in-capsule data transfer for the RDMA controller when dif_insert_or_strip is enabled,
+ since in-capsule data only works with NVMe drives that support the SGL memory layout. */
+ if (transport->opts.dif_insert_or_strip) {
+ cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16;
+ }
+}
+
+static void
+nvmf_rdma_discover(struct spdk_nvmf_transport *transport,
+ struct spdk_nvme_transport_id *trid,
+ struct spdk_nvmf_discovery_log_page_entry *entry)
+{
+ entry->trtype = SPDK_NVMF_TRTYPE_RDMA;
+ entry->adrfam = trid->adrfam;
+ entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED;
+
+ spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' ');
+ spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' ');
+
+ entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED;
+ entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE;
+ entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM;
+}
+
+static void
+nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group);
+
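+/* Create a poll group with one poller per RDMA device. When SRQs are enabled
+ * (opts.no_srq == false) and the device still has SRQ capacity, the poller
+ * also gets a shared receive queue plus pre-allocated receive resources, and
+ * its completion queue is sized from max_srq_depth; otherwise the default CQ
+ * size is used. The new group is appended to the connection scheduler. */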
+static struct spdk_nvmf_transport_poll_group *
+nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+ struct spdk_nvmf_rdma_poller *poller;
+ struct spdk_nvmf_rdma_device *device;
+ struct ibv_srq_init_attr srq_init_attr;
+ struct spdk_nvmf_rdma_resource_opts opts;
+ int num_cqe;
+
+ rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport);
+
+ rgroup = calloc(1, sizeof(*rgroup));
+ if (!rgroup) {
+ return NULL;
+ }
+
+ TAILQ_INIT(&rgroup->pollers);
+ STAILQ_INIT(&rgroup->retired_bufs);
+
+ pthread_mutex_lock(&rtransport->lock);
+ TAILQ_FOREACH(device, &rtransport->devices, link) {
+ poller = calloc(1, sizeof(*poller));
+ if (!poller) {
+ SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n");
+ nvmf_rdma_poll_group_destroy(&rgroup->group);
+ pthread_mutex_unlock(&rtransport->lock);
+ return NULL;
+ }
+
+ poller->device = device;
+ poller->group = rgroup;
+
+ TAILQ_INIT(&poller->qpairs);
+ STAILQ_INIT(&poller->qpairs_pending_send);
+ STAILQ_INIT(&poller->qpairs_pending_recv);
+
+ TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link);
+ if (transport->opts.no_srq == false && device->num_srq < device->attr.max_srq) {
+ poller->max_srq_depth = transport->opts.max_srq_depth;
+
+ device->num_srq++;
+ memset(&srq_init_attr, 0, sizeof(struct ibv_srq_init_attr));
+ srq_init_attr.attr.max_wr = poller->max_srq_depth;
+ srq_init_attr.attr.max_sge = spdk_min(device->attr.max_sge, NVMF_DEFAULT_RX_SGE);
+ poller->srq = ibv_create_srq(device->pd, &srq_init_attr);
+ if (!poller->srq) {
+ SPDK_ERRLOG("Unable to create shared receive queue, errno %d\n", errno);
+ nvmf_rdma_poll_group_destroy(&rgroup->group);
+ pthread_mutex_unlock(&rtransport->lock);
+ return NULL;
+ }
+
+ opts.qp = poller->srq;
+ opts.pd = device->pd;
+ opts.qpair = NULL;
+ opts.shared = true;
+ opts.max_queue_depth = poller->max_srq_depth;
+ opts.in_capsule_data_size = transport->opts.in_capsule_data_size;
+
+ poller->resources = nvmf_rdma_resources_create(&opts);
+ if (!poller->resources) {
+ SPDK_ERRLOG("Unable to allocate resources for shared receive queue.\n");
+ nvmf_rdma_poll_group_destroy(&rgroup->group);
+ pthread_mutex_unlock(&rtransport->lock);
+ return NULL;
+ }
+ }
+
+ /*
+ * When using an SRQ, we can limit the completion queue size at startup.
+ * The size follows from:
+ * num_cqe = num_recv + num_data_wr + num_send_wr,
+ * where num_recv = num_data_wr = num_send_wr = poller->max_srq_depth.
+ */
+ if (poller->srq) {
+ num_cqe = poller->max_srq_depth * 3;
+ } else {
+ num_cqe = DEFAULT_NVMF_RDMA_CQ_SIZE;
+ }
+
+ poller->cq = ibv_create_cq(device->context, num_cqe, poller, NULL, 0);
+ if (!poller->cq) {
+ SPDK_ERRLOG("Unable to create completion queue\n");
+ nvmf_rdma_poll_group_destroy(&rgroup->group);
+ pthread_mutex_unlock(&rtransport->lock);
+ return NULL;
+ }
+ poller->num_cqe = num_cqe;
+ }
+
+ TAILQ_INSERT_TAIL(&rtransport->poll_groups, rgroup, link);
+ if (rtransport->conn_sched.next_admin_pg == NULL) {
+ rtransport->conn_sched.next_admin_pg = rgroup;
+ rtransport->conn_sched.next_io_pg = rgroup;
+ }
+
+ pthread_mutex_unlock(&rtransport->lock);
+ return &rgroup->group;
+}
+
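+/* Round-robin connection scheduling: admin qpairs (qid 0) and I/O qpairs
+ * advance separate cursors, so the two kinds of queues are spread across the
+ * poll groups independently of each other. */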
+static struct spdk_nvmf_transport_poll_group *
+nvmf_rdma_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_poll_group **pg;
+ struct spdk_nvmf_transport_poll_group *result;
+
+ rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
+
+ pthread_mutex_lock(&rtransport->lock);
+
+ if (TAILQ_EMPTY(&rtransport->poll_groups)) {
+ pthread_mutex_unlock(&rtransport->lock);
+ return NULL;
+ }
+
+ if (qpair->qid == 0) {
+ pg = &rtransport->conn_sched.next_admin_pg;
+ } else {
+ pg = &rtransport->conn_sched.next_io_pg;
+ }
+
+ assert(*pg != NULL);
+
+ result = &(*pg)->group;
+
+ *pg = TAILQ_NEXT(*pg, link);
+ if (*pg == NULL) {
+ *pg = TAILQ_FIRST(&rtransport->poll_groups);
+ }
+
+ pthread_mutex_unlock(&rtransport->lock);
+
+ return result;
+}
+
+static void
+nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
+{
+ struct spdk_nvmf_rdma_poll_group *rgroup, *next_rgroup;
+ struct spdk_nvmf_rdma_poller *poller, *tmp;
+ struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair;
+ struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp_buf;
+ struct spdk_nvmf_rdma_transport *rtransport;
+
+ rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
+ if (!rgroup) {
+ return;
+ }
+
+ /* free all retired buffers back to the transport so we don't short the mempool. */
+ STAILQ_FOREACH_SAFE(buf, &rgroup->retired_bufs, link, tmp_buf) {
+ STAILQ_REMOVE(&rgroup->retired_bufs, buf, spdk_nvmf_transport_pg_cache_buf, link);
+ assert(group->transport != NULL);
+ spdk_mempool_put(group->transport->data_buf_pool, buf);
+ }
+
+ TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) {
+ TAILQ_REMOVE(&rgroup->pollers, poller, link);
+
+ TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) {
+ nvmf_rdma_qpair_destroy(qpair);
+ }
+
+ if (poller->srq) {
+ if (poller->resources) {
+ nvmf_rdma_resources_destroy(poller->resources);
+ }
+ ibv_destroy_srq(poller->srq);
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Destroyed RDMA shared queue %p\n", poller->srq);
+ }
+
+ if (poller->cq) {
+ ibv_destroy_cq(poller->cq);
+ }
+
+ free(poller);
+ }
+
+ if (rgroup->group.transport == NULL) {
+ /* Transport can be NULL when nvmf_rdma_poll_group_create()
+ * calls this function directly in a failure path. */
+ free(rgroup);
+ return;
+ }
+
+ rtransport = SPDK_CONTAINEROF(rgroup->group.transport, struct spdk_nvmf_rdma_transport, transport);
+
+ pthread_mutex_lock(&rtransport->lock);
+ next_rgroup = TAILQ_NEXT(rgroup, link);
+ TAILQ_REMOVE(&rtransport->poll_groups, rgroup, link);
+ if (next_rgroup == NULL) {
+ next_rgroup = TAILQ_FIRST(&rtransport->poll_groups);
+ }
+ if (rtransport->conn_sched.next_admin_pg == rgroup) {
+ rtransport->conn_sched.next_admin_pg = next_rgroup;
+ }
+ if (rtransport->conn_sched.next_io_pg == rgroup) {
+ rtransport->conn_sched.next_io_pg = next_rgroup;
+ }
+ pthread_mutex_unlock(&rtransport->lock);
+
+ free(rgroup);
+}
+
+static void
+nvmf_rdma_qpair_reject_connection(struct spdk_nvmf_rdma_qpair *rqpair)
+{
+ if (rqpair->cm_id != NULL) {
+ nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES);
+ }
+ nvmf_rdma_qpair_destroy(rqpair);
+}
+
+static int
+nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct spdk_nvmf_rdma_device *device;
+ struct spdk_nvmf_rdma_poller *poller;
+ int rc;
+
+ rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ device = rqpair->device;
+
+ TAILQ_FOREACH(poller, &rgroup->pollers, link) {
+ if (poller->device == device) {
+ break;
+ }
+ }
+
+ if (!poller) {
+ SPDK_ERRLOG("No poller found for device.\n");
+ return -1;
+ }
+
+ TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link);
+ rqpair->poller = poller;
+ rqpair->srq = rqpair->poller->srq;
+
+ rc = nvmf_rdma_qpair_initialize(qpair);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair);
+ return -1;
+ }
+
+ rc = nvmf_rdma_event_accept(rqpair->cm_id, rqpair);
+ if (rc) {
+ /* Try to reject, but we probably can't */
+ nvmf_rdma_qpair_reject_connection(rqpair);
+ return -1;
+ }
+
+ nvmf_rdma_update_ibv_state(rqpair);
+
+ return 0;
+}
+
+static int
+nvmf_rdma_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+ assert(group->transport->tgt != NULL);
+
+ rqpair->destruct_channel = spdk_get_io_channel(group->transport->tgt);
+
+ if (!rqpair->destruct_channel) {
+ SPDK_WARNLOG("failed to get io_channel, qpair %p\n", qpair);
+ return 0;
+ }
+
+ /* Sanity check that we get io_channel on the correct thread */
+ if (qpair->group) {
+ assert(qpair->group->thread == spdk_io_channel_get_thread(rqpair->destruct_channel));
+ }
+
+ return 0;
+}
+
+static int
+nvmf_rdma_request_free(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req);
+ struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport,
+ struct spdk_nvmf_rdma_transport, transport);
+ struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair,
+ struct spdk_nvmf_rdma_qpair, qpair);
+
+ /*
+ * AER requests are freed when a qpair is destroyed. The recv corresponding to that request
+ * needs to be returned to the shared receive queue or the poll group will eventually be
+ * starved of RECV structures.
+ */
+ if (rqpair->srq && rdma_req->recv) {
+ int rc;
+ struct ibv_recv_wr *bad_recv_wr;
+
+ rc = ibv_post_srq_recv(rqpair->srq, &rdma_req->recv->wr, &bad_recv_wr);
+ if (rc) {
+ SPDK_ERRLOG("Unable to re-post rx descriptor\n");
+ }
+ }
+
+ _nvmf_rdma_request_free(rdma_req, rtransport);
+ return 0;
+}
+
+static int
+nvmf_rdma_request_complete(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport,
+ struct spdk_nvmf_rdma_transport, transport);
+ struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req,
+ struct spdk_nvmf_rdma_request, req);
+ struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair,
+ struct spdk_nvmf_rdma_qpair, qpair);
+
+ if (rqpair->ibv_state != IBV_QPS_ERR) {
+ /* The connection is alive, so process the request as normal */
+ rdma_req->state = RDMA_REQUEST_STATE_EXECUTED;
+ } else {
+ /* The connection is dead. Move the request directly to the completed state. */
+ rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+ }
+
+ nvmf_rdma_request_process(rtransport, rdma_req);
+
+ return 0;
+}
+
+static int
+nvmf_rdma_destroy_defunct_qpair(void *ctx)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair = ctx;
+ struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport,
+ struct spdk_nvmf_rdma_transport, transport);
+
+ SPDK_INFOLOG(SPDK_LOG_RDMA, "QP#%d hasn't been drained as expected, manually destroy it\n",
+ rqpair->qpair.qid);
+
+ nvmf_rdma_qpair_process_pending(rtransport, rqpair, true);
+ nvmf_rdma_qpair_destroy(rqpair);
+
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ if (rqpair->disconnect_flags & RDMA_QP_DISCONNECTING) {
+ return;
+ }
+
+ rqpair->disconnect_flags |= RDMA_QP_DISCONNECTING;
+
+ /* This happens only when the qpair is disconnected before
+ * it is added to the poll group. Since there is no poll group,
+ * the RDMA qp has not been initialized yet and the RDMA CM
+ * event has not yet been acknowledged, so we need to reject it.
+ */
+ if (rqpair->qpair.state == SPDK_NVMF_QPAIR_UNINITIALIZED) {
+ nvmf_rdma_qpair_reject_connection(rqpair);
+ return;
+ }
+
+ if (rqpair->rdma_qp) {
+ spdk_rdma_qp_disconnect(rqpair->rdma_qp);
+ }
+
+ rqpair->destruct_poller = SPDK_POLLER_REGISTER(nvmf_rdma_destroy_defunct_qpair, (void *)rqpair,
+ NVMF_RDMA_QPAIR_DESTROY_TIMEOUT_US);
+}
+
+static struct spdk_nvmf_rdma_qpair *
+get_rdma_qpair_from_wc(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_wc *wc)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ /* @todo: improve QP search */
+ TAILQ_FOREACH(rqpair, &rpoller->qpairs, link) {
+ if (wc->qp_num == rqpair->rdma_qp->qp->qp_num) {
+ return rqpair;
+ }
+ }
+ SPDK_ERRLOG("Didn't find QP with qp_num %u\n", wc->qp_num);
+ return NULL;
+}
+
+#ifdef DEBUG
+static int
+nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req)
+{
+ return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST ||
+ rdma_req->state == RDMA_REQUEST_STATE_COMPLETING;
+}
+#endif
+
+static void
+_poller_reset_failed_recvs(struct spdk_nvmf_rdma_poller *rpoller, struct ibv_recv_wr *bad_recv_wr,
+ int rc)
+{
+ struct spdk_nvmf_rdma_recv *rdma_recv;
+ struct spdk_nvmf_rdma_wr *bad_rdma_wr;
+
+ SPDK_ERRLOG("Failed to post a recv for the poller %p with errno %d\n", rpoller, -rc);
+ while (bad_recv_wr != NULL) {
+ bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_recv_wr->wr_id;
+ rdma_recv = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr);
+
+ rdma_recv->qpair->current_recv_depth++;
+ bad_recv_wr = bad_recv_wr->next;
+ SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rdma_recv->qpair, -rc);
+ nvmf_rdma_start_disconnect(rdma_recv->qpair);
+ }
+}
+
+static void
+_qp_reset_failed_recvs(struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_recv_wr *bad_recv_wr, int rc)
+{
+ SPDK_ERRLOG("Failed to post a recv for the qpair %p with errno %d\n", rqpair, -rc);
+ while (bad_recv_wr != NULL) {
+ bad_recv_wr = bad_recv_wr->next;
+ rqpair->current_recv_depth++;
+ }
+ nvmf_rdma_start_disconnect(rqpair);
+}
+
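+/* Re-post receive WRs that were batched while processing completions: either
+ * a single chain to the shared receive queue, or one chain per qpair that has
+ * pending receives when SRQ is not in use. */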
+static void
+_poller_submit_recvs(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_poller *rpoller)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct ibv_recv_wr *bad_recv_wr;
+ int rc;
+
+ if (rpoller->srq) {
+ if (rpoller->resources->recvs_to_post.first != NULL) {
+ rc = ibv_post_srq_recv(rpoller->srq, rpoller->resources->recvs_to_post.first, &bad_recv_wr);
+ if (rc) {
+ _poller_reset_failed_recvs(rpoller, bad_recv_wr, rc);
+ }
+ rpoller->resources->recvs_to_post.first = NULL;
+ rpoller->resources->recvs_to_post.last = NULL;
+ }
+ } else {
+ while (!STAILQ_EMPTY(&rpoller->qpairs_pending_recv)) {
+ rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_recv);
+ assert(rqpair->resources->recvs_to_post.first != NULL);
+ rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->resources->recvs_to_post.first, &bad_recv_wr);
+ if (rc) {
+ _qp_reset_failed_recvs(rqpair, bad_recv_wr, rc);
+ }
+ rqpair->resources->recvs_to_post.first = NULL;
+ rqpair->resources->recvs_to_post.last = NULL;
+ STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_recv, recv_link);
+ }
+ }
+}
+
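+/* Walk the chain of send WRs that failed to post, roll back the send/read
+ * depth accounting and move the affected requests toward completion. WRs of
+ * the same request are contiguous in the chain, so each request is handled
+ * only once, when its first failed WR is encountered. */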
+static void
+_qp_reset_failed_sends(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_qpair *rqpair, struct ibv_send_wr *bad_wr, int rc)
+{
+ struct spdk_nvmf_rdma_wr *bad_rdma_wr;
+ struct spdk_nvmf_rdma_request *prev_rdma_req = NULL, *cur_rdma_req = NULL;
+
+ SPDK_ERRLOG("Failed to post a send for the qpair %p with errno %d\n", rqpair, -rc);
+ for (; bad_wr != NULL; bad_wr = bad_wr->next) {
+ bad_rdma_wr = (struct spdk_nvmf_rdma_wr *)bad_wr->wr_id;
+ assert(rqpair->current_send_depth > 0);
+ rqpair->current_send_depth--;
+ switch (bad_rdma_wr->type) {
+ case RDMA_WR_TYPE_DATA:
+ cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr);
+ if (bad_wr->opcode == IBV_WR_RDMA_READ) {
+ assert(rqpair->current_read_depth > 0);
+ rqpair->current_read_depth--;
+ }
+ break;
+ case RDMA_WR_TYPE_SEND:
+ cur_rdma_req = SPDK_CONTAINEROF(bad_rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr);
+ break;
+ default:
+ SPDK_ERRLOG("Found a RECV in the list of pending SEND requests for qpair %p\n", rqpair);
+ prev_rdma_req = cur_rdma_req;
+ continue;
+ }
+
+ if (prev_rdma_req == cur_rdma_req) {
+ /* This request was already handled by an earlier WR, i.e. we were performing an NVMe read. */
+ /* We only have to check against prev_rdma_req since each request's WRs are contiguous in this list. */
+ continue;
+ }
+
+ switch (cur_rdma_req->state) {
+ case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
+ cur_rdma_req->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ cur_rdma_req->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
+ break;
+ case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
+ case RDMA_REQUEST_STATE_COMPLETING:
+ cur_rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+ break;
+ default:
+ SPDK_ERRLOG("Found a request in a bad state %d when draining pending SEND requests for qpair %p\n",
+ cur_rdma_req->state, rqpair);
+ continue;
+ }
+
+ nvmf_rdma_request_process(rtransport, cur_rdma_req);
+ prev_rdma_req = cur_rdma_req;
+ }
+
+ if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) {
+ /* Disconnect the connection. */
+ nvmf_rdma_start_disconnect(rqpair);
+ }
+
+}
+
+static void
+_poller_submit_sends(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_poller *rpoller)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct ibv_send_wr *bad_wr = NULL;
+ int rc;
+
+ while (!STAILQ_EMPTY(&rpoller->qpairs_pending_send)) {
+ rqpair = STAILQ_FIRST(&rpoller->qpairs_pending_send);
+ rc = spdk_rdma_qp_flush_send_wrs(rqpair->rdma_qp, &bad_wr);
+
+ /* bad wr always points to the first wr that failed. */
+ if (rc) {
+ _qp_reset_failed_sends(rtransport, rqpair, bad_wr, rc);
+ }
+ STAILQ_REMOVE_HEAD(&rpoller->qpairs_pending_send, send_link);
+ }
+}
+
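+/* Reap up to 32 work completions from the poller's CQ and advance the request
+ * state machine for each of them. Returns the number of successfully completed
+ * responses (SEND completions), or -1 if any completion carried an error
+ * status; on success, batched receives and sends are then flushed. */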
+static int
+nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport,
+ struct spdk_nvmf_rdma_poller *rpoller)
+{
+ struct ibv_wc wc[32];
+ struct spdk_nvmf_rdma_wr *rdma_wr;
+ struct spdk_nvmf_rdma_request *rdma_req;
+ struct spdk_nvmf_rdma_recv *rdma_recv;
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ int reaped, i;
+ int count = 0;
+ bool error = false;
+ uint64_t poll_tsc = spdk_get_ticks();
+
+ /* Poll for completing operations. */
+ reaped = ibv_poll_cq(rpoller->cq, 32, wc);
+ if (reaped < 0) {
+ SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
+ errno, spdk_strerror(errno));
+ return -1;
+ }
+
+ rpoller->stat.polls++;
+ rpoller->stat.completions += reaped;
+
+ for (i = 0; i < reaped; i++) {
+
+ rdma_wr = (struct spdk_nvmf_rdma_wr *)wc[i].wr_id;
+
+ switch (rdma_wr->type) {
+ case RDMA_WR_TYPE_SEND:
+ rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, rsp.rdma_wr);
+ rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ if (!wc[i].status) {
+ count++;
+ assert(wc[i].opcode == IBV_WC_SEND);
+ assert(nvmf_rdma_req_is_completing(rdma_req));
+ }
+
+ rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+ /* RDMA_WRITE operation completed. +1 since it was chained with rsp WR */
+ rqpair->current_send_depth -= rdma_req->num_outstanding_data_wr + 1;
+ rdma_req->num_outstanding_data_wr = 0;
+
+ nvmf_rdma_request_process(rtransport, rdma_req);
+ break;
+ case RDMA_WR_TYPE_RECV:
+ /* rdma_recv->qpair will be invalid if using an SRQ. In that case we have to get the qpair from the wc. */
+ rdma_recv = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_recv, rdma_wr);
+ if (rpoller->srq != NULL) {
+ rdma_recv->qpair = get_rdma_qpair_from_wc(rpoller, &wc[i]);
+ /* It is possible that there are still some completions for destroyed QP
+ * associated with SRQ. We just ignore these late completions and re-post
+ * receive WRs back to SRQ.
+ */
+ if (spdk_unlikely(NULL == rdma_recv->qpair)) {
+ struct ibv_recv_wr *bad_wr;
+ int rc;
+
+ rdma_recv->wr.next = NULL;
+ rc = ibv_post_srq_recv(rpoller->srq,
+ &rdma_recv->wr,
+ &bad_wr);
+ if (rc) {
+ SPDK_ERRLOG("Failed to re-post recv WR to SRQ, err %d\n", rc);
+ }
+ continue;
+ }
+ }
+ rqpair = rdma_recv->qpair;
+
+ assert(rqpair != NULL);
+ if (!wc[i].status) {
+ assert(wc[i].opcode == IBV_WC_RECV);
+ if (rqpair->current_recv_depth >= rqpair->max_queue_depth) {
+ nvmf_rdma_start_disconnect(rqpair);
+ break;
+ }
+ }
+
+ rdma_recv->wr.next = NULL;
+ rqpair->current_recv_depth++;
+ rdma_recv->receive_tsc = poll_tsc;
+ rpoller->stat.requests++;
+ STAILQ_INSERT_TAIL(&rqpair->resources->incoming_queue, rdma_recv, link);
+ break;
+ case RDMA_WR_TYPE_DATA:
+ rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvmf_rdma_request, data.rdma_wr);
+ rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ assert(rdma_req->num_outstanding_data_wr > 0);
+
+ rqpair->current_send_depth--;
+ rdma_req->num_outstanding_data_wr--;
+ if (!wc[i].status) {
+ assert(wc[i].opcode == IBV_WC_RDMA_READ);
+ rqpair->current_read_depth--;
+ /* wait for all outstanding reads associated with the same rdma_req to complete before proceeding. */
+ if (rdma_req->num_outstanding_data_wr == 0) {
+ rdma_req->state = RDMA_REQUEST_STATE_READY_TO_EXECUTE;
+ nvmf_rdma_request_process(rtransport, rdma_req);
+ }
+ } else {
+ /* If the data transfer fails, the queue will still be forced into the error state.
+ * If we were performing an RDMA_READ, we need to force the request into a
+ * completed state since it wasn't linked to a send. However, in the RDMA_WRITE
+ * case, we should wait for the SEND to complete. */
+ if (rdma_req->data.wr.opcode == IBV_WR_RDMA_READ) {
+ rqpair->current_read_depth--;
+ if (rdma_req->num_outstanding_data_wr == 0) {
+ rdma_req->state = RDMA_REQUEST_STATE_COMPLETED;
+ }
+ }
+ }
+ break;
+ default:
+ SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode);
+ continue;
+ }
+
+ /* Handle error conditions */
+ if (wc[i].status) {
+ if ((rdma_wr->type == RDMA_WR_TYPE_RECV && !rpoller->srq)) {
+ /* When we don't use SRQ and close a qpair, we will receive completions with error
+ * status for all posted ibv_recv_wrs. This is expected and we don't want to log
+ * an error in that case. */
+ SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Error on CQ %p, request 0x%lu, type %d, status: (%d): %s\n",
+ rpoller->cq, wc[i].wr_id, rdma_wr->type, wc[i].status, ibv_wc_status_str(wc[i].status));
+ } else {
+ SPDK_ERRLOG("Error on CQ %p, request 0x%lu, type %d, status: (%d): %s\n",
+ rpoller->cq, wc[i].wr_id, rdma_wr->type, wc[i].status, ibv_wc_status_str(wc[i].status));
+ }
+
+ error = true;
+
+ if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE) {
+ /* Disconnect the connection. */
+ nvmf_rdma_start_disconnect(rqpair);
+ } else {
+ nvmf_rdma_destroy_drained_qpair(rqpair);
+ }
+ continue;
+ }
+
+ nvmf_rdma_qpair_process_pending(rtransport, rqpair, false);
+
+ if (rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) {
+ nvmf_rdma_destroy_drained_qpair(rqpair);
+ }
+ }
+
+ if (error == true) {
+ return -1;
+ }
+
+ /* submit outstanding work requests. */
+ _poller_submit_recvs(rtransport, rpoller);
+ _poller_submit_sends(rtransport, rpoller);
+
+ return count;
+}
+
+static int
+nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
+{
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+ struct spdk_nvmf_rdma_poller *rpoller;
+ int count, rc;
+
+ rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport);
+ rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group);
+
+ count = 0;
+ TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
+ rc = nvmf_rdma_poller_poll(rtransport, rpoller);
+ if (rc < 0) {
+ return rc;
+ }
+ count += rc;
+ }
+
+ return count;
+}
+
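+/* Fill a transport ID from an rdma_cm_id, using the peer address when 'peer'
+ * is true and the local address otherwise. Only AF_INET and AF_INET6 are
+ * supported; any other address family fails with -1. */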
+static int
+nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id,
+ struct spdk_nvme_transport_id *trid,
+ bool peer)
+{
+ struct sockaddr *saddr;
+ uint16_t port;
+
+ spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_RDMA);
+
+ if (peer) {
+ saddr = rdma_get_peer_addr(id);
+ } else {
+ saddr = rdma_get_local_addr(id);
+ }
+ switch (saddr->sa_family) {
+ case AF_INET: {
+ struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr;
+
+ trid->adrfam = SPDK_NVMF_ADRFAM_IPV4;
+ inet_ntop(AF_INET, &saddr_in->sin_addr,
+ trid->traddr, sizeof(trid->traddr));
+ if (peer) {
+ port = ntohs(rdma_get_dst_port(id));
+ } else {
+ port = ntohs(rdma_get_src_port(id));
+ }
+ snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port);
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr;
+ trid->adrfam = SPDK_NVMF_ADRFAM_IPV6;
+ inet_ntop(AF_INET6, &saddr_in->sin6_addr,
+ trid->traddr, sizeof(trid->traddr));
+ if (peer) {
+ port = ntohs(rdma_get_dst_port(id));
+ } else {
+ port = ntohs(rdma_get_src_port(id));
+ }
+ snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port);
+ break;
+ }
+ default:
+ return -1;
+
+ }
+
+ return 0;
+}
+
+static int
+nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true);
+}
+
+static int
+nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ return nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false);
+}
+
+static int
+nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+
+ return nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false);
+}
+
+void
+spdk_nvmf_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks)
+{
+ g_nvmf_hooks = *hooks;
+}
+
+static void
+nvmf_rdma_request_set_abort_status(struct spdk_nvmf_request *req,
+ struct spdk_nvmf_rdma_request *rdma_req_to_abort)
+{
+ rdma_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ rdma_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
+
+ rdma_req_to_abort->state = RDMA_REQUEST_STATE_READY_TO_COMPLETE;
+
+ req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. */
+}
+
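+/* Poller callback that attempts to abort req->req_to_abort. Requests still
+ * queued inside the transport are removed from their queue and completed with
+ * an ABORTED status; a request that is executing is passed to
+ * nvmf_ctrlr_abort_request(); one that is mid host-to-controller transfer is
+ * retried until the abort timeout expires. */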
+static int
+_nvmf_rdma_qpair_abort_request(void *ctx)
+{
+ struct spdk_nvmf_request *req = ctx;
+ struct spdk_nvmf_rdma_request *rdma_req_to_abort = SPDK_CONTAINEROF(
+ req->req_to_abort, struct spdk_nvmf_rdma_request, req);
+ struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair,
+ struct spdk_nvmf_rdma_qpair, qpair);
+ int rc;
+
+ spdk_poller_unregister(&req->poller);
+
+ switch (rdma_req_to_abort->state) {
+ case RDMA_REQUEST_STATE_EXECUTING:
+ rc = nvmf_ctrlr_abort_request(req);
+ if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) {
+ return SPDK_POLLER_BUSY;
+ }
+ break;
+
+ case RDMA_REQUEST_STATE_NEED_BUFFER:
+ STAILQ_REMOVE(&rqpair->poller->group->group.pending_buf_queue,
+ &rdma_req_to_abort->req, spdk_nvmf_request, buf_link);
+
+ nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort);
+ break;
+
+ case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_CONTROLLER_PENDING:
+ STAILQ_REMOVE(&rqpair->pending_rdma_read_queue, rdma_req_to_abort,
+ spdk_nvmf_rdma_request, state_link);
+
+ nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort);
+ break;
+
+ case RDMA_REQUEST_STATE_DATA_TRANSFER_TO_HOST_PENDING:
+ STAILQ_REMOVE(&rqpair->pending_rdma_write_queue, rdma_req_to_abort,
+ spdk_nvmf_rdma_request, state_link);
+
+ nvmf_rdma_request_set_abort_status(req, rdma_req_to_abort);
+ break;
+
+ case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
+ if (spdk_get_ticks() < req->timeout_tsc) {
+ req->poller = SPDK_POLLER_REGISTER(_nvmf_rdma_qpair_abort_request, req, 0);
+ return SPDK_POLLER_BUSY;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ spdk_nvmf_request_complete(req);
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+nvmf_rdma_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_rdma_qpair *rqpair;
+ struct spdk_nvmf_rdma_transport *rtransport;
+ struct spdk_nvmf_transport *transport;
+ uint16_t cid;
+ uint32_t i;
+ struct spdk_nvmf_rdma_request *rdma_req_to_abort = NULL;
+
+ rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair);
+ rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport);
+ transport = &rtransport->transport;
+
+ cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
+
+ for (i = 0; i < rqpair->max_queue_depth; i++) {
+ rdma_req_to_abort = &rqpair->resources->reqs[i];
+
+ if (rdma_req_to_abort->state != RDMA_REQUEST_STATE_FREE &&
+ rdma_req_to_abort->req.cmd->nvme_cmd.cid == cid) {
+ break;
+ }
+ }
+
+ if (rdma_req_to_abort == NULL) {
+ spdk_nvmf_request_complete(req);
+ return;
+ }
+
+ req->req_to_abort = &rdma_req_to_abort->req;
+ req->timeout_tsc = spdk_get_ticks() +
+ transport->opts.abort_timeout_sec * spdk_get_ticks_hz();
+ req->poller = NULL;
+
+ _nvmf_rdma_qpair_abort_request(req);
+}
+
+static int
+nvmf_rdma_poll_group_get_stat(struct spdk_nvmf_tgt *tgt,
+ struct spdk_nvmf_transport_poll_group_stat **stat)
+{
+ struct spdk_io_channel *ch;
+ struct spdk_nvmf_poll_group *group;
+ struct spdk_nvmf_transport_poll_group *tgroup;
+ struct spdk_nvmf_rdma_poll_group *rgroup;
+ struct spdk_nvmf_rdma_poller *rpoller;
+ struct spdk_nvmf_rdma_device_stat *device_stat;
+ uint64_t num_devices = 0;
+
+ if (tgt == NULL || stat == NULL) {
+ return -EINVAL;
+ }
+
+ ch = spdk_get_io_channel(tgt);
+ group = spdk_io_channel_get_ctx(ch);
+ spdk_put_io_channel(ch);
+ TAILQ_FOREACH(tgroup, &group->tgroups, link) {
+ if (SPDK_NVME_TRANSPORT_RDMA == tgroup->transport->ops->type) {
+ *stat = calloc(1, sizeof(struct spdk_nvmf_transport_poll_group_stat));
+ if (!*stat) {
+ SPDK_ERRLOG("Failed to allocate memory for NVMf RDMA statistics\n");
+ return -ENOMEM;
+ }
+ (*stat)->trtype = SPDK_NVME_TRANSPORT_RDMA;
+
+ rgroup = SPDK_CONTAINEROF(tgroup, struct spdk_nvmf_rdma_poll_group, group);
+ /* Count devices to allocate enough memory */
+ TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
+ ++num_devices;
+ }
+ (*stat)->rdma.devices = calloc(num_devices, sizeof(struct spdk_nvmf_rdma_device_stat));
+ if (!(*stat)->rdma.devices) {
+ SPDK_ERRLOG("Failed to allocate NVMf RDMA devices statistics\n");
+ free(*stat);
+ return -ENOMEM;
+ }
+
+ (*stat)->rdma.pending_data_buffer = rgroup->stat.pending_data_buffer;
+ (*stat)->rdma.num_devices = num_devices;
+ num_devices = 0;
+ TAILQ_FOREACH(rpoller, &rgroup->pollers, link) {
+ device_stat = &(*stat)->rdma.devices[num_devices++];
+ device_stat->name = ibv_get_device_name(rpoller->device->context->device);
+ device_stat->polls = rpoller->stat.polls;
+ device_stat->completions = rpoller->stat.completions;
+ device_stat->requests = rpoller->stat.requests;
+ device_stat->request_latency = rpoller->stat.request_latency;
+ device_stat->pending_free_request = rpoller->stat.pending_free_request;
+ device_stat->pending_rdma_read = rpoller->stat.pending_rdma_read;
+ device_stat->pending_rdma_write = rpoller->stat.pending_rdma_write;
+ }
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+static void
+nvmf_rdma_poll_group_free_stat(struct spdk_nvmf_transport_poll_group_stat *stat)
+{
+ if (stat) {
+ free(stat->rdma.devices);
+ }
+ free(stat);
+}
+
+const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = {
+ .name = "RDMA",
+ .type = SPDK_NVME_TRANSPORT_RDMA,
+ .opts_init = nvmf_rdma_opts_init,
+ .create = nvmf_rdma_create,
+ .destroy = nvmf_rdma_destroy,
+
+ .listen = nvmf_rdma_listen,
+ .stop_listen = nvmf_rdma_stop_listen,
+ .accept = nvmf_rdma_accept,
+ .cdata_init = nvmf_rdma_cdata_init,
+
+ .listener_discover = nvmf_rdma_discover,
+
+ .poll_group_create = nvmf_rdma_poll_group_create,
+ .get_optimal_poll_group = nvmf_rdma_get_optimal_poll_group,
+ .poll_group_destroy = nvmf_rdma_poll_group_destroy,
+ .poll_group_add = nvmf_rdma_poll_group_add,
+ .poll_group_remove = nvmf_rdma_poll_group_remove,
+ .poll_group_poll = nvmf_rdma_poll_group_poll,
+
+ .req_free = nvmf_rdma_request_free,
+ .req_complete = nvmf_rdma_request_complete,
+
+ .qpair_fini = nvmf_rdma_close_qpair,
+ .qpair_get_peer_trid = nvmf_rdma_qpair_get_peer_trid,
+ .qpair_get_local_trid = nvmf_rdma_qpair_get_local_trid,
+ .qpair_get_listen_trid = nvmf_rdma_qpair_get_listen_trid,
+ .qpair_abort_request = nvmf_rdma_qpair_abort_request,
+
+ .poll_group_get_stat = nvmf_rdma_poll_group_get_stat,
+ .poll_group_free_stat = nvmf_rdma_poll_group_free_stat,
+};
+
+SPDK_NVMF_TRANSPORT_REGISTER(rdma, &spdk_nvmf_transport_rdma);
+SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA)
diff --git a/src/spdk/lib/nvmf/spdk_nvmf.map b/src/spdk/lib/nvmf/spdk_nvmf.map
new file mode 100644
index 000000000..994e7437b
--- /dev/null
+++ b/src/spdk/lib/nvmf/spdk_nvmf.map
@@ -0,0 +1,118 @@
+{
+ global:
+
+ # public functions in nvmf.h
+ spdk_nvmf_tgt_create;
+ spdk_nvmf_tgt_destroy;
+ spdk_nvmf_tgt_get_name;
+ spdk_nvmf_get_tgt;
+ spdk_nvmf_get_first_tgt;
+ spdk_nvmf_get_next_tgt;
+ spdk_nvmf_tgt_write_config_json;
+ spdk_nvmf_tgt_listen;
+ spdk_nvmf_tgt_stop_listen;
+ spdk_nvmf_tgt_accept;
+ spdk_nvmf_poll_group_create;
+ spdk_nvmf_get_optimal_poll_group;
+ spdk_nvmf_poll_group_destroy;
+ spdk_nvmf_poll_group_add;
+ spdk_nvmf_poll_group_get_stat;
+ spdk_nvmf_qpair_disconnect;
+ spdk_nvmf_qpair_get_peer_trid;
+ spdk_nvmf_qpair_get_local_trid;
+ spdk_nvmf_qpair_get_listen_trid;
+ spdk_nvmf_subsystem_create;
+ spdk_nvmf_subsystem_destroy;
+ spdk_nvmf_subsystem_start;
+ spdk_nvmf_subsystem_stop;
+ spdk_nvmf_subsystem_pause;
+ spdk_nvmf_subsystem_resume;
+ spdk_nvmf_tgt_find_subsystem;
+ spdk_nvmf_subsystem_get_first;
+ spdk_nvmf_subsystem_get_next;
+ spdk_nvmf_subsystem_add_host;
+ spdk_nvmf_subsystem_remove_host;
+ spdk_nvmf_subsystem_set_allow_any_host;
+ spdk_nvmf_subsystem_get_allow_any_host;
+ spdk_nvmf_subsystem_host_allowed;
+ spdk_nvmf_subsystem_get_first_host;
+ spdk_nvmf_subsystem_get_next_host;
+ spdk_nvmf_host_get_nqn;
+ spdk_nvmf_subsystem_add_listener;
+ spdk_nvmf_subsystem_remove_listener;
+ spdk_nvmf_subsystem_listener_allowed;
+ spdk_nvmf_subsystem_get_first_listener;
+ spdk_nvmf_subsystem_get_next_listener;
+ spdk_nvmf_subsystem_listener_get_trid;
+ spdk_nvmf_subsystem_allow_any_listener;
+ spdk_nvmf_subsytem_any_listener_allowed;
+ spdk_nvmf_ns_opts_get_defaults;
+ spdk_nvmf_subsystem_add_ns;
+ spdk_nvmf_subsystem_remove_ns;
+ spdk_nvmf_subsystem_get_first_ns;
+ spdk_nvmf_subsystem_get_next_ns;
+ spdk_nvmf_subsystem_get_ns;
+ spdk_nvmf_subsystem_get_max_namespaces;
+ spdk_nvmf_ns_get_id;
+ spdk_nvmf_ns_get_bdev;
+ spdk_nvmf_ns_get_opts;
+ spdk_nvmf_subsystem_get_sn;
+ spdk_nvmf_subsystem_set_sn;
+ spdk_nvmf_subsystem_get_mn;
+ spdk_nvmf_subsystem_set_mn;
+ spdk_nvmf_subsystem_get_nqn;
+ spdk_nvmf_subsystem_get_type;
+ spdk_nvmf_subsystem_get_max_nsid;
+ spdk_nvmf_transport_opts_init;
+ spdk_nvmf_transport_create;
+ spdk_nvmf_transport_destroy;
+ spdk_nvmf_tgt_get_transport;
+ spdk_nvmf_transport_get_first;
+ spdk_nvmf_transport_get_next;
+ spdk_nvmf_get_transport_opts;
+ spdk_nvmf_get_transport_type;
+ spdk_nvmf_get_transport_name;
+ spdk_nvmf_tgt_add_transport;
+ spdk_nvmf_transport_listen;
+ spdk_nvmf_transport_stop_listen;
+ spdk_nvmf_transport_poll_group_get_stat;
+ spdk_nvmf_transport_poll_group_free_stat;
+ spdk_nvmf_rdma_init_hooks;
+
+ # public functions in nvmf_cmd.h
+ spdk_nvmf_ctrlr_identify_ctrlr;
+ spdk_nvmf_ctrlr_identify_ns;
+ spdk_nvmf_set_custom_admin_cmd_hdlr;
+ spdk_nvmf_set_passthru_admin_cmd;
+ spdk_nvmf_bdev_ctrlr_nvme_passthru_admin;
+ spdk_nvmf_request_get_bdev;
+ spdk_nvmf_request_get_ctrlr;
+ spdk_nvmf_request_get_subsystem;
+ spdk_nvmf_request_get_data;
+ spdk_nvmf_request_get_cmd;
+ spdk_nvmf_request_get_response;
+ spdk_nvmf_request_get_req_to_abort;
+ spdk_nvmf_bdev_ctrlr_abort_cmd;
+
+ # public functions in nvmf_transport.h
+ spdk_nvmf_transport_register;
+ spdk_nvmf_tgt_new_qpair;
+ spdk_nvmf_ctrlr_connect;
+ spdk_nvmf_ctrlr_data_init;
+ spdk_nvmf_ctrlr_get_regs;
+ spdk_nvmf_request_free_buffers;
+ spdk_nvmf_request_get_buffers;
+ spdk_nvmf_request_get_buffers_multi;
+ spdk_nvmf_request_get_dif_ctx;
+ spdk_nvmf_request_exec;
+ spdk_nvmf_request_exec_fabrics;
+ spdk_nvmf_request_free;
+ spdk_nvmf_request_complete;
+ spdk_nvmf_ctrlr_get_subsystem;
+ spdk_nvmf_ctrlr_get_id;
+ spdk_nvmf_req_get_xfer;
+ spdk_nvmf_poll_group_remove;
+
+
+ local: *;
+};
diff --git a/src/spdk/lib/nvmf/subsystem.c b/src/spdk/lib/nvmf/subsystem.c
new file mode 100644
index 000000000..ebe8d9a8e
--- /dev/null
+++ b/src/spdk/lib/nvmf/subsystem.c
@@ -0,0 +1,2515 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "nvmf_internal.h"
+#include "transport.h"
+
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/trace.h"
+#include "spdk/nvmf_spec.h"
+#include "spdk/uuid.h"
+#include "spdk/json.h"
+#include "spdk/file.h"
+
+#include "spdk/bdev_module.h"
+#include "spdk_internal/log.h"
+#include "spdk_internal/utf.h"
+
+#define MODEL_NUMBER_DEFAULT "SPDK bdev Controller"
+
+/*
+ * States for parsing valid domains in NQNs according to RFC 1034
+ */
+enum spdk_nvmf_nqn_domain_states {
+ /* First character of a domain must be a letter */
+ SPDK_NVMF_DOMAIN_ACCEPT_LETTER = 0,
+
+ /* Subsequent characters can be any of letter, digit, or hyphen */
+ SPDK_NVMF_DOMAIN_ACCEPT_LDH = 1,
+
+ /* A domain label must end with either a letter or digit */
+ SPDK_NVMF_DOMAIN_ACCEPT_ANY = 2
+};
+
+/* Returns true if the buffer is a valid ASCII string as defined by the NVMe spec */
+static bool
+nvmf_valid_ascii_string(const void *buf, size_t size)
+{
+ const uint8_t *str = buf;
+ size_t i;
+
+ for (i = 0; i < size; i++) {
+ if (str[i] < 0x20 || str[i] > 0x7E) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
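+/* Validate an NQN: enforce the length limits, accept the discovery NQN and the
+ * UUID form ("nqn.2014-08.org.nvmexpress:uuid:..."), and otherwise require the
+ * generic "nqn.yyyy-mm.reverse.domain:user-string" form, whose reverse-domain
+ * part is checked with the RFC 1034 letter/digit/hyphen state machine above
+ * and whose user string must be valid UTF-8. */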
+static bool
+nvmf_valid_nqn(const char *nqn)
+{
+ size_t len;
+ struct spdk_uuid uuid_value;
+ uint32_t i;
+ int bytes_consumed;
+ uint32_t domain_label_length;
+ char *reverse_domain_end;
+ uint32_t reverse_domain_end_index;
+ enum spdk_nvmf_nqn_domain_states domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER;
+
+ /* Check for length requirements */
+ len = strlen(nqn);
+ if (len > SPDK_NVMF_NQN_MAX_LEN) {
+ SPDK_ERRLOG("Invalid NQN \"%s\": length %zu > max %d\n", nqn, len, SPDK_NVMF_NQN_MAX_LEN);
+ return false;
+ }
+
+ /* The nqn must be at least as long as SPDK_NVMF_NQN_MIN_LEN to contain the necessary prefix. */
+ if (len < SPDK_NVMF_NQN_MIN_LEN) {
+ SPDK_ERRLOG("Invalid NQN \"%s\": length %zu < min %d\n", nqn, len, SPDK_NVMF_NQN_MIN_LEN);
+ return false;
+ }
+
+ /* Check for discovery controller nqn */
+ if (!strcmp(nqn, SPDK_NVMF_DISCOVERY_NQN)) {
+ return true;
+ }
+
+ /* Check for equality with the generic nqn structure of the form "nqn.2014-08.org.nvmexpress:uuid:11111111-2222-3333-4444-555555555555" */
+ if (!strncmp(nqn, SPDK_NVMF_NQN_UUID_PRE, SPDK_NVMF_NQN_UUID_PRE_LEN)) {
+ if (len != SPDK_NVMF_NQN_UUID_PRE_LEN + SPDK_NVMF_UUID_STRING_LEN) {
+ SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not the correct length\n", nqn);
+ return false;
+ }
+
+ if (spdk_uuid_parse(&uuid_value, &nqn[SPDK_NVMF_NQN_UUID_PRE_LEN])) {
+ SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not formatted correctly\n", nqn);
+ return false;
+ }
+ return true;
+ }
+
+ /* If the nqn does not match the uuid structure, the next several checks validate the form "nqn.yyyy-mm.reverse.domain:user-string" */
+
+ if (strncmp(nqn, "nqn.", 4) != 0) {
+ SPDK_ERRLOG("Invalid NQN \"%s\": NQN must begin with \"nqn.\".\n", nqn);
+ return false;
+ }
+
+ /* Check for yyyy-mm. */
+ if (!(isdigit(nqn[4]) && isdigit(nqn[5]) && isdigit(nqn[6]) && isdigit(nqn[7]) &&
+ nqn[8] == '-' && isdigit(nqn[9]) && isdigit(nqn[10]) && nqn[11] == '.')) {
+ SPDK_ERRLOG("Invalid date code in NQN \"%s\"\n", nqn);
+ return false;
+ }
+
+ reverse_domain_end = strchr(nqn, ':');
+ if (reverse_domain_end == NULL ||
+     (reverse_domain_end_index = reverse_domain_end - nqn) >= len - 1) {
+ SPDK_ERRLOG("Invalid NQN \"%s\". NQN must contain user specified name with a ':' as a prefix.\n",
+ nqn);
+ return false;
+ }
+
+ /* Check for valid reverse domain */
+ domain_label_length = 0;
+ for (i = 12; i < reverse_domain_end_index; i++) {
+ if (domain_label_length > SPDK_DOMAIN_LABEL_MAX_LEN) {
+ SPDK_ERRLOG("Invalid domain name in NQN \"%s\". At least one Label is too long.\n", nqn);
+ return false;
+ }
+
+ switch (domain_state) {
+
+ case SPDK_NVMF_DOMAIN_ACCEPT_LETTER: {
+ if (isalpha(nqn[i])) {
+ domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY;
+ domain_label_length++;
+ break;
+ } else {
+ SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must start with a letter.\n", nqn);
+ return false;
+ }
+ }
+
+ case SPDK_NVMF_DOMAIN_ACCEPT_LDH: {
+ if (isalpha(nqn[i]) || isdigit(nqn[i])) {
+ domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY;
+ domain_label_length++;
+ break;
+ } else if (nqn[i] == '-') {
+ if (i == reverse_domain_end_index - 1) {
+ SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n",
+ nqn);
+ return false;
+ }
+ domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH;
+ domain_label_length++;
+ break;
+ } else if (nqn[i] == '.') {
+ SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n",
+ nqn);
+ return false;
+ } else {
+ SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n",
+ nqn);
+ return false;
+ }
+ }
+
+ case SPDK_NVMF_DOMAIN_ACCEPT_ANY: {
+ if (isalpha(nqn[i]) || isdigit(nqn[i])) {
+ domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY;
+ domain_label_length++;
+ break;
+ } else if (nqn[i] == '-') {
+ if (i == reverse_domain_end_index - 1) {
+ SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n",
+ nqn);
+ return false;
+ }
+ domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH;
+ domain_label_length++;
+ break;
+ } else if (nqn[i] == '.') {
+ domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER;
+ domain_label_length = 0;
+ break;
+ } else {
+ SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n",
+ nqn);
+ return false;
+ }
+ }
+ }
+ }
+
+ i = reverse_domain_end_index + 1;
+ while (i < len) {
+ bytes_consumed = utf8_valid(&nqn[i], &nqn[len]);
+ if (bytes_consumed <= 0) {
+ SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only valid utf-8.\n", nqn);
+ return false;
+ }
+
+ i += bytes_consumed;
+ }
+ return true;
+}
+
+struct spdk_nvmf_subsystem *
+spdk_nvmf_subsystem_create(struct spdk_nvmf_tgt *tgt,
+ const char *nqn,
+ enum spdk_nvmf_subtype type,
+ uint32_t num_ns)
+{
+ struct spdk_nvmf_subsystem *subsystem;
+ uint32_t sid;
+
+ if (spdk_nvmf_tgt_find_subsystem(tgt, nqn)) {
+ SPDK_ERRLOG("Subsystem NQN '%s' already exists\n", nqn);
+ return NULL;
+ }
+
+ if (!nvmf_valid_nqn(nqn)) {
+ return NULL;
+ }
+
+ if (type == SPDK_NVMF_SUBTYPE_DISCOVERY && num_ns != 0) {
+ SPDK_ERRLOG("Discovery subsystem cannot have namespaces.\n");
+ return NULL;
+ }
+
+ /* Find a free subsystem id (sid) */
+ for (sid = 0; sid < tgt->max_subsystems; sid++) {
+ if (tgt->subsystems[sid] == NULL) {
+ break;
+ }
+ }
+ if (sid >= tgt->max_subsystems) {
+ return NULL;
+ }
+
+ subsystem = calloc(1, sizeof(struct spdk_nvmf_subsystem));
+ if (subsystem == NULL) {
+ return NULL;
+ }
+
+ subsystem->thread = spdk_get_thread();
+ subsystem->state = SPDK_NVMF_SUBSYSTEM_INACTIVE;
+ subsystem->tgt = tgt;
+ subsystem->id = sid;
+ subsystem->subtype = type;
+ subsystem->max_nsid = num_ns;
+ subsystem->max_allowed_nsid = num_ns;
+ subsystem->next_cntlid = 0;
+ snprintf(subsystem->subnqn, sizeof(subsystem->subnqn), "%s", nqn);
+ TAILQ_INIT(&subsystem->listeners);
+ TAILQ_INIT(&subsystem->hosts);
+ TAILQ_INIT(&subsystem->ctrlrs);
+
+ if (num_ns != 0) {
+ subsystem->ns = calloc(num_ns, sizeof(struct spdk_nvmf_ns *));
+ if (subsystem->ns == NULL) {
+ SPDK_ERRLOG("Namespace memory allocation failed\n");
+ free(subsystem);
+ return NULL;
+ }
+ }
+
+ memset(subsystem->sn, '0', sizeof(subsystem->sn) - 1);
+ subsystem->sn[sizeof(subsystem->sn) - 1] = '\0';
+
+ snprintf(subsystem->mn, sizeof(subsystem->mn), "%s",
+ MODEL_NUMBER_DEFAULT);
+
+ tgt->subsystems[sid] = subsystem;
+ tgt->discovery_genctr++;
+
+ return subsystem;
+}
+
+static void
+nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_host *host)
+{
+ TAILQ_REMOVE(&subsystem->hosts, host, link);
+ free(host);
+}
+
+static void
+_nvmf_subsystem_remove_listener(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_subsystem_listener *listener,
+ bool stop)
+{
+ struct spdk_nvmf_transport *transport;
+
+ if (stop) {
+ transport = spdk_nvmf_tgt_get_transport(subsystem->tgt, listener->trid->trstring);
+ if (transport != NULL) {
+ spdk_nvmf_transport_stop_listen(transport, listener->trid);
+ }
+ }
+
+ TAILQ_REMOVE(&subsystem->listeners, listener, link);
+ free(listener);
+}
+
+void
+spdk_nvmf_subsystem_destroy(struct spdk_nvmf_subsystem *subsystem)
+{
+ struct spdk_nvmf_host *host, *host_tmp;
+ struct spdk_nvmf_ctrlr *ctrlr, *ctrlr_tmp;
+ struct spdk_nvmf_ns *ns;
+
+ if (!subsystem) {
+ return;
+ }
+
+ assert(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "subsystem is %p\n", subsystem);
+
+ nvmf_subsystem_remove_all_listeners(subsystem, false);
+
+ TAILQ_FOREACH_SAFE(host, &subsystem->hosts, link, host_tmp) {
+ nvmf_subsystem_remove_host(subsystem, host);
+ }
+
+ TAILQ_FOREACH_SAFE(ctrlr, &subsystem->ctrlrs, link, ctrlr_tmp) {
+ nvmf_ctrlr_destruct(ctrlr);
+ }
+
+ ns = spdk_nvmf_subsystem_get_first_ns(subsystem);
+ while (ns != NULL) {
+ struct spdk_nvmf_ns *next_ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns);
+
+ spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid);
+ ns = next_ns;
+ }
+
+ free(subsystem->ns);
+
+ subsystem->tgt->subsystems[subsystem->id] = NULL;
+ subsystem->tgt->discovery_genctr++;
+
+ free(subsystem);
+}
+
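+/* Advance the subsystem state machine with an atomic compare-and-exchange so
+ * that races between concurrent transitions are detected. Returns 0 when the
+ * old state matched the expected one (after allowing for the RESUMING -> ACTIVE
+ * and ACTIVATING -> DEACTIVATING special cases), non-zero otherwise. */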
+static int
+nvmf_subsystem_set_state(struct spdk_nvmf_subsystem *subsystem,
+ enum spdk_nvmf_subsystem_state state)
+{
+ enum spdk_nvmf_subsystem_state actual_old_state, expected_old_state;
+ bool exchanged;
+
+ switch (state) {
+ case SPDK_NVMF_SUBSYSTEM_INACTIVE:
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING;
+ break;
+ case SPDK_NVMF_SUBSYSTEM_ACTIVATING:
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_INACTIVE;
+ break;
+ case SPDK_NVMF_SUBSYSTEM_ACTIVE:
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING;
+ break;
+ case SPDK_NVMF_SUBSYSTEM_PAUSING:
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE;
+ break;
+ case SPDK_NVMF_SUBSYSTEM_PAUSED:
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSING;
+ break;
+ case SPDK_NVMF_SUBSYSTEM_RESUMING:
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSED;
+ break;
+ case SPDK_NVMF_SUBSYSTEM_DEACTIVATING:
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE;
+ break;
+ default:
+ assert(false);
+ return -1;
+ }
+
+ actual_old_state = expected_old_state;
+ exchanged = __atomic_compare_exchange_n(&subsystem->state, &actual_old_state, state, false,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+ if (spdk_unlikely(exchanged == false)) {
+ if (actual_old_state == SPDK_NVMF_SUBSYSTEM_RESUMING &&
+ state == SPDK_NVMF_SUBSYSTEM_ACTIVE) {
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_RESUMING;
+ }
+ /* This is for the case when activating the subsystem fails. */
+ if (actual_old_state == SPDK_NVMF_SUBSYSTEM_ACTIVATING &&
+ state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING) {
+ expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING;
+ }
+ actual_old_state = expected_old_state;
+ __atomic_compare_exchange_n(&subsystem->state, &actual_old_state, state, false,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+ }
+ assert(actual_old_state == expected_old_state);
+ return actual_old_state - expected_old_state;
+}
+
+struct subsystem_state_change_ctx {
+ struct spdk_nvmf_subsystem *subsystem;
+
+ enum spdk_nvmf_subsystem_state requested_state;
+
+ spdk_nvmf_subsystem_state_change_done cb_fn;
+ void *cb_arg;
+};
+
+static void
+subsystem_state_change_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct subsystem_state_change_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ if (status == 0) {
+ status = nvmf_subsystem_set_state(ctx->subsystem, ctx->requested_state);
+ if (status) {
+ status = -1;
+ }
+ }
+
+ if (ctx->cb_fn) {
+ ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status);
+ }
+ free(ctx);
+}
+
+static void
+subsystem_state_change_continue(void *ctx, int status)
+{
+ struct spdk_io_channel_iter *i = ctx;
+ spdk_for_each_channel_continue(i, status);
+}
+
+static void
+subsystem_state_change_on_pg(struct spdk_io_channel_iter *i)
+{
+ struct subsystem_state_change_ctx *ctx;
+ struct spdk_io_channel *ch;
+ struct spdk_nvmf_poll_group *group;
+
+ ctx = spdk_io_channel_iter_get_ctx(i);
+ ch = spdk_io_channel_iter_get_channel(i);
+ group = spdk_io_channel_get_ctx(ch);
+
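+ /* Apply the requested state change to this poll group; for ACTIVE, the current intermediate state distinguishes initial activation from resume. */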
+ switch (ctx->requested_state) {
+ case SPDK_NVMF_SUBSYSTEM_INACTIVE:
+ nvmf_poll_group_remove_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i);
+ break;
+ case SPDK_NVMF_SUBSYSTEM_ACTIVE:
+ if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_ACTIVATING) {
+ nvmf_poll_group_add_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i);
+ } else if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_RESUMING) {
+ nvmf_poll_group_resume_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i);
+ }
+ break;
+ case SPDK_NVMF_SUBSYSTEM_PAUSED:
+ nvmf_poll_group_pause_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i);
+ break;
+ default:
+ assert(false);
+ break;
+ }
+}
+
+static int
+nvmf_subsystem_state_change(struct spdk_nvmf_subsystem *subsystem,
+ enum spdk_nvmf_subsystem_state requested_state,
+ spdk_nvmf_subsystem_state_change_done cb_fn,
+ void *cb_arg)
+{
+ struct subsystem_state_change_ctx *ctx;
+ enum spdk_nvmf_subsystem_state intermediate_state;
+ int rc;
+
+ switch (requested_state) {
+ case SPDK_NVMF_SUBSYSTEM_INACTIVE:
+ intermediate_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING;
+ break;
+ case SPDK_NVMF_SUBSYSTEM_ACTIVE:
+ if (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) {
+ intermediate_state = SPDK_NVMF_SUBSYSTEM_RESUMING;
+ } else {
+ intermediate_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING;
+ }
+ break;
+ case SPDK_NVMF_SUBSYSTEM_PAUSED:
+ intermediate_state = SPDK_NVMF_SUBSYSTEM_PAUSING;
+ break;
+ default:
+ assert(false);
+ return -EINVAL;
+ }
+
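+ /* Move to the intermediate state first, then apply the change on every poll group; subsystem_state_change_done sets the final requested state. */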
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ return -ENOMEM;
+ }
+
+ rc = nvmf_subsystem_set_state(subsystem, intermediate_state);
+ if (rc) {
+ free(ctx);
+ return rc;
+ }
+
+ ctx->subsystem = subsystem;
+ ctx->requested_state = requested_state;
+ ctx->cb_fn = cb_fn;
+ ctx->cb_arg = cb_arg;
+
+ spdk_for_each_channel(subsystem->tgt,
+ subsystem_state_change_on_pg,
+ ctx,
+ subsystem_state_change_done);
+
+ return 0;
+}
+
+int
+spdk_nvmf_subsystem_start(struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_subsystem_state_change_done cb_fn,
+ void *cb_arg)
+{
+ return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg);
+}
+
+int
+spdk_nvmf_subsystem_stop(struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_subsystem_state_change_done cb_fn,
+ void *cb_arg)
+{
+ return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_INACTIVE, cb_fn, cb_arg);
+}
+
+int
+spdk_nvmf_subsystem_pause(struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_subsystem_state_change_done cb_fn,
+ void *cb_arg)
+{
+ return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_PAUSED, cb_fn, cb_arg);
+}
+
+int
+spdk_nvmf_subsystem_resume(struct spdk_nvmf_subsystem *subsystem,
+ spdk_nvmf_subsystem_state_change_done cb_fn,
+ void *cb_arg)
+{
+ return nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg);
+}
+
+struct spdk_nvmf_subsystem *
+spdk_nvmf_subsystem_get_first(struct spdk_nvmf_tgt *tgt)
+{
+ struct spdk_nvmf_subsystem *subsystem;
+ uint32_t sid;
+
+ for (sid = 0; sid < tgt->max_subsystems; sid++) {
+ subsystem = tgt->subsystems[sid];
+ if (subsystem) {
+ return subsystem;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_nvmf_subsystem *
+spdk_nvmf_subsystem_get_next(struct spdk_nvmf_subsystem *subsystem)
+{
+ uint32_t sid;
+ struct spdk_nvmf_tgt *tgt;
+
+ if (!subsystem) {
+ return NULL;
+ }
+
+ tgt = subsystem->tgt;
+
+ for (sid = subsystem->id + 1; sid < tgt->max_subsystems; sid++) {
+ subsystem = tgt->subsystems[sid];
+ if (subsystem) {
+ return subsystem;
+ }
+ }
+
+ return NULL;
+}
+
+static struct spdk_nvmf_host *
+nvmf_subsystem_find_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn)
+{
+ struct spdk_nvmf_host *host = NULL;
+
+ TAILQ_FOREACH(host, &subsystem->hosts, link) {
+ if (strcmp(hostnqn, host->nqn) == 0) {
+ return host;
+ }
+ }
+
+ return NULL;
+}
+
+int
+spdk_nvmf_subsystem_add_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn)
+{
+ struct spdk_nvmf_host *host;
+
+ if (!nvmf_valid_nqn(hostnqn)) {
+ return -EINVAL;
+ }
+
+ if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE ||
+ subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) {
+ return -EAGAIN;
+ }
+
+ if (nvmf_subsystem_find_host(subsystem, hostnqn)) {
+ /* This subsystem already allows the specified host. */
+ return 0;
+ }
+
+ host = calloc(1, sizeof(*host));
+ if (!host) {
+ return -ENOMEM;
+ }
+
+ snprintf(host->nqn, sizeof(host->nqn), "%s", hostnqn);
+
+ TAILQ_INSERT_HEAD(&subsystem->hosts, host, link);
+ subsystem->tgt->discovery_genctr++;
+
+ return 0;
+}
+
+int
+spdk_nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn)
+{
+ struct spdk_nvmf_host *host;
+
+ if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE ||
+ subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) {
+ return -EAGAIN;
+ }
+
+ host = nvmf_subsystem_find_host(subsystem, hostnqn);
+ if (host == NULL) {
+ return -ENOENT;
+ }
+
+ nvmf_subsystem_remove_host(subsystem, host);
+ return 0;
+}
+
+int
+spdk_nvmf_subsystem_set_allow_any_host(struct spdk_nvmf_subsystem *subsystem, bool allow_any_host)
+{
+ if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE ||
+ subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) {
+ return -EAGAIN;
+ }
+
+ subsystem->allow_any_host = allow_any_host;
+
+ return 0;
+}
+
+bool
+spdk_nvmf_subsystem_get_allow_any_host(const struct spdk_nvmf_subsystem *subsystem)
+{
+ return subsystem->allow_any_host;
+}
+
+bool
+spdk_nvmf_subsystem_host_allowed(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn)
+{
+ if (!hostnqn) {
+ return false;
+ }
+
+ if (subsystem->allow_any_host) {
+ return true;
+ }
+
+ return nvmf_subsystem_find_host(subsystem, hostnqn) != NULL;
+}
+
+struct spdk_nvmf_host *
+spdk_nvmf_subsystem_get_first_host(struct spdk_nvmf_subsystem *subsystem)
+{
+ return TAILQ_FIRST(&subsystem->hosts);
+}
+
+
+struct spdk_nvmf_host *
+spdk_nvmf_subsystem_get_next_host(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_host *prev_host)
+{
+ return TAILQ_NEXT(prev_host, link);
+}
+
+const char *
+spdk_nvmf_host_get_nqn(const struct spdk_nvmf_host *host)
+{
+ return host->nqn;
+}
+
+struct spdk_nvmf_subsystem_listener *
+nvmf_subsystem_find_listener(struct spdk_nvmf_subsystem *subsystem,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_subsystem_listener *listener;
+
+ TAILQ_FOREACH(listener, &subsystem->listeners, link) {
+ if (spdk_nvme_transport_id_compare(listener->trid, trid) == 0) {
+ return listener;
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * Function to be called once the target is listening.
+ *
+ * \param ctx Context argument passed to this function.
+ * \param status 0 if it completed successfully, or negative errno if it failed.
+ */
+static void
+_nvmf_subsystem_add_listener_done(void *ctx, int status)
+{
+ struct spdk_nvmf_subsystem_listener *listener = ctx;
+
+ if (status) {
+ listener->cb_fn(listener->cb_arg, status);
+ free(listener);
+ return;
+ }
+
+ TAILQ_INSERT_HEAD(&listener->subsystem->listeners, listener, link);
+ listener->subsystem->tgt->discovery_genctr++;
+ listener->cb_fn(listener->cb_arg, status);
+}
+
+void
+spdk_nvmf_subsystem_add_listener(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvme_transport_id *trid,
+ spdk_nvmf_tgt_subsystem_listen_done_fn cb_fn,
+ void *cb_arg)
+{
+ struct spdk_nvmf_transport *transport;
+ struct spdk_nvmf_subsystem_listener *listener;
+ struct spdk_nvmf_listener *tr_listener;
+
+ assert(cb_fn != NULL);
+
+ if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE ||
+ subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) {
+ cb_fn(cb_arg, -EAGAIN);
+ return;
+ }
+
+ if (nvmf_subsystem_find_listener(subsystem, trid)) {
+ /* Listener already exists in this subsystem */
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ transport = spdk_nvmf_tgt_get_transport(subsystem->tgt, trid->trstring);
+ if (transport == NULL) {
+ SPDK_ERRLOG("Unknown transport type %d\n", trid->trtype);
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ tr_listener = nvmf_transport_find_listener(transport, trid);
+ if (!tr_listener) {
+ SPDK_ERRLOG("Cannot find transport listener for %s\n", trid->traddr);
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ listener = calloc(1, sizeof(*listener));
+ if (!listener) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ listener->trid = &tr_listener->trid;
+ listener->transport = transport;
+ listener->cb_fn = cb_fn;
+ listener->cb_arg = cb_arg;
+ listener->subsystem = subsystem;
+
+ if (transport->ops->listen_associate != NULL) {
+ transport->ops->listen_associate(transport, subsystem, trid,
+ _nvmf_subsystem_add_listener_done,
+ listener);
+ } else {
+ _nvmf_subsystem_add_listener_done(listener, 0);
+ }
+}
+
+int
+spdk_nvmf_subsystem_remove_listener(struct spdk_nvmf_subsystem *subsystem,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_subsystem_listener *listener;
+
+ if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE ||
+ subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) {
+ return -EAGAIN;
+ }
+
+ listener = nvmf_subsystem_find_listener(subsystem, trid);
+ if (listener == NULL) {
+ return -ENOENT;
+ }
+
+ _nvmf_subsystem_remove_listener(subsystem, listener, false);
+
+ return 0;
+}
+
+void
+nvmf_subsystem_remove_all_listeners(struct spdk_nvmf_subsystem *subsystem,
+ bool stop)
+{
+ struct spdk_nvmf_subsystem_listener *listener, *listener_tmp;
+
+ TAILQ_FOREACH_SAFE(listener, &subsystem->listeners, link, listener_tmp) {
+ _nvmf_subsystem_remove_listener(subsystem, listener, stop);
+ }
+}
+
+bool
+spdk_nvmf_subsystem_listener_allowed(struct spdk_nvmf_subsystem *subsystem,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_subsystem_listener *listener;
+
+ if (!strcmp(subsystem->subnqn, SPDK_NVMF_DISCOVERY_NQN)) {
+ return true;
+ }
+
+ TAILQ_FOREACH(listener, &subsystem->listeners, link) {
+ if (spdk_nvme_transport_id_compare(listener->trid, trid) == 0) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+struct spdk_nvmf_subsystem_listener *
+spdk_nvmf_subsystem_get_first_listener(struct spdk_nvmf_subsystem *subsystem)
+{
+ return TAILQ_FIRST(&subsystem->listeners);
+}
+
+struct spdk_nvmf_subsystem_listener *
+spdk_nvmf_subsystem_get_next_listener(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_subsystem_listener *prev_listener)
+{
+ return TAILQ_NEXT(prev_listener, link);
+}
+
+const struct spdk_nvme_transport_id *
+spdk_nvmf_subsystem_listener_get_trid(struct spdk_nvmf_subsystem_listener *listener)
+{
+ return listener->trid;
+}
+
+void
+spdk_nvmf_subsystem_allow_any_listener(struct spdk_nvmf_subsystem *subsystem,
+ bool allow_any_listener)
+{
+ subsystem->allow_any_listener = allow_any_listener;
+}
+
+bool
+spdk_nvmf_subsytem_any_listener_allowed(struct spdk_nvmf_subsystem *subsystem)
+{
+ return subsystem->allow_any_listener;
+}
+
+
+struct subsystem_update_ns_ctx {
+ struct spdk_nvmf_subsystem *subsystem;
+
+ spdk_nvmf_subsystem_state_change_done cb_fn;
+ void *cb_arg;
+};
+
+static void
+subsystem_update_ns_done(struct spdk_io_channel_iter *i, int status)
+{
+ struct subsystem_update_ns_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
+
+ if (ctx->cb_fn) {
+ ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status);
+ }
+ free(ctx);
+}
+
+static void
+subsystem_update_ns_on_pg(struct spdk_io_channel_iter *i)
+{
+ int rc;
+ struct subsystem_update_ns_ctx *ctx;
+ struct spdk_nvmf_poll_group *group;
+ struct spdk_nvmf_subsystem *subsystem;
+
+ ctx = spdk_io_channel_iter_get_ctx(i);
+ group = spdk_io_channel_get_ctx(spdk_io_channel_iter_get_channel(i));
+ subsystem = ctx->subsystem;
+
+ rc = nvmf_poll_group_update_subsystem(group, subsystem);
+ spdk_for_each_channel_continue(i, rc);
+}
+
+static int
+nvmf_subsystem_update_ns(struct spdk_nvmf_subsystem *subsystem, spdk_channel_for_each_cpl cpl,
+ void *ctx)
+{
+ spdk_for_each_channel(subsystem->tgt,
+ subsystem_update_ns_on_pg,
+ ctx,
+ cpl);
+
+ return 0;
+}
+
+static void
+nvmf_subsystem_ns_changed(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid)
+{
+ struct spdk_nvmf_ctrlr *ctrlr;
+
+ TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) {
+ nvmf_ctrlr_ns_changed(ctrlr, nsid);
+ }
+}
+
+int
+spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid)
+{
+ struct spdk_nvmf_ns *ns;
+ struct spdk_nvmf_registrant *reg, *reg_tmp;
+
+ if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE ||
+ subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) {
+ assert(false);
+ return -1;
+ }
+
+ if (nsid == 0 || nsid > subsystem->max_nsid) {
+ return -1;
+ }
+
+ ns = subsystem->ns[nsid - 1];
+ if (!ns) {
+ return -1;
+ }
+
+ subsystem->ns[nsid - 1] = NULL;
+
+ TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, reg_tmp) {
+ TAILQ_REMOVE(&ns->registrants, reg, link);
+ free(reg);
+ }
+ spdk_bdev_module_release_bdev(ns->bdev);
+ spdk_bdev_close(ns->desc);
+ if (ns->ptpl_file) {
+ free(ns->ptpl_file);
+ }
+ free(ns);
+
+ nvmf_subsystem_ns_changed(subsystem, nsid);
+
+ return 0;
+}
+
+static void
+_nvmf_ns_hot_remove(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct spdk_nvmf_ns *ns = cb_arg;
+ int rc;
+
+ rc = spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to make changes to NVME-oF subsystem with id: %u\n", subsystem->id);
+ }
+
+ spdk_nvmf_subsystem_resume(subsystem, NULL, NULL);
+}
+
+static void
+nvmf_ns_hot_remove(void *remove_ctx)
+{
+ struct spdk_nvmf_ns *ns = remove_ctx;
+ int rc;
+
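+ /* Pause the subsystem first; _nvmf_ns_hot_remove removes the namespace and then resumes it. */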
+ rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_hot_remove, ns);
+ if (rc) {
+ SPDK_ERRLOG("Unable to pause subsystem to process namespace removal!\n");
+ }
+}
+
+static void
+_nvmf_ns_resize(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status)
+{
+ struct spdk_nvmf_ns *ns = cb_arg;
+
+ nvmf_subsystem_ns_changed(subsystem, ns->opts.nsid);
+ spdk_nvmf_subsystem_resume(subsystem, NULL, NULL);
+}
+
+static void
+nvmf_ns_resize(void *event_ctx)
+{
+ struct spdk_nvmf_ns *ns = event_ctx;
+ int rc;
+
+ rc = spdk_nvmf_subsystem_pause(ns->subsystem, _nvmf_ns_resize, ns);
+ if (rc) {
+ SPDK_ERRLOG("Unable to pause subsystem to process namespace resize!\n");
+ }
+}
+
+static void
+nvmf_ns_event(enum spdk_bdev_event_type type,
+ struct spdk_bdev *bdev,
+ void *event_ctx)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Bdev event: type %d, name %s, subsystem_id %d, ns_id %d\n",
+ type,
+ bdev->name,
+ ((struct spdk_nvmf_ns *)event_ctx)->subsystem->id,
+ ((struct spdk_nvmf_ns *)event_ctx)->nsid);
+
+ switch (type) {
+ case SPDK_BDEV_EVENT_REMOVE:
+ nvmf_ns_hot_remove(event_ctx);
+ break;
+ case SPDK_BDEV_EVENT_RESIZE:
+ nvmf_ns_resize(event_ctx);
+ break;
+ default:
+ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
+ break;
+ }
+}
+
+void
+spdk_nvmf_ns_opts_get_defaults(struct spdk_nvmf_ns_opts *opts, size_t opts_size)
+{
+ /* All current fields are set to 0 by default. */
+ memset(opts, 0, opts_size);
+}
+
+/* Dummy bdev module used to claim bdevs. */
+static struct spdk_bdev_module ns_bdev_module = {
+ .name = "NVMe-oF Target",
+};
+
+static int
+nvmf_ns_load_reservation(const char *file, struct spdk_nvmf_reservation_info *info);
+static int
+nvmf_ns_reservation_restore(struct spdk_nvmf_ns *ns, struct spdk_nvmf_reservation_info *info);
+
+uint32_t
+spdk_nvmf_subsystem_add_ns(struct spdk_nvmf_subsystem *subsystem, struct spdk_bdev *bdev,
+ const struct spdk_nvmf_ns_opts *user_opts, size_t opts_size,
+ const char *ptpl_file)
+{
+ struct spdk_nvmf_ns_opts opts;
+ struct spdk_nvmf_ns *ns;
+ struct spdk_nvmf_reservation_info info = {0};
+ int rc;
+
+ if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE ||
+ subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) {
+ return 0;
+ }
+
+ if (spdk_bdev_get_md_size(bdev) != 0 && !spdk_bdev_is_md_interleaved(bdev)) {
+ SPDK_ERRLOG("Can't attach bdev with separate metadata.\n");
+ return 0;
+ }
+
+ spdk_nvmf_ns_opts_get_defaults(&opts, sizeof(opts));
+ if (user_opts) {
+ memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size));
+ }
+
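+ /* If the user did not specify a namespace UUID, fall back to the bdev's UUID. */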
+ if (spdk_mem_all_zero(&opts.uuid, sizeof(opts.uuid))) {
+ opts.uuid = *spdk_bdev_get_uuid(bdev);
+ }
+
+ if (opts.nsid == SPDK_NVME_GLOBAL_NS_TAG) {
+ SPDK_ERRLOG("Invalid NSID %" PRIu32 "\n", opts.nsid);
+ return 0;
+ }
+
+ if (opts.nsid == 0) {
+ /*
+ * NSID not specified - find a free index.
+ *
+ * If no free slots are found, opts.nsid will be subsystem->max_nsid + 1, which will
+ * expand max_nsid if possible.
+ */
+ for (opts.nsid = 1; opts.nsid <= subsystem->max_nsid; opts.nsid++) {
+ if (_nvmf_subsystem_get_ns(subsystem, opts.nsid) == NULL) {
+ break;
+ }
+ }
+ }
+
+ if (_nvmf_subsystem_get_ns(subsystem, opts.nsid)) {
+ SPDK_ERRLOG("Requested NSID %" PRIu32 " already in use\n", opts.nsid);
+ return 0;
+ }
+
+ if (opts.nsid > subsystem->max_nsid) {
+ struct spdk_nvmf_ns **new_ns_array;
+
+ /* If MaxNamespaces was specified, we can't extend max_nsid beyond it. */
+ if (subsystem->max_allowed_nsid > 0 && opts.nsid > subsystem->max_allowed_nsid) {
+ SPDK_ERRLOG("Can't extend NSID range above MaxNamespaces\n");
+ return 0;
+ }
+
+ /* If a controller is connected, we can't change NN. */
+ if (!TAILQ_EMPTY(&subsystem->ctrlrs)) {
+ SPDK_ERRLOG("Can't extend NSID range while controllers are connected\n");
+ return 0;
+ }
+
+ new_ns_array = realloc(subsystem->ns, sizeof(struct spdk_nvmf_ns *) * opts.nsid);
+ if (new_ns_array == NULL) {
+ SPDK_ERRLOG("Memory allocation error while resizing namespace array.\n");
+ return 0;
+ }
+
+ memset(new_ns_array + subsystem->max_nsid, 0,
+ sizeof(struct spdk_nvmf_ns *) * (opts.nsid - subsystem->max_nsid));
+ subsystem->ns = new_ns_array;
+ subsystem->max_nsid = opts.nsid;
+ }
+
+ ns = calloc(1, sizeof(*ns));
+ if (ns == NULL) {
+ SPDK_ERRLOG("Namespace allocation failed\n");
+ return 0;
+ }
+
+ ns->bdev = bdev;
+ ns->opts = opts;
+ ns->subsystem = subsystem;
+ rc = spdk_bdev_open_ext(bdev->name, true, nvmf_ns_event, ns, &ns->desc);
+ if (rc != 0) {
+ SPDK_ERRLOG("Subsystem %s: bdev %s cannot be opened, error=%d\n",
+ subsystem->subnqn, spdk_bdev_get_name(bdev), rc);
+ free(ns);
+ return 0;
+ }
+ rc = spdk_bdev_module_claim_bdev(bdev, ns->desc, &ns_bdev_module);
+ if (rc != 0) {
+ spdk_bdev_close(ns->desc);
+ free(ns);
+ return 0;
+ }
+ subsystem->ns[opts.nsid - 1] = ns;
+ ns->nsid = opts.nsid;
+ TAILQ_INIT(&ns->registrants);
+
+ if (ptpl_file) {
+ rc = nvmf_ns_load_reservation(ptpl_file, &info);
+ if (!rc) {
+ rc = nvmf_ns_reservation_restore(ns, &info);
+ if (rc) {
+ SPDK_ERRLOG("Subsystem restore reservation failed\n");
+ subsystem->ns[opts.nsid - 1] = NULL;
+ spdk_bdev_close(ns->desc);
+ free(ns);
+ return 0;
+ }
+ }
+ ns->ptpl_file = strdup(ptpl_file);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Subsystem %s: bdev %s assigned nsid %" PRIu32 "\n",
+ spdk_nvmf_subsystem_get_nqn(subsystem),
+ spdk_bdev_get_name(bdev),
+ opts.nsid);
+
+ nvmf_subsystem_ns_changed(subsystem, opts.nsid);
+
+ return opts.nsid;
+}
+
+static uint32_t
+nvmf_subsystem_get_next_allocated_nsid(struct spdk_nvmf_subsystem *subsystem,
+ uint32_t prev_nsid)
+{
+ uint32_t nsid;
+
+ if (prev_nsid >= subsystem->max_nsid) {
+ return 0;
+ }
+
+ for (nsid = prev_nsid + 1; nsid <= subsystem->max_nsid; nsid++) {
+ if (subsystem->ns[nsid - 1]) {
+ return nsid;
+ }
+ }
+
+ return 0;
+}
+
+struct spdk_nvmf_ns *
+spdk_nvmf_subsystem_get_first_ns(struct spdk_nvmf_subsystem *subsystem)
+{
+ uint32_t first_nsid;
+
+ first_nsid = nvmf_subsystem_get_next_allocated_nsid(subsystem, 0);
+ return _nvmf_subsystem_get_ns(subsystem, first_nsid);
+}
+
+struct spdk_nvmf_ns *
+spdk_nvmf_subsystem_get_next_ns(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_ns *prev_ns)
+{
+ uint32_t next_nsid;
+
+ next_nsid = nvmf_subsystem_get_next_allocated_nsid(subsystem, prev_ns->opts.nsid);
+ return _nvmf_subsystem_get_ns(subsystem, next_nsid);
+}
+
+struct spdk_nvmf_ns *
+spdk_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid)
+{
+ return _nvmf_subsystem_get_ns(subsystem, nsid);
+}
+
+uint32_t
+spdk_nvmf_ns_get_id(const struct spdk_nvmf_ns *ns)
+{
+ return ns->opts.nsid;
+}
+
+struct spdk_bdev *
+spdk_nvmf_ns_get_bdev(struct spdk_nvmf_ns *ns)
+{
+ return ns->bdev;
+}
+
+void
+spdk_nvmf_ns_get_opts(const struct spdk_nvmf_ns *ns, struct spdk_nvmf_ns_opts *opts,
+ size_t opts_size)
+{
+ memset(opts, 0, opts_size);
+ memcpy(opts, &ns->opts, spdk_min(sizeof(ns->opts), opts_size));
+}
+
+const char *
+spdk_nvmf_subsystem_get_sn(const struct spdk_nvmf_subsystem *subsystem)
+{
+ return subsystem->sn;
+}
+
+int
+spdk_nvmf_subsystem_set_sn(struct spdk_nvmf_subsystem *subsystem, const char *sn)
+{
+ size_t len, max_len;
+
+ max_len = sizeof(subsystem->sn) - 1;
+ len = strlen(sn);
+ if (len > max_len) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Invalid sn \"%s\": length %zu > max %zu\n",
+ sn, len, max_len);
+ return -1;
+ }
+
+ if (!nvmf_valid_ascii_string(sn, len)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Non-ASCII sn\n");
+ SPDK_LOGDUMP(SPDK_LOG_NVMF, "sn", sn, len);
+ return -1;
+ }
+
+ snprintf(subsystem->sn, sizeof(subsystem->sn), "%s", sn);
+
+ return 0;
+}
+
+const char *
+spdk_nvmf_subsystem_get_mn(const struct spdk_nvmf_subsystem *subsystem)
+{
+ return subsystem->mn;
+}
+
+int
+spdk_nvmf_subsystem_set_mn(struct spdk_nvmf_subsystem *subsystem, const char *mn)
+{
+ size_t len, max_len;
+
+ if (mn == NULL) {
+ mn = MODEL_NUMBER_DEFAULT;
+ }
+ max_len = sizeof(subsystem->mn) - 1;
+ len = strlen(mn);
+ if (len > max_len) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Invalid mn \"%s\": length %zu > max %zu\n",
+ mn, len, max_len);
+ return -1;
+ }
+
+ if (!nvmf_valid_ascii_string(mn, len)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Non-ASCII mn\n");
+ SPDK_LOGDUMP(SPDK_LOG_NVMF, "mn", mn, len);
+ return -1;
+ }
+
+ snprintf(subsystem->mn, sizeof(subsystem->mn), "%s", mn);
+
+ return 0;
+}
+
+const char *
+spdk_nvmf_subsystem_get_nqn(const struct spdk_nvmf_subsystem *subsystem)
+{
+ return subsystem->subnqn;
+}
+
+enum spdk_nvmf_subtype spdk_nvmf_subsystem_get_type(struct spdk_nvmf_subsystem *subsystem)
+{
+ return subsystem->subtype;
+}
+
+uint32_t
+spdk_nvmf_subsystem_get_max_nsid(struct spdk_nvmf_subsystem *subsystem)
+{
+ return subsystem->max_nsid;
+}
+
+static uint16_t
+nvmf_subsystem_gen_cntlid(struct spdk_nvmf_subsystem *subsystem)
+{
+ int count;
+
+ /*
+ * In the worst case, we might have to try all CNTLID values between 1 and 0xFFF0 - 1
+ * before we find one that is unused (or find that all values are in use).
+ */
+ for (count = 0; count < 0xFFF0 - 1; count++) {
+ subsystem->next_cntlid++;
+ if (subsystem->next_cntlid >= 0xFFF0) {
+ /* The spec reserves cntlid values in the range FFF0h to FFFFh. */
+ subsystem->next_cntlid = 1;
+ }
+
+ /* Check if a controller with this cntlid currently exists. */
+ if (nvmf_subsystem_get_ctrlr(subsystem, subsystem->next_cntlid) == NULL) {
+ /* Found unused cntlid */
+ return subsystem->next_cntlid;
+ }
+ }
+
+ /* All valid cntlid values are in use. */
+ return 0xFFFF;
+}
+
+int
+nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_ctrlr *ctrlr)
+{
+ ctrlr->cntlid = nvmf_subsystem_gen_cntlid(subsystem);
+ if (ctrlr->cntlid == 0xFFFF) {
+ /* Unable to get a cntlid */
+ SPDK_ERRLOG("Reached max simultaneous ctrlrs\n");
+ return -EBUSY;
+ }
+
+ TAILQ_INSERT_TAIL(&subsystem->ctrlrs, ctrlr, link);
+
+ return 0;
+}
+
+void
+nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_ctrlr *ctrlr)
+{
+ assert(subsystem == ctrlr->subsys);
+ TAILQ_REMOVE(&subsystem->ctrlrs, ctrlr, link);
+}
+
+struct spdk_nvmf_ctrlr *
+nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, uint16_t cntlid)
+{
+ struct spdk_nvmf_ctrlr *ctrlr;
+
+ TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) {
+ if (ctrlr->cntlid == cntlid) {
+ return ctrlr;
+ }
+ }
+
+ return NULL;
+}
+
+uint32_t
+spdk_nvmf_subsystem_get_max_namespaces(const struct spdk_nvmf_subsystem *subsystem)
+{
+ return subsystem->max_allowed_nsid;
+}
+
+struct _nvmf_ns_registrant {
+ uint64_t rkey;
+ char *host_uuid;
+};
+
+struct _nvmf_ns_registrants {
+ size_t num_regs;
+ struct _nvmf_ns_registrant reg[SPDK_NVMF_MAX_NUM_REGISTRANTS];
+};
+
+struct _nvmf_ns_reservation {
+ bool ptpl_activated;
+ enum spdk_nvme_reservation_type rtype;
+ uint64_t crkey;
+ char *bdev_uuid;
+ char *holder_uuid;
+ struct _nvmf_ns_registrants regs;
+};
+
+static const struct spdk_json_object_decoder nvmf_ns_pr_reg_decoders[] = {
+ {"rkey", offsetof(struct _nvmf_ns_registrant, rkey), spdk_json_decode_uint64},
+ {"host_uuid", offsetof(struct _nvmf_ns_registrant, host_uuid), spdk_json_decode_string},
+};
+
+static int
+nvmf_decode_ns_pr_reg(const struct spdk_json_val *val, void *out)
+{
+ struct _nvmf_ns_registrant *reg = out;
+
+ return spdk_json_decode_object(val, nvmf_ns_pr_reg_decoders,
+ SPDK_COUNTOF(nvmf_ns_pr_reg_decoders), reg);
+}
+
+static int
+nvmf_decode_ns_pr_regs(const struct spdk_json_val *val, void *out)
+{
+ struct _nvmf_ns_registrants *regs = out;
+
+ return spdk_json_decode_array(val, nvmf_decode_ns_pr_reg, regs->reg,
+ SPDK_NVMF_MAX_NUM_REGISTRANTS, &regs->num_regs,
+ sizeof(struct _nvmf_ns_registrant));
+}
+
+static const struct spdk_json_object_decoder nvmf_ns_pr_decoders[] = {
+ {"ptpl", offsetof(struct _nvmf_ns_reservation, ptpl_activated), spdk_json_decode_bool, true},
+ {"rtype", offsetof(struct _nvmf_ns_reservation, rtype), spdk_json_decode_uint32, true},
+ {"crkey", offsetof(struct _nvmf_ns_reservation, crkey), spdk_json_decode_uint64, true},
+ {"bdev_uuid", offsetof(struct _nvmf_ns_reservation, bdev_uuid), spdk_json_decode_string},
+ {"holder_uuid", offsetof(struct _nvmf_ns_reservation, holder_uuid), spdk_json_decode_string, true},
+ {"registrants", offsetof(struct _nvmf_ns_reservation, regs), nvmf_decode_ns_pr_regs},
+};
+
+static int
+nvmf_ns_load_reservation(const char *file, struct spdk_nvmf_reservation_info *info)
+{
+ FILE *fd;
+ size_t json_size;
+ ssize_t values_cnt, rc;
+ void *json = NULL, *end;
+ struct spdk_json_val *values = NULL;
+ struct _nvmf_ns_reservation res = {};
+ uint32_t i;
+
+ fd = fopen(file, "r");
+ /* It's not an error if the file does not exist */
+ if (!fd) {
+ SPDK_NOTICELOG("File %s does not exist\n", file);
+ return -ENOENT;
+ }
+
+ /* Load all persist file contents into a local buffer */
+ json = spdk_posix_file_load(fd, &json_size);
+ fclose(fd);
+ if (!json) {
+ SPDK_ERRLOG("Load persit file %s failed\n", file);
+ return -ENOMEM;
+ }
+
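+ /* First pass: count the JSON values so the values array can be sized before the real parse. */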
+ rc = spdk_json_parse(json, json_size, NULL, 0, &end, 0);
+ if (rc < 0) {
+ SPDK_NOTICELOG("Parsing JSON configuration failed (%zd)\n", rc);
+ goto exit;
+ }
+
+ values_cnt = rc;
+ values = calloc(values_cnt, sizeof(struct spdk_json_val));
+ if (values == NULL) {
+ rc = -ENOMEM;
+ goto exit;
+ }
+
+ rc = spdk_json_parse(json, json_size, values, values_cnt, &end, 0);
+ if (rc != values_cnt) {
+ SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc);
+ goto exit;
+ }
+
+ /* Decode json */
+ if (spdk_json_decode_object(values, nvmf_ns_pr_decoders,
+ SPDK_COUNTOF(nvmf_ns_pr_decoders),
+ &res)) {
+ SPDK_ERRLOG("Invalid objects in the persist file %s\n", file);
+ rc = -EINVAL;
+ goto exit;
+ }
+
+ if (res.regs.num_regs > SPDK_NVMF_MAX_NUM_REGISTRANTS) {
+ SPDK_ERRLOG("Can only support up to %u registrants\n", SPDK_NVMF_MAX_NUM_REGISTRANTS);
+ rc = -ERANGE;
+ goto exit;
+ }
+
+ rc = 0;
+ info->ptpl_activated = res.ptpl_activated;
+ info->rtype = res.rtype;
+ info->crkey = res.crkey;
+ snprintf(info->bdev_uuid, sizeof(info->bdev_uuid), "%s", res.bdev_uuid);
+ snprintf(info->holder_uuid, sizeof(info->holder_uuid), "%s", res.holder_uuid);
+ info->num_regs = res.regs.num_regs;
+ for (i = 0; i < res.regs.num_regs; i++) {
+ info->registrants[i].rkey = res.regs.reg[i].rkey;
+ snprintf(info->registrants[i].host_uuid, sizeof(info->registrants[i].host_uuid), "%s",
+ res.regs.reg[i].host_uuid);
+ }
+
+exit:
+ free(json);
+ free(values);
+ free(res.bdev_uuid);
+ free(res.holder_uuid);
+ for (i = 0; i < res.regs.num_regs; i++) {
+ free(res.regs.reg[i].host_uuid);
+ }
+
+ return rc;
+}
+
+static bool
+nvmf_ns_reservation_all_registrants_type(struct spdk_nvmf_ns *ns);
+
+static int
+nvmf_ns_reservation_restore(struct spdk_nvmf_ns *ns, struct spdk_nvmf_reservation_info *info)
+{
+ uint32_t i;
+ struct spdk_nvmf_registrant *reg, *holder = NULL;
+ struct spdk_uuid bdev_uuid, holder_uuid;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "NSID %u, PTPL %u, Number of registrants %u\n",
+ ns->nsid, info->ptpl_activated, info->num_regs);
+
+ /* It's not an error if PTPL is not activated or there are no registrants to restore */
+ if (!info->ptpl_activated || !info->num_regs) {
+ return 0;
+ }
+
+ spdk_uuid_parse(&bdev_uuid, info->bdev_uuid);
+ if (spdk_uuid_compare(&bdev_uuid, spdk_bdev_get_uuid(ns->bdev))) {
+ SPDK_ERRLOG("Existing bdev UUID is not same with configuration file\n");
+ return -EINVAL;
+ }
+
+ ns->crkey = info->crkey;
+ ns->rtype = info->rtype;
+ ns->ptpl_activated = info->ptpl_activated;
+ spdk_uuid_parse(&holder_uuid, info->holder_uuid);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Bdev UUID %s\n", info->bdev_uuid);
+ if (info->rtype) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Holder UUID %s, RTYPE %u, RKEY 0x%"PRIx64"\n",
+ info->holder_uuid, info->rtype, info->crkey);
+ }
+
+ for (i = 0; i < info->num_regs; i++) {
+ reg = calloc(1, sizeof(*reg));
+ if (!reg) {
+ return -ENOMEM;
+ }
+ spdk_uuid_parse(&reg->hostid, info->registrants[i].host_uuid);
+ reg->rkey = info->registrants[i].rkey;
+ TAILQ_INSERT_TAIL(&ns->registrants, reg, link);
+ if (!spdk_uuid_compare(&holder_uuid, &reg->hostid)) {
+ holder = reg;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Registrant RKEY 0x%"PRIx64", Host UUID %s\n",
+ info->registrants[i].rkey, info->registrants[i].host_uuid);
+ }
+
+ if (nvmf_ns_reservation_all_registrants_type(ns)) {
+ ns->holder = TAILQ_FIRST(&ns->registrants);
+ } else {
+ ns->holder = holder;
+ }
+
+ return 0;
+}
+
+static int
+nvmf_ns_json_write_cb(void *cb_ctx, const void *data, size_t size)
+{
+ char *file = cb_ctx;
+ size_t rc;
+ FILE *fd;
+
+ fd = fopen(file, "w");
+ if (!fd) {
+ SPDK_ERRLOG("Can't open file %s for write\n", file);
+ return -ENOENT;
+ }
+ rc = fwrite(data, 1, size, fd);
+ fclose(fd);
+
+ return rc == size ? 0 : -1;
+}
+
+static int
+nvmf_ns_reservation_update(const char *file, struct spdk_nvmf_reservation_info *info)
+{
+ struct spdk_json_write_ctx *w;
+ uint32_t i;
+ int rc = 0;
+
+ w = spdk_json_write_begin(nvmf_ns_json_write_cb, (void *)file, 0);
+ if (w == NULL) {
+ return -ENOMEM;
+ }
+ /* clear the configuration file */
+ if (!info->ptpl_activated) {
+ goto exit;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_bool(w, "ptpl", info->ptpl_activated);
+ spdk_json_write_named_uint32(w, "rtype", info->rtype);
+ spdk_json_write_named_uint64(w, "crkey", info->crkey);
+ spdk_json_write_named_string(w, "bdev_uuid", info->bdev_uuid);
+ spdk_json_write_named_string(w, "holder_uuid", info->holder_uuid);
+
+ spdk_json_write_named_array_begin(w, "registrants");
+ for (i = 0; i < info->num_regs; i++) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_uint64(w, "rkey", info->registrants[i].rkey);
+ spdk_json_write_named_string(w, "host_uuid", info->registrants[i].host_uuid);
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+ spdk_json_write_object_end(w);
+
+exit:
+ rc = spdk_json_write_end(w);
+ return rc;
+}
+
+static int
+nvmf_ns_update_reservation_info(struct spdk_nvmf_ns *ns)
+{
+ struct spdk_nvmf_reservation_info info;
+ struct spdk_nvmf_registrant *reg, *tmp;
+ uint32_t i = 0;
+
+ assert(ns != NULL);
+
+ if (!ns->bdev || !ns->ptpl_file) {
+ return 0;
+ }
+
+ memset(&info, 0, sizeof(info));
+ spdk_uuid_fmt_lower(info.bdev_uuid, sizeof(info.bdev_uuid), spdk_bdev_get_uuid(ns->bdev));
+
+ if (ns->rtype) {
+ info.rtype = ns->rtype;
+ info.crkey = ns->crkey;
+ if (!nvmf_ns_reservation_all_registrants_type(ns)) {
+ assert(ns->holder != NULL);
+ spdk_uuid_fmt_lower(info.holder_uuid, sizeof(info.holder_uuid), &ns->holder->hostid);
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) {
+ spdk_uuid_fmt_lower(info.registrants[i].host_uuid, sizeof(info.registrants[i].host_uuid),
+ &reg->hostid);
+ info.registrants[i++].rkey = reg->rkey;
+ }
+
+ info.num_regs = i;
+ info.ptpl_activated = ns->ptpl_activated;
+
+ return nvmf_ns_reservation_update(ns->ptpl_file, &info);
+}
+
+static struct spdk_nvmf_registrant *
+nvmf_ns_reservation_get_registrant(struct spdk_nvmf_ns *ns,
+ struct spdk_uuid *uuid)
+{
+ struct spdk_nvmf_registrant *reg, *tmp;
+
+ TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) {
+ if (!spdk_uuid_compare(&reg->hostid, uuid)) {
+ return reg;
+ }
+ }
+
+ return NULL;
+}
+
+/* Generate reservation notice log to registered HostID controllers */
+static void
+nvmf_subsystem_gen_ctrlr_notification(struct spdk_nvmf_subsystem *subsystem,
+ struct spdk_nvmf_ns *ns,
+ struct spdk_uuid *hostid_list,
+ uint32_t num_hostid,
+ enum spdk_nvme_reservation_notification_log_page_type type)
+{
+ struct spdk_nvmf_ctrlr *ctrlr;
+ uint32_t i;
+
+ for (i = 0; i < num_hostid; i++) {
+ TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) {
+ if (!spdk_uuid_compare(&ctrlr->hostid, &hostid_list[i])) {
+ nvmf_ctrlr_reservation_notice_log(ctrlr, ns, type);
+ }
+ }
+ }
+}
+
+/* Get all registrants' hostid other than the controller who issued the command */
+static uint32_t
+nvmf_ns_reservation_get_all_other_hostid(struct spdk_nvmf_ns *ns,
+ struct spdk_uuid *hostid_list,
+ uint32_t max_num_hostid,
+ struct spdk_uuid *current_hostid)
+{
+ struct spdk_nvmf_registrant *reg, *tmp;
+ uint32_t num_hostid = 0;
+
+ TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) {
+ if (spdk_uuid_compare(&reg->hostid, current_hostid)) {
+ if (num_hostid == max_num_hostid) {
+ assert(false);
+ return max_num_hostid;
+ }
+ hostid_list[num_hostid++] = reg->hostid;
+ }
+ }
+
+ return num_hostid;
+}
+
+/* Calculate the list of unregistered HostIDs by comparing the list
+ * prior to executing the preempt command with the list remaining
+ * after it has been executed.
+ */
+static uint32_t
+nvmf_ns_reservation_get_unregistered_hostid(struct spdk_uuid *old_hostid_list,
+ uint32_t old_num_hostid,
+ struct spdk_uuid *remaining_hostid_list,
+ uint32_t remaining_num_hostid)
+{
+ struct spdk_uuid temp_hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS];
+ uint32_t i, j, num_hostid = 0;
+ bool found;
+
+ if (!remaining_num_hostid) {
+ return old_num_hostid;
+ }
+
+ for (i = 0; i < old_num_hostid; i++) {
+ found = false;
+ for (j = 0; j < remaining_num_hostid; j++) {
+ if (!spdk_uuid_compare(&old_hostid_list[i], &remaining_hostid_list[j])) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ spdk_uuid_copy(&temp_hostid_list[num_hostid++], &old_hostid_list[i]);
+ }
+ }
+
+ if (num_hostid) {
+ memcpy(old_hostid_list, temp_hostid_list, sizeof(struct spdk_uuid) * num_hostid);
+ }
+
+ return num_hostid;
+}
+
+/* Check whether the current reservation type is an all-registrants type */
+static bool
+nvmf_ns_reservation_all_registrants_type(struct spdk_nvmf_ns *ns)
+{
+ return (ns->rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_ALL_REGS ||
+ ns->rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS);
+}
+
+/* Check whether the given registrant is the reservation holder */
+static bool
+nvmf_ns_reservation_registrant_is_holder(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_registrant *reg)
+{
+ if (!reg) {
+ return false;
+ }
+
+ if (nvmf_ns_reservation_all_registrants_type(ns)) {
+ return true;
+ }
+
+ return (ns->holder == reg);
+}
+
+static int
+nvmf_ns_reservation_add_registrant(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_ctrlr *ctrlr,
+ uint64_t nrkey)
+{
+ struct spdk_nvmf_registrant *reg;
+
+ reg = calloc(1, sizeof(*reg));
+ if (!reg) {
+ return -ENOMEM;
+ }
+
+ reg->rkey = nrkey;
+ /* set hostid for the registrant */
+ spdk_uuid_copy(&reg->hostid, &ctrlr->hostid);
+ TAILQ_INSERT_TAIL(&ns->registrants, reg, link);
+ ns->gen++;
+
+ return 0;
+}
+
+static void
+nvmf_ns_reservation_release_reservation(struct spdk_nvmf_ns *ns)
+{
+ ns->rtype = 0;
+ ns->crkey = 0;
+ ns->holder = NULL;
+}
+
+/* When a registrant is removed, transfer holdership (for all-registrants types) or release the reservation if the holder was removed */
+static void
+nvmf_ns_reservation_check_release_on_remove_registrant(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_registrant *reg)
+{
+ struct spdk_nvmf_registrant *next_reg;
+
+ /* no reservation holder */
+ if (!ns->holder) {
+ assert(ns->rtype == 0);
+ return;
+ }
+
+ next_reg = TAILQ_FIRST(&ns->registrants);
+ if (next_reg && nvmf_ns_reservation_all_registrants_type(ns)) {
+ /* the next valid registrant is the new holder now */
+ ns->holder = next_reg;
+ } else if (nvmf_ns_reservation_registrant_is_holder(ns, reg)) {
+ /* release the reservation */
+ nvmf_ns_reservation_release_reservation(ns);
+ }
+}
+
+static void
+nvmf_ns_reservation_remove_registrant(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_registrant *reg)
+{
+ TAILQ_REMOVE(&ns->registrants, reg, link);
+ nvmf_ns_reservation_check_release_on_remove_registrant(ns, reg);
+ free(reg);
+ ns->gen++;
+ return;
+}
+
+static uint32_t
+nvmf_ns_reservation_remove_registrants_by_key(struct spdk_nvmf_ns *ns,
+ uint64_t rkey)
+{
+ struct spdk_nvmf_registrant *reg, *tmp;
+ uint32_t count = 0;
+
+ TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) {
+ if (reg->rkey == rkey) {
+ nvmf_ns_reservation_remove_registrant(ns, reg);
+ count++;
+ }
+ }
+ return count;
+}
+
+static uint32_t
+nvmf_ns_reservation_remove_all_other_registrants(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_registrant *reg)
+{
+ struct spdk_nvmf_registrant *reg_tmp, *reg_tmp2;
+ uint32_t count = 0;
+
+ TAILQ_FOREACH_SAFE(reg_tmp, &ns->registrants, link, reg_tmp2) {
+ if (reg_tmp != reg) {
+ nvmf_ns_reservation_remove_registrant(ns, reg_tmp);
+ count++;
+ }
+ }
+ return count;
+}
+
+static uint32_t
+nvmf_ns_reservation_clear_all_registrants(struct spdk_nvmf_ns *ns)
+{
+ struct spdk_nvmf_registrant *reg, *reg_tmp;
+ uint32_t count = 0;
+
+ TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, reg_tmp) {
+ nvmf_ns_reservation_remove_registrant(ns, reg);
+ count++;
+ }
+ return count;
+}
+
+static void
+nvmf_ns_reservation_acquire_reservation(struct spdk_nvmf_ns *ns, uint64_t rkey,
+ enum spdk_nvme_reservation_type rtype,
+ struct spdk_nvmf_registrant *holder)
+{
+ ns->rtype = rtype;
+ ns->crkey = rkey;
+ assert(ns->holder == NULL);
+ ns->holder = holder;
+}
+
+static bool
+nvmf_ns_reservation_register(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ uint8_t rrega, iekey, cptpl, rtype;
+ struct spdk_nvme_reservation_register_data key;
+ struct spdk_nvmf_registrant *reg;
+ uint8_t status = SPDK_NVME_SC_SUCCESS;
+ bool update_sgroup = false;
+ struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS];
+ uint32_t num_hostid = 0;
+ int rc;
+
+ rrega = cmd->cdw10_bits.resv_register.rrega;
+ iekey = cmd->cdw10_bits.resv_register.iekey;
+ cptpl = cmd->cdw10_bits.resv_register.cptpl;
+
+ if (req->data && req->length >= sizeof(key)) {
+ memcpy(&key, req->data, sizeof(key));
+ } else {
+ SPDK_ERRLOG("No key provided. Failing request.\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ goto exit;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "REGISTER: RREGA %u, IEKEY %u, CPTPL %u, "
+ "NRKEY 0x%"PRIx64", NRKEY 0x%"PRIx64"\n",
+ rrega, iekey, cptpl, key.crkey, key.nrkey);
+
+ if (cptpl == SPDK_NVME_RESERVE_PTPL_CLEAR_POWER_ON) {
+ /* Turn to OFF state; the change needs to be persisted in the configuration file */
+ if (ns->ptpl_activated) {
+ ns->ptpl_activated = 0;
+ update_sgroup = true;
+ }
+ } else if (cptpl == SPDK_NVME_RESERVE_PTPL_PERSIST_POWER_LOSS) {
+ if (ns->ptpl_file == NULL) {
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ goto exit;
+ } else if (ns->ptpl_activated == 0) {
+ ns->ptpl_activated = 1;
+ update_sgroup = true;
+ }
+ }
+
+ /* Check whether the current Host Identifier already has a registrant */
+ reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid);
+
+ switch (rrega) {
+ case SPDK_NVME_RESERVE_REGISTER_KEY:
+ if (!reg) {
+ /* register new controller */
+ if (key.nrkey == 0) {
+ SPDK_ERRLOG("Can't register zeroed new key\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ goto exit;
+ }
+ rc = nvmf_ns_reservation_add_registrant(ns, ctrlr, key.nrkey);
+ if (rc < 0) {
+ status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ goto exit;
+ }
+ update_sgroup = true;
+ } else {
+ /* register with same key is not an error */
+ if (reg->rkey != key.nrkey) {
+ SPDK_ERRLOG("The same host already register a "
+ "key with 0x%"PRIx64"\n",
+ reg->rkey);
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ goto exit;
+ }
+ }
+ break;
+ case SPDK_NVME_RESERVE_UNREGISTER_KEY:
+ if (!reg || (!iekey && reg->rkey != key.crkey)) {
+ SPDK_ERRLOG("No registrant or current key doesn't match "
+ "with existing registrant key\n");
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ goto exit;
+ }
+
+ rtype = ns->rtype;
+ num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list,
+ SPDK_NVMF_MAX_NUM_REGISTRANTS,
+ &ctrlr->hostid);
+
+ nvmf_ns_reservation_remove_registrant(ns, reg);
+
+ if (!ns->rtype && num_hostid && (rtype == SPDK_NVME_RESERVE_WRITE_EXCLUSIVE_REG_ONLY ||
+ rtype == SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_REG_ONLY)) {
+ nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns,
+ hostid_list,
+ num_hostid,
+ SPDK_NVME_RESERVATION_RELEASED);
+ }
+ update_sgroup = true;
+ break;
+ case SPDK_NVME_RESERVE_REPLACE_KEY:
+ if (!reg || (!iekey && reg->rkey != key.crkey)) {
+ SPDK_ERRLOG("No registrant or current key doesn't match "
+ "with existing registrant key\n");
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ goto exit;
+ }
+ if (key.nrkey == 0) {
+ SPDK_ERRLOG("Can't register zeroed new key\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ goto exit;
+ }
+ reg->rkey = key.nrkey;
+ update_sgroup = true;
+ break;
+ default:
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ goto exit;
+ }
+
+exit:
+ if (update_sgroup) {
+ rc = nvmf_ns_update_reservation_info(ns);
+ if (rc != 0) {
+ status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ }
+ }
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = status;
+ return update_sgroup;
+}
+
+static bool
+nvmf_ns_reservation_acquire(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ uint8_t racqa, iekey, rtype;
+ struct spdk_nvme_reservation_acquire_data key;
+ struct spdk_nvmf_registrant *reg;
+ bool all_regs = false;
+ uint32_t count = 0;
+ bool update_sgroup = true;
+ struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS];
+ uint32_t num_hostid = 0;
+ struct spdk_uuid new_hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS];
+ uint32_t new_num_hostid = 0;
+ bool reservation_released = false;
+ uint8_t status = SPDK_NVME_SC_SUCCESS;
+
+ racqa = cmd->cdw10_bits.resv_acquire.racqa;
+ iekey = cmd->cdw10_bits.resv_acquire.iekey;
+ rtype = cmd->cdw10_bits.resv_acquire.rtype;
+
+ if (req->data && req->length >= sizeof(key)) {
+ memcpy(&key, req->data, sizeof(key));
+ } else {
+ SPDK_ERRLOG("No key provided. Failing request.\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ goto exit;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ACQUIRE: RACQA %u, IEKEY %u, RTYPE %u, "
+ "NRKEY 0x%"PRIx64", PRKEY 0x%"PRIx64"\n",
+ racqa, iekey, rtype, key.crkey, key.prkey);
+
+ if (iekey || rtype > SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS_ALL_REGS) {
+ SPDK_ERRLOG("Ignore existing key field set to 1\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ update_sgroup = false;
+ goto exit;
+ }
+
+ reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid);
+ /* must be registrant and CRKEY must match */
+ if (!reg || reg->rkey != key.crkey) {
+ SPDK_ERRLOG("No registrant or current key doesn't match "
+ "with existing registrant key\n");
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ update_sgroup = false;
+ goto exit;
+ }
+
+ all_regs = nvmf_ns_reservation_all_registrants_type(ns);
+
+ switch (racqa) {
+ case SPDK_NVME_RESERVE_ACQUIRE:
+ /* it's not an error for the holder to acquire the same reservation type again */
+ if (nvmf_ns_reservation_registrant_is_holder(ns, reg) && ns->rtype == rtype) {
+ /* do nothing */
+ update_sgroup = false;
+ } else if (ns->holder == NULL) {
+ /* first time acquiring the reservation */
+ nvmf_ns_reservation_acquire_reservation(ns, key.crkey, rtype, reg);
+ } else {
+ SPDK_ERRLOG("Invalid rtype or current registrant is not holder\n");
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ update_sgroup = false;
+ goto exit;
+ }
+ break;
+ case SPDK_NVME_RESERVE_PREEMPT:
+ /* no reservation holder */
+ if (!ns->holder) {
+ /* unregister with PRKEY */
+ nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey);
+ break;
+ }
+ num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list,
+ SPDK_NVMF_MAX_NUM_REGISTRANTS,
+ &ctrlr->hostid);
+
+ /* only 1 reservation holder and reservation key is valid */
+ if (!all_regs) {
+ /* preempt itself */
+ if (nvmf_ns_reservation_registrant_is_holder(ns, reg) &&
+ ns->crkey == key.prkey) {
+ ns->rtype = rtype;
+ reservation_released = true;
+ break;
+ }
+
+ if (ns->crkey == key.prkey) {
+ nvmf_ns_reservation_remove_registrant(ns, ns->holder);
+ nvmf_ns_reservation_acquire_reservation(ns, key.crkey, rtype, reg);
+ reservation_released = true;
+ } else if (key.prkey != 0) {
+ nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey);
+ } else {
+ /* PRKEY is zero */
+ SPDK_ERRLOG("Current PRKEY is zero\n");
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ update_sgroup = false;
+ goto exit;
+ }
+ } else {
+ /* release all other registrants except for the current one */
+ if (key.prkey == 0) {
+ nvmf_ns_reservation_remove_all_other_registrants(ns, reg);
+ assert(ns->holder == reg);
+ } else {
+ count = nvmf_ns_reservation_remove_registrants_by_key(ns, key.prkey);
+ if (count == 0) {
+ SPDK_ERRLOG("PRKEY doesn't match any registrant\n");
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ update_sgroup = false;
+ goto exit;
+ }
+ }
+ }
+ break;
+ default:
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ update_sgroup = false;
+ break;
+ }
+
+exit:
+ if (update_sgroup && racqa == SPDK_NVME_RESERVE_PREEMPT) {
+ new_num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, new_hostid_list,
+ SPDK_NVMF_MAX_NUM_REGISTRANTS,
+ &ctrlr->hostid);
+ /* Preempt notification occurs on the controllers that were
+ * unregistered, other than the controller that issued the command.
+ */
+ num_hostid = nvmf_ns_reservation_get_unregistered_hostid(hostid_list,
+ num_hostid,
+ new_hostid_list,
+ new_num_hostid);
+ if (num_hostid) {
+ nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns,
+ hostid_list,
+ num_hostid,
+ SPDK_NVME_REGISTRATION_PREEMPTED);
+
+ }
+ /* Reservation released notification occurs on the remaining
+ * registrant controllers, other than the controller that
+ * issued the command.
+ */
+ if (reservation_released && new_num_hostid) {
+ nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns,
+ new_hostid_list,
+ new_num_hostid,
+ SPDK_NVME_RESERVATION_RELEASED);
+
+ }
+ }
+ if (update_sgroup && ns->ptpl_activated) {
+ if (nvmf_ns_update_reservation_info(ns)) {
+ status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ }
+ }
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = status;
+ return update_sgroup;
+}
+
+static bool
+nvmf_ns_reservation_release(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ uint8_t rrela, iekey, rtype;
+ struct spdk_nvmf_registrant *reg;
+ uint64_t crkey;
+ uint8_t status = SPDK_NVME_SC_SUCCESS;
+ bool update_sgroup = true;
+ struct spdk_uuid hostid_list[SPDK_NVMF_MAX_NUM_REGISTRANTS];
+ uint32_t num_hostid = 0;
+
+ rrela = cmd->cdw10_bits.resv_release.rrela;
+ iekey = cmd->cdw10_bits.resv_release.iekey;
+ rtype = cmd->cdw10_bits.resv_release.rtype;
+
+ if (req->data && req->length >= sizeof(crkey)) {
+ memcpy(&crkey, req->data, sizeof(crkey));
+ } else {
+ SPDK_ERRLOG("No key provided. Failing request.\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ goto exit;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "RELEASE: RRELA %u, IEKEY %u, RTYPE %u, "
+ "CRKEY 0x%"PRIx64"\n", rrela, iekey, rtype, crkey);
+
+ if (iekey) {
+ SPDK_ERRLOG("Ignore existing key field set to 1\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ update_sgroup = false;
+ goto exit;
+ }
+
+ reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr->hostid);
+ if (!reg || reg->rkey != crkey) {
+ SPDK_ERRLOG("No registrant or current key doesn't match "
+ "with existing registrant key\n");
+ status = SPDK_NVME_SC_RESERVATION_CONFLICT;
+ update_sgroup = false;
+ goto exit;
+ }
+
+ num_hostid = nvmf_ns_reservation_get_all_other_hostid(ns, hostid_list,
+ SPDK_NVMF_MAX_NUM_REGISTRANTS,
+ &ctrlr->hostid);
+
+ switch (rrela) {
+ case SPDK_NVME_RESERVE_RELEASE:
+ if (!ns->holder) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF, "RELEASE: no holder\n");
+ update_sgroup = false;
+ goto exit;
+ }
+ if (ns->rtype != rtype) {
+ SPDK_ERRLOG("Type doesn't match\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ update_sgroup = false;
+ goto exit;
+ }
+ if (!nvmf_ns_reservation_registrant_is_holder(ns, reg)) {
+ /* not the reservation holder, this isn't an error */
+ update_sgroup = false;
+ goto exit;
+ }
+
+ rtype = ns->rtype;
+ nvmf_ns_reservation_release_reservation(ns);
+
+ if (num_hostid && rtype != SPDK_NVME_RESERVE_WRITE_EXCLUSIVE &&
+ rtype != SPDK_NVME_RESERVE_EXCLUSIVE_ACCESS) {
+ nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns,
+ hostid_list,
+ num_hostid,
+ SPDK_NVME_RESERVATION_RELEASED);
+ }
+ break;
+ case SPDK_NVME_RESERVE_CLEAR:
+ nvmf_ns_reservation_clear_all_registrants(ns);
+ if (num_hostid) {
+ nvmf_subsystem_gen_ctrlr_notification(ns->subsystem, ns,
+ hostid_list,
+ num_hostid,
+ SPDK_NVME_RESERVATION_PREEMPTED);
+ }
+ break;
+ default:
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ update_sgroup = false;
+ goto exit;
+ }
+
+exit:
+ if (update_sgroup && ns->ptpl_activated) {
+ if (nvmf_ns_update_reservation_info(ns)) {
+ status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ }
+ }
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = status;
+ return update_sgroup;
+}
+
+static void
+nvmf_ns_reservation_report(struct spdk_nvmf_ns *ns,
+ struct spdk_nvmf_ctrlr *ctrlr,
+ struct spdk_nvmf_request *req)
+{
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys;
+ struct spdk_nvmf_ctrlr *ctrlr_tmp;
+ struct spdk_nvmf_registrant *reg, *tmp;
+ struct spdk_nvme_reservation_status_extended_data *status_data;
+ struct spdk_nvme_registered_ctrlr_extended_data *ctrlr_data;
+ uint8_t *payload;
+ uint32_t len, count = 0;
+ uint32_t regctl = 0;
+ uint8_t status = SPDK_NVME_SC_SUCCESS;
+
+ if (req->data == NULL) {
+ SPDK_ERRLOG("No data transfer specified for request. "
+ " Unable to transfer back response.\n");
+ status = SPDK_NVME_SC_INVALID_FIELD;
+ goto exit;
+ }
+
+ if (!cmd->cdw11_bits.resv_report.eds) {
+ SPDK_ERRLOG("NVMeoF uses extended controller data structure, "
+ "please set EDS bit in cdw11 and try again\n");
+ status = SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT;
+ goto exit;
+ }
+
+ /* Get the number of registered controllers; one host may have
+ * more than one controller based on different ports.
+ */
+ TAILQ_FOREACH(ctrlr_tmp, &subsystem->ctrlrs, link) {
+ reg = nvmf_ns_reservation_get_registrant(ns, &ctrlr_tmp->hostid);
+ if (reg) {
+ regctl++;
+ }
+ }
+
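+ /* The payload is one extended reservation status data structure followed by one extended registered controller data structure per registered controller. */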
+ len = sizeof(*status_data) + sizeof(*ctrlr_data) * regctl;
+ payload = calloc(1, len);
+ if (!payload) {
+ status = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ goto exit;
+ }
+
+ status_data = (struct spdk_nvme_reservation_status_extended_data *)payload;
+ status_data->data.gen = ns->gen;
+ status_data->data.rtype = ns->rtype;
+ status_data->data.regctl = regctl;
+ status_data->data.ptpls = ns->ptpl_activated;
+
+ TAILQ_FOREACH_SAFE(reg, &ns->registrants, link, tmp) {
+ assert(count <= regctl);
+ ctrlr_data = (struct spdk_nvme_registered_ctrlr_extended_data *)
+ (payload + sizeof(*status_data) + sizeof(*ctrlr_data) * count);
+ /* Set to 0xFFFF for a dynamic controller */
+ ctrlr_data->cntlid = 0xffff;
+ ctrlr_data->rcsts.status = (ns->holder == reg) ? true : false;
+ ctrlr_data->rkey = reg->rkey;
+ spdk_uuid_copy((struct spdk_uuid *)ctrlr_data->hostid, &reg->hostid);
+ count++;
+ }
+
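+ /* Copy back at most (CDW10 + 1) dwords, as requested by the host. */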
+ memcpy(req->data, payload, spdk_min(len, (cmd->cdw10 + 1) * sizeof(uint32_t)));
+ free(payload);
+
+exit:
+ req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ req->rsp->nvme_cpl.status.sc = status;
+ return;
+}
+
+static void
+nvmf_ns_reservation_complete(void *ctx)
+{
+ struct spdk_nvmf_request *req = ctx;
+
+ spdk_nvmf_request_complete(req);
+}
+
+static void
+_nvmf_ns_reservation_update_done(struct spdk_nvmf_subsystem *subsystem,
+ void *cb_arg, int status)
+{
+ struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)cb_arg;
+ struct spdk_nvmf_poll_group *group = req->qpair->group;
+
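+ /* Complete the request on its poll group's thread. */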
+ spdk_thread_send_msg(group->thread, nvmf_ns_reservation_complete, req);
+}
+
+void
+nvmf_ns_reservation_request(void *ctx)
+{
+ struct spdk_nvmf_request *req = (struct spdk_nvmf_request *)ctx;
+ struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd;
+ struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr;
+ struct subsystem_update_ns_ctx *update_ctx;
+ uint32_t nsid;
+ struct spdk_nvmf_ns *ns;
+ bool update_sgroup = false;
+
+ nsid = cmd->nsid;
+ ns = _nvmf_subsystem_get_ns(ctrlr->subsys, nsid);
+ assert(ns != NULL);
+
+ switch (cmd->opc) {
+ case SPDK_NVME_OPC_RESERVATION_REGISTER:
+ update_sgroup = nvmf_ns_reservation_register(ns, ctrlr, req);
+ break;
+ case SPDK_NVME_OPC_RESERVATION_ACQUIRE:
+ update_sgroup = nvmf_ns_reservation_acquire(ns, ctrlr, req);
+ break;
+ case SPDK_NVME_OPC_RESERVATION_RELEASE:
+ update_sgroup = nvmf_ns_reservation_release(ns, ctrlr, req);
+ break;
+ case SPDK_NVME_OPC_RESERVATION_REPORT:
+ nvmf_ns_reservation_report(ns, ctrlr, req);
+ break;
+ default:
+ break;
+ }
+
+ /* update reservation information to subsystem's poll group */
+ if (update_sgroup) {
+ update_ctx = calloc(1, sizeof(*update_ctx));
+ if (update_ctx == NULL) {
+ SPDK_ERRLOG("Can't alloc subsystem poll group update context\n");
+ goto update_done;
+ }
+ update_ctx->subsystem = ctrlr->subsys;
+ update_ctx->cb_fn = _nvmf_ns_reservation_update_done;
+ update_ctx->cb_arg = req;
+
+ nvmf_subsystem_update_ns(ctrlr->subsys, subsystem_update_ns_done, update_ctx);
+ return;
+ }
+
+update_done:
+ _nvmf_ns_reservation_update_done(ctrlr->subsys, (void *)req, 0);
+}
diff --git a/src/spdk/lib/nvmf/tcp.c b/src/spdk/lib/nvmf/tcp.c
new file mode 100644
index 000000000..391d4bcf1
--- /dev/null
+++ b/src/spdk/lib/nvmf/tcp.c
@@ -0,0 +1,2631 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/crc32.h"
+#include "spdk/endian.h"
+#include "spdk/assert.h"
+#include "spdk/thread.h"
+#include "spdk/nvmf_transport.h"
+#include "spdk/sock.h"
+#include "spdk/string.h"
+#include "spdk/trace.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/assert.h"
+#include "spdk_internal/log.h"
+#include "spdk_internal/nvme_tcp.h"
+
+#include "nvmf_internal.h"
+
+#define NVMF_TCP_MAX_ACCEPT_SOCK_ONE_TIME 16
+#define SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY 6
+
+const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp;
+
+/* spdk nvmf related structure */
+enum spdk_nvmf_tcp_req_state {
+
+ /* The request is not currently in use */
+ TCP_REQUEST_STATE_FREE = 0,
+
+ /* Initial state when request first received */
+ TCP_REQUEST_STATE_NEW,
+
+ /* The request is queued until a data buffer is available. */
+ TCP_REQUEST_STATE_NEED_BUFFER,
+
+ /* The request is currently transferring data from the host to the controller. */
+ TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
+
+ /* The request is waiting for the R2T send acknowledgement. */
+ TCP_REQUEST_STATE_AWAITING_R2T_ACK,
+
+ /* The request is ready to execute at the block device */
+ TCP_REQUEST_STATE_READY_TO_EXECUTE,
+
+ /* The request is currently executing at the block device */
+ TCP_REQUEST_STATE_EXECUTING,
+
+ /* The request finished executing at the block device */
+ TCP_REQUEST_STATE_EXECUTED,
+
+ /* The request is ready to send a completion */
+ TCP_REQUEST_STATE_READY_TO_COMPLETE,
+
+ /* The request is currently transferring final pdus from the controller to the host. */
+ TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
+
+ /* The request completed and can be marked free. */
+ TCP_REQUEST_STATE_COMPLETED,
+
+ /* Terminator */
+ TCP_REQUEST_NUM_STATES,
+};
+
+static const char *spdk_nvmf_tcp_term_req_fes_str[] = {
+ "Invalid PDU Header Field",
+ "PDU Sequence Error",
+	"Header Digest Error",
+ "Data Transfer Out of Range",
+ "R2T Limit Exceeded",
+ "Unsupported parameter",
+};
+
+#define OBJECT_NVMF_TCP_IO 0x80
+
+#define TRACE_GROUP_NVMF_TCP 0x5
+#define TRACE_TCP_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x0)
+#define TRACE_TCP_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x1)
+#define TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x2)
+#define TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x3)
+#define TRACE_TCP_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x4)
+#define TRACE_TCP_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x5)
+#define TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x6)
+#define TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x7)
+#define TRACE_TCP_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x8)
+#define TRACE_TCP_FLUSH_WRITEBUF_START SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0x9)
+#define TRACE_TCP_FLUSH_WRITEBUF_DONE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xA)
+#define TRACE_TCP_READ_FROM_SOCKET_DONE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xB)
+#define TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK SPDK_TPOINT_ID(TRACE_GROUP_NVMF_TCP, 0xC)
+
+SPDK_TRACE_REGISTER_FN(nvmf_tcp_trace, "nvmf_tcp", TRACE_GROUP_NVMF_TCP)
+{
+ spdk_trace_register_object(OBJECT_NVMF_TCP_IO, 'r');
+ spdk_trace_register_description("TCP_REQ_NEW",
+ TRACE_TCP_REQUEST_STATE_NEW,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 1, 1, "");
+ spdk_trace_register_description("TCP_REQ_NEED_BUFFER",
+ TRACE_TCP_REQUEST_STATE_NEED_BUFFER,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+ spdk_trace_register_description("TCP_REQ_TX_H_TO_C",
+ TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+ spdk_trace_register_description("TCP_REQ_RDY_TO_EXECUTE",
+ TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+ spdk_trace_register_description("TCP_REQ_EXECUTING",
+ TRACE_TCP_REQUEST_STATE_EXECUTING,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+ spdk_trace_register_description("TCP_REQ_EXECUTED",
+ TRACE_TCP_REQUEST_STATE_EXECUTED,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+ spdk_trace_register_description("TCP_REQ_RDY_TO_COMPLETE",
+ TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+ spdk_trace_register_description("TCP_REQ_TRANSFER_C2H",
+ TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+ spdk_trace_register_description("TCP_REQ_COMPLETED",
+ TRACE_TCP_REQUEST_STATE_COMPLETED,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+ spdk_trace_register_description("TCP_WRITE_START",
+ TRACE_TCP_FLUSH_WRITEBUF_START,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("TCP_WRITE_DONE",
+ TRACE_TCP_FLUSH_WRITEBUF_DONE,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("TCP_READ_DONE",
+ TRACE_TCP_READ_FROM_SOCKET_DONE,
+ OWNER_NONE, OBJECT_NONE, 0, 0, "");
+ spdk_trace_register_description("TCP_REQ_AWAIT_R2T_ACK",
+ TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK,
+ OWNER_NONE, OBJECT_NVMF_TCP_IO, 0, 1, "");
+}
+
+struct spdk_nvmf_tcp_req {
+ struct spdk_nvmf_request req;
+ struct spdk_nvme_cpl rsp;
+ struct spdk_nvme_cmd cmd;
+
+ /* A PDU that can be used for sending responses. This is
+ * not the incoming PDU! */
+ struct nvme_tcp_pdu *pdu;
+
+ /*
+ * The PDU for a request may be used multiple times in serial over
+ * the request's lifetime. For example, first to send an R2T, then
+ * to send a completion. To catch mistakes where the PDU is used
+ * twice at the same time, add a debug flag here for init/fini.
+ */
+ bool pdu_in_use;
+
+ /* In-capsule data buffer */
+ uint8_t *buf;
+
+ bool has_incapsule_data;
+
+ /* transfer_tag */
+ uint16_t ttag;
+
+ enum spdk_nvmf_tcp_req_state state;
+
+ /*
+ * h2c_offset is used when we receive the h2c_data PDU.
+ */
+ uint32_t h2c_offset;
+
+ STAILQ_ENTRY(spdk_nvmf_tcp_req) link;
+ TAILQ_ENTRY(spdk_nvmf_tcp_req) state_link;
+};
+
+struct spdk_nvmf_tcp_qpair {
+ struct spdk_nvmf_qpair qpair;
+ struct spdk_nvmf_tcp_poll_group *group;
+ struct spdk_nvmf_tcp_port *port;
+ struct spdk_sock *sock;
+
+ enum nvme_tcp_pdu_recv_state recv_state;
+ enum nvme_tcp_qpair_state state;
+
+ /* PDU being actively received */
+ struct nvme_tcp_pdu pdu_in_progress;
+ uint32_t recv_buf_size;
+
+ /* This is a spare PDU used for sending special management
+ * operations. Primarily, this is used for the initial
+ * connection response and c2h termination request. */
+ struct nvme_tcp_pdu mgmt_pdu;
+
+ TAILQ_HEAD(, nvme_tcp_pdu) send_queue;
+
+ /* Arrays of in-capsule buffers, requests, and pdus.
+ * Each array is 'resource_count' number of elements */
+ void *bufs;
+ struct spdk_nvmf_tcp_req *reqs;
+ struct nvme_tcp_pdu *pdus;
+ uint32_t resource_count;
+
+ /* Queues to track the requests in all states */
+ TAILQ_HEAD(, spdk_nvmf_tcp_req) state_queue[TCP_REQUEST_NUM_STATES];
+ /* Number of requests in each state */
+ uint32_t state_cntr[TCP_REQUEST_NUM_STATES];
+
+ uint8_t cpda;
+
+ bool host_hdgst_enable;
+ bool host_ddgst_enable;
+
+ /* IP address */
+ char initiator_addr[SPDK_NVMF_TRADDR_MAX_LEN];
+ char target_addr[SPDK_NVMF_TRADDR_MAX_LEN];
+
+ /* IP port */
+ uint16_t initiator_port;
+ uint16_t target_port;
+
+ /* Timer used to destroy qpair after detecting transport error issue if initiator does
+ * not close the connection.
+ */
+ struct spdk_poller *timeout_poller;
+
+ TAILQ_ENTRY(spdk_nvmf_tcp_qpair) link;
+};
+
+struct spdk_nvmf_tcp_poll_group {
+ struct spdk_nvmf_transport_poll_group group;
+ struct spdk_sock_group *sock_group;
+
+ TAILQ_HEAD(, spdk_nvmf_tcp_qpair) qpairs;
+ TAILQ_HEAD(, spdk_nvmf_tcp_qpair) await_req;
+};
+
+struct spdk_nvmf_tcp_port {
+ const struct spdk_nvme_transport_id *trid;
+ struct spdk_sock *listen_sock;
+ TAILQ_ENTRY(spdk_nvmf_tcp_port) link;
+};
+
+struct spdk_nvmf_tcp_transport {
+ struct spdk_nvmf_transport transport;
+
+ pthread_mutex_t lock;
+
+ TAILQ_HEAD(, spdk_nvmf_tcp_port) ports;
+};
+
+static bool nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
+ struct spdk_nvmf_tcp_req *tcp_req);
+
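+/* Move a request between the per-state queues and keep the per-state counters in sync. */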
+static void
+nvmf_tcp_req_set_state(struct spdk_nvmf_tcp_req *tcp_req,
+ enum spdk_nvmf_tcp_req_state state)
+{
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_tcp_qpair *tqpair;
+
+ qpair = tcp_req->req.qpair;
+ tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair);
+
+ TAILQ_REMOVE(&tqpair->state_queue[tcp_req->state], tcp_req, state_link);
+ assert(tqpair->state_cntr[tcp_req->state] > 0);
+ tqpair->state_cntr[tcp_req->state]--;
+
+ TAILQ_INSERT_TAIL(&tqpair->state_queue[state], tcp_req, state_link);
+ tqpair->state_cntr[state]++;
+
+ tcp_req->state = state;
+}
+
+static inline struct nvme_tcp_pdu *
+nvmf_tcp_req_pdu_init(struct spdk_nvmf_tcp_req *tcp_req)
+{
+ assert(tcp_req->pdu_in_use == false);
+ tcp_req->pdu_in_use = true;
+
+ memset(tcp_req->pdu, 0, sizeof(*tcp_req->pdu));
+ tcp_req->pdu->qpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair);
+
+ return tcp_req->pdu;
+}
+
+static inline void
+nvmf_tcp_req_pdu_fini(struct spdk_nvmf_tcp_req *tcp_req)
+{
+ tcp_req->pdu_in_use = false;
+}
+
+static struct spdk_nvmf_tcp_req *
+nvmf_tcp_req_get(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ struct spdk_nvmf_tcp_req *tcp_req;
+
+ tcp_req = TAILQ_FIRST(&tqpair->state_queue[TCP_REQUEST_STATE_FREE]);
+ if (!tcp_req) {
+ return NULL;
+ }
+
+ memset(&tcp_req->rsp, 0, sizeof(tcp_req->rsp));
+ tcp_req->h2c_offset = 0;
+ tcp_req->has_incapsule_data = false;
+ tcp_req->req.dif.dif_insert_or_strip = false;
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_NEW);
+ return tcp_req;
+}
+
+static void
+nvmf_tcp_request_free(struct spdk_nvmf_tcp_req *tcp_req)
+{
+ struct spdk_nvmf_tcp_transport *ttransport;
+
+ assert(tcp_req != NULL);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tcp_req=%p will be freed\n", tcp_req);
+ ttransport = SPDK_CONTAINEROF(tcp_req->req.qpair->transport,
+ struct spdk_nvmf_tcp_transport, transport);
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED);
+ nvmf_tcp_req_process(ttransport, tcp_req);
+}
+
+static int
+nvmf_tcp_req_free(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_tcp_req *tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req);
+
+ nvmf_tcp_request_free(tcp_req);
+
+ return 0;
+}
+
+static void
+nvmf_tcp_drain_state_queue(struct spdk_nvmf_tcp_qpair *tqpair,
+ enum spdk_nvmf_tcp_req_state state)
+{
+ struct spdk_nvmf_tcp_req *tcp_req, *req_tmp;
+
+ TAILQ_FOREACH_SAFE(tcp_req, &tqpair->state_queue[state], state_link, req_tmp) {
+ nvmf_tcp_request_free(tcp_req);
+ }
+}
+
+static void
+nvmf_tcp_cleanup_all_states(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ struct spdk_nvmf_tcp_req *tcp_req, *req_tmp;
+
+ assert(TAILQ_EMPTY(&tqpair->send_queue));
+
+ nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST);
+ nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_NEW);
+
+ /* Wipe the requests waiting for buffer from the global list */
+ TAILQ_FOREACH_SAFE(tcp_req, &tqpair->state_queue[TCP_REQUEST_STATE_NEED_BUFFER], state_link,
+ req_tmp) {
+ STAILQ_REMOVE(&tqpair->group->group.pending_buf_queue, &tcp_req->req,
+ spdk_nvmf_request, buf_link);
+ }
+
+ nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_NEED_BUFFER);
+ nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_EXECUTING);
+ nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER);
+ nvmf_tcp_drain_state_queue(tqpair, TCP_REQUEST_STATE_AWAITING_R2T_ACK);
+}
+
+static void
+nvmf_tcp_dump_qpair_req_contents(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ int i;
+ struct spdk_nvmf_tcp_req *tcp_req;
+
+ SPDK_ERRLOG("Dumping contents of queue pair (QID %d)\n", tqpair->qpair.qid);
+ for (i = 1; i < TCP_REQUEST_NUM_STATES; i++) {
+ SPDK_ERRLOG("\tNum of requests in state[%d] = %u\n", i, tqpair->state_cntr[i]);
+ TAILQ_FOREACH(tcp_req, &tqpair->state_queue[i], state_link) {
+ SPDK_ERRLOG("\t\tRequest Data From Pool: %d\n", tcp_req->req.data_from_pool);
+ SPDK_ERRLOG("\t\tRequest opcode: %d\n", tcp_req->req.cmd->nvmf_cmd.opcode);
+ }
+ }
+}
+
+static void
+nvmf_tcp_qpair_destroy(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ int err = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n");
+
+ err = spdk_sock_close(&tqpair->sock);
+ assert(err == 0);
+ nvmf_tcp_cleanup_all_states(tqpair);
+
+ if (tqpair->state_cntr[TCP_REQUEST_STATE_FREE] != tqpair->resource_count) {
+ SPDK_ERRLOG("tqpair(%p) free tcp request num is %u but should be %u\n", tqpair,
+ tqpair->state_cntr[TCP_REQUEST_STATE_FREE],
+ tqpair->resource_count);
+ err++;
+ }
+
+ if (err > 0) {
+ nvmf_tcp_dump_qpair_req_contents(tqpair);
+ }
+
+ spdk_dma_free(tqpair->pdus);
+ free(tqpair->reqs);
+ spdk_free(tqpair->bufs);
+ free(tqpair);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Leave\n");
+}
+
+static int
+nvmf_tcp_destroy(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_tcp_transport *ttransport;
+
+ assert(transport != NULL);
+ ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport);
+
+ pthread_mutex_destroy(&ttransport->lock);
+ free(ttransport);
+ return 0;
+}
+
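+/* Create the TCP transport, validating the socket priority, I/O unit size, and
+ * shared buffer count supplied in the transport options. */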
+static struct spdk_nvmf_transport *
+nvmf_tcp_create(struct spdk_nvmf_transport_opts *opts)
+{
+ struct spdk_nvmf_tcp_transport *ttransport;
+ uint32_t sge_count;
+ uint32_t min_shared_buffers;
+
+ ttransport = calloc(1, sizeof(*ttransport));
+ if (!ttransport) {
+ return NULL;
+ }
+
+ TAILQ_INIT(&ttransport->ports);
+
+ ttransport->transport.ops = &spdk_nvmf_transport_tcp;
+
+ SPDK_NOTICELOG("*** TCP Transport Init ***\n");
+
+ SPDK_INFOLOG(SPDK_LOG_NVMF_TCP, "*** TCP Transport Init ***\n"
+ " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n"
+ " max_io_qpairs_per_ctrlr=%d, io_unit_size=%d,\n"
+ " in_capsule_data_size=%d, max_aq_depth=%d\n"
+ " num_shared_buffers=%d, c2h_success=%d,\n"
+ " dif_insert_or_strip=%d, sock_priority=%d\n"
+ " abort_timeout_sec=%d\n",
+ opts->max_queue_depth,
+ opts->max_io_size,
+ opts->max_qpairs_per_ctrlr - 1,
+ opts->io_unit_size,
+ opts->in_capsule_data_size,
+ opts->max_aq_depth,
+ opts->num_shared_buffers,
+ opts->c2h_success,
+ opts->dif_insert_or_strip,
+ opts->sock_priority,
+ opts->abort_timeout_sec);
+
+ if (opts->sock_priority > SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY) {
+		SPDK_ERRLOG("Unsupported socket_priority=%d, the supported range is 0 to %d\n"
+			    "see 'man 7 socket' (SO_PRIORITY) for the valid priority range\n",
+ opts->sock_priority, SPDK_NVMF_TCP_DEFAULT_MAX_SOCK_PRIORITY);
+ free(ttransport);
+ return NULL;
+ }
+
+ /* I/O unit size cannot be larger than max I/O size */
+ if (opts->io_unit_size > opts->max_io_size) {
+ opts->io_unit_size = opts->max_io_size;
+ }
+
+ sge_count = opts->max_io_size / opts->io_unit_size;
+ if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) {
+ SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size);
+ free(ttransport);
+ return NULL;
+ }
+
+ min_shared_buffers = spdk_thread_get_count() * opts->buf_cache_size;
+ if (min_shared_buffers > opts->num_shared_buffers) {
+		SPDK_ERRLOG("There are not enough buffers to satisfy "
+			    "per-poll group caches for each thread. (%" PRIu32 ") "
+ "supplied. (%" PRIu32 ") required\n", opts->num_shared_buffers, min_shared_buffers);
+ SPDK_ERRLOG("Please specify a larger number of shared buffers\n");
+ nvmf_tcp_destroy(&ttransport->transport);
+ return NULL;
+ }
+
+ pthread_mutex_init(&ttransport->lock, NULL);
+
+ return &ttransport->transport;
+}
+
+static int
+nvmf_tcp_trsvcid_to_int(const char *trsvcid)
+{
+ unsigned long long ull;
+ char *end = NULL;
+
+ ull = strtoull(trsvcid, &end, 10);
+ if (end == NULL || end == trsvcid || *end != '\0') {
+ return -1;
+ }
+
+ /* Valid TCP/IP port numbers are in [0, 65535] */
+ if (ull > 65535) {
+ return -1;
+ }
+
+ return (int)ull;
+}
+
+/**
+ * Canonicalize a listen address trid.
+ */
+static int
+nvmf_tcp_canon_listen_trid(struct spdk_nvme_transport_id *canon_trid,
+ const struct spdk_nvme_transport_id *trid)
+{
+ int trsvcid_int;
+
+ trsvcid_int = nvmf_tcp_trsvcid_to_int(trid->trsvcid);
+ if (trsvcid_int < 0) {
+ return -EINVAL;
+ }
+
+ memset(canon_trid, 0, sizeof(*canon_trid));
+ spdk_nvme_trid_populate_transport(canon_trid, SPDK_NVME_TRANSPORT_TCP);
+ canon_trid->adrfam = trid->adrfam;
+ snprintf(canon_trid->traddr, sizeof(canon_trid->traddr), "%s", trid->traddr);
+ snprintf(canon_trid->trsvcid, sizeof(canon_trid->trsvcid), "%d", trsvcid_int);
+
+ return 0;
+}
+
+/**
+ * Find an existing listening port.
+ *
+ * Caller must hold ttransport->lock.
+ */
+static struct spdk_nvmf_tcp_port *
+nvmf_tcp_find_port(struct spdk_nvmf_tcp_transport *ttransport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvme_transport_id canon_trid;
+ struct spdk_nvmf_tcp_port *port;
+
+ if (nvmf_tcp_canon_listen_trid(&canon_trid, trid) != 0) {
+ return NULL;
+ }
+
+ TAILQ_FOREACH(port, &ttransport->ports, link) {
+ if (spdk_nvme_transport_id_compare(&canon_trid, port->trid) == 0) {
+ return port;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+nvmf_tcp_listen(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_tcp_transport *ttransport;
+ struct spdk_nvmf_tcp_port *port;
+ int trsvcid_int;
+ uint8_t adrfam;
+ struct spdk_sock_opts opts;
+
+ ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport);
+
+ trsvcid_int = nvmf_tcp_trsvcid_to_int(trid->trsvcid);
+ if (trsvcid_int < 0) {
+ SPDK_ERRLOG("Invalid trsvcid '%s'\n", trid->trsvcid);
+ return -EINVAL;
+ }
+
+ pthread_mutex_lock(&ttransport->lock);
+ port = calloc(1, sizeof(*port));
+ if (!port) {
+ SPDK_ERRLOG("Port allocation failed\n");
+ pthread_mutex_unlock(&ttransport->lock);
+ return -ENOMEM;
+ }
+
+ port->trid = trid;
+ opts.opts_size = sizeof(opts);
+ spdk_sock_get_default_opts(&opts);
+ opts.priority = transport->opts.sock_priority;
+ port->listen_sock = spdk_sock_listen_ext(trid->traddr, trsvcid_int,
+ NULL, &opts);
+ if (port->listen_sock == NULL) {
+ SPDK_ERRLOG("spdk_sock_listen(%s, %d) failed: %s (%d)\n",
+ trid->traddr, trsvcid_int,
+ spdk_strerror(errno), errno);
+ free(port);
+ pthread_mutex_unlock(&ttransport->lock);
+ return -errno;
+ }
+
+ if (spdk_sock_is_ipv4(port->listen_sock)) {
+ adrfam = SPDK_NVMF_ADRFAM_IPV4;
+ } else if (spdk_sock_is_ipv6(port->listen_sock)) {
+ adrfam = SPDK_NVMF_ADRFAM_IPV6;
+ } else {
+ SPDK_ERRLOG("Unhandled socket type\n");
+ adrfam = 0;
+ }
+
+ if (adrfam != trid->adrfam) {
+ SPDK_ERRLOG("Socket address family mismatch\n");
+ spdk_sock_close(&port->listen_sock);
+ free(port);
+ pthread_mutex_unlock(&ttransport->lock);
+ return -EINVAL;
+ }
+
+ SPDK_NOTICELOG("*** NVMe/TCP Target Listening on %s port %s ***\n",
+ trid->traddr, trid->trsvcid);
+
+ TAILQ_INSERT_TAIL(&ttransport->ports, port, link);
+ pthread_mutex_unlock(&ttransport->lock);
+ return 0;
+}
+
+static void
+nvmf_tcp_stop_listen(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_tcp_transport *ttransport;
+ struct spdk_nvmf_tcp_port *port;
+
+ ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Removing listen address %s port %s\n",
+ trid->traddr, trid->trsvcid);
+
+ pthread_mutex_lock(&ttransport->lock);
+ port = nvmf_tcp_find_port(ttransport, trid);
+ if (port) {
+ TAILQ_REMOVE(&ttransport->ports, port, link);
+ spdk_sock_close(&port->listen_sock);
+ free(port);
+ }
+
+ pthread_mutex_unlock(&ttransport->lock);
+}
+
+static void nvmf_tcp_qpair_set_recv_state(struct spdk_nvmf_tcp_qpair *tqpair,
+ enum nvme_tcp_pdu_recv_state state);
+
+static void
+nvmf_tcp_qpair_disconnect(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Disconnecting qpair %p\n", tqpair);
+
+ if (tqpair->state <= NVME_TCP_QPAIR_STATE_RUNNING) {
+ tqpair->state = NVME_TCP_QPAIR_STATE_EXITING;
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+ spdk_poller_unregister(&tqpair->timeout_poller);
+
+ /* This will end up calling nvmf_tcp_close_qpair */
+ spdk_nvmf_qpair_disconnect(&tqpair->qpair, NULL, NULL);
+ }
+}
+
+static void
+_pdu_write_done(void *_pdu, int err)
+{
+ struct nvme_tcp_pdu *pdu = _pdu;
+ struct spdk_nvmf_tcp_qpair *tqpair = pdu->qpair;
+
+ TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq);
+
+ if (err != 0) {
+ nvmf_tcp_qpair_disconnect(tqpair);
+ return;
+ }
+
+ assert(pdu->cb_fn != NULL);
+ pdu->cb_fn(pdu->cb_arg);
+}
+
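+/* Compute header and data digests if the host enabled them, queue the PDU on the
+ * send queue, and write it out. IC_RESP and C2H_TERM_REQ PDUs are written to the
+ * socket synchronously; all other PDUs use the asynchronous writev path. */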
+static void
+nvmf_tcp_qpair_write_pdu(struct spdk_nvmf_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu,
+ nvme_tcp_qpair_xfer_complete_cb cb_fn,
+ void *cb_arg)
+{
+ int hlen;
+ uint32_t crc32c;
+ uint32_t mapped_length = 0;
+ ssize_t rc;
+
+ assert(&tqpair->pdu_in_progress != pdu);
+
+ hlen = pdu->hdr.common.hlen;
+
+ /* Header Digest */
+ if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->host_hdgst_enable) {
+ crc32c = nvme_tcp_pdu_calc_header_digest(pdu);
+ MAKE_DIGEST_WORD((uint8_t *)pdu->hdr.raw + hlen, crc32c);
+ }
+
+ /* Data Digest */
+ if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && tqpair->host_ddgst_enable) {
+ crc32c = nvme_tcp_pdu_calc_data_digest(pdu);
+ MAKE_DIGEST_WORD(pdu->data_digest, crc32c);
+ }
+
+ pdu->cb_fn = cb_fn;
+ pdu->cb_arg = cb_arg;
+
+ pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, SPDK_COUNTOF(pdu->iov), pdu,
+ tqpair->host_hdgst_enable, tqpair->host_ddgst_enable,
+ &mapped_length);
+ pdu->sock_req.cb_fn = _pdu_write_done;
+ pdu->sock_req.cb_arg = pdu;
+ TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq);
+ if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP ||
+ pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ) {
+ rc = spdk_sock_writev(tqpair->sock, pdu->iov, pdu->sock_req.iovcnt);
+ if (rc == mapped_length) {
+ _pdu_write_done(pdu, 0);
+ } else {
+			SPDK_ERRLOG("Could not write IC_RESP or TERM_REQ PDU to the socket.\n");
+ _pdu_write_done(pdu, -1);
+ }
+ } else {
+ spdk_sock_writev_async(tqpair->sock, &pdu->sock_req);
+ }
+}
+
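+/* Allocate the per-qpair arrays of requests, PDUs, and in-capsule data buffers,
+ * sized by max_queue_depth, and place every request on the FREE state queue. */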
+static int
+nvmf_tcp_qpair_init_mem_resource(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ uint32_t i;
+ struct spdk_nvmf_transport_opts *opts;
+ uint32_t in_capsule_data_size;
+
+ opts = &tqpair->qpair.transport->opts;
+
+ in_capsule_data_size = opts->in_capsule_data_size;
+ if (opts->dif_insert_or_strip) {
+ in_capsule_data_size = SPDK_BDEV_BUF_SIZE_WITH_MD(in_capsule_data_size);
+ }
+
+ tqpair->resource_count = opts->max_queue_depth;
+
+ tqpair->mgmt_pdu.qpair = tqpair;
+
+ tqpair->reqs = calloc(tqpair->resource_count, sizeof(*tqpair->reqs));
+ if (!tqpair->reqs) {
+ SPDK_ERRLOG("Unable to allocate reqs on tqpair=%p\n", tqpair);
+ return -1;
+ }
+
+ if (in_capsule_data_size) {
+ tqpair->bufs = spdk_zmalloc(tqpair->resource_count * in_capsule_data_size, 0x1000,
+ NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (!tqpair->bufs) {
+ SPDK_ERRLOG("Unable to allocate bufs on tqpair=%p.\n", tqpair);
+ return -1;
+ }
+ }
+
+ tqpair->pdus = spdk_dma_malloc(tqpair->resource_count * sizeof(*tqpair->pdus), 0x1000, NULL);
+ if (!tqpair->pdus) {
+ SPDK_ERRLOG("Unable to allocate pdu pool on tqpair =%p.\n", tqpair);
+ return -1;
+ }
+
+ for (i = 0; i < tqpair->resource_count; i++) {
+ struct spdk_nvmf_tcp_req *tcp_req = &tqpair->reqs[i];
+
+ tcp_req->ttag = i + 1;
+ tcp_req->req.qpair = &tqpair->qpair;
+
+ tcp_req->pdu = &tqpair->pdus[i];
+ tcp_req->pdu->qpair = tqpair;
+
+ /* Set up memory to receive commands */
+ if (tqpair->bufs) {
+ tcp_req->buf = (void *)((uintptr_t)tqpair->bufs + (i * in_capsule_data_size));
+ }
+
+		/* Set the cmd and rsp */
+ tcp_req->req.rsp = (union nvmf_c2h_msg *)&tcp_req->rsp;
+ tcp_req->req.cmd = (union nvmf_h2c_msg *)&tcp_req->cmd;
+
+ /* Initialize request state to FREE */
+ tcp_req->state = TCP_REQUEST_STATE_FREE;
+ TAILQ_INSERT_TAIL(&tqpair->state_queue[tcp_req->state], tcp_req, state_link);
+ tqpair->state_cntr[TCP_REQUEST_STATE_FREE]++;
+ }
+
+ tqpair->recv_buf_size = (in_capsule_data_size + sizeof(struct spdk_nvme_tcp_cmd) + 2 *
+ SPDK_NVME_TCP_DIGEST_LEN) * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
+
+ return 0;
+}
+
+static int
+nvmf_tcp_qpair_init(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ int i;
+
+ tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "New TCP Connection: %p\n", qpair);
+
+ TAILQ_INIT(&tqpair->send_queue);
+
+ /* Initialise request state queues of the qpair */
+ for (i = TCP_REQUEST_STATE_FREE; i < TCP_REQUEST_NUM_STATES; i++) {
+ TAILQ_INIT(&tqpair->state_queue[i]);
+ }
+
+ tqpair->host_hdgst_enable = true;
+ tqpair->host_ddgst_enable = true;
+
+ return 0;
+}
+
+static int
+nvmf_tcp_qpair_sock_init(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ int rc;
+
+ /* set low water mark */
+ rc = spdk_sock_set_recvlowat(tqpair->sock, sizeof(struct spdk_nvme_tcp_common_pdu_hdr));
+ if (rc != 0) {
+ SPDK_ERRLOG("spdk_sock_set_recvlowat() failed\n");
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+nvmf_tcp_handle_connect(struct spdk_nvmf_transport *transport,
+ struct spdk_nvmf_tcp_port *port,
+ struct spdk_sock *sock)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "New connection accepted on %s port %s\n",
+ port->trid->traddr, port->trid->trsvcid);
+
+ tqpair = calloc(1, sizeof(struct spdk_nvmf_tcp_qpair));
+ if (tqpair == NULL) {
+ SPDK_ERRLOG("Could not allocate new connection.\n");
+ spdk_sock_close(&sock);
+ return;
+ }
+
+ tqpair->sock = sock;
+ tqpair->state_cntr[TCP_REQUEST_STATE_FREE] = 0;
+ tqpair->port = port;
+ tqpair->qpair.transport = transport;
+
+ rc = spdk_sock_getaddr(tqpair->sock, tqpair->target_addr,
+ sizeof(tqpair->target_addr), &tqpair->target_port,
+ tqpair->initiator_addr, sizeof(tqpair->initiator_addr),
+ &tqpair->initiator_port);
+ if (rc < 0) {
+		SPDK_ERRLOG("spdk_sock_getaddr() failed for tqpair=%p\n", tqpair);
+ nvmf_tcp_qpair_destroy(tqpair);
+ return;
+ }
+
+ spdk_nvmf_tgt_new_qpair(transport->tgt, &tqpair->qpair);
+}
+
+static uint32_t
+nvmf_tcp_port_accept(struct spdk_nvmf_transport *transport, struct spdk_nvmf_tcp_port *port)
+{
+ struct spdk_sock *sock;
+ uint32_t count = 0;
+ int i;
+
+ for (i = 0; i < NVMF_TCP_MAX_ACCEPT_SOCK_ONE_TIME; i++) {
+ sock = spdk_sock_accept(port->listen_sock);
+ if (sock == NULL) {
+ break;
+ }
+ count++;
+ nvmf_tcp_handle_connect(transport, port, sock);
+ }
+
+ return count;
+}
+
+static uint32_t
+nvmf_tcp_accept(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_tcp_transport *ttransport;
+ struct spdk_nvmf_tcp_port *port;
+ uint32_t count = 0;
+
+ ttransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_tcp_transport, transport);
+
+ TAILQ_FOREACH(port, &ttransport->ports, link) {
+ count += nvmf_tcp_port_accept(transport, port);
+ }
+
+ return count;
+}
+
+static void
+nvmf_tcp_discover(struct spdk_nvmf_transport *transport,
+ struct spdk_nvme_transport_id *trid,
+ struct spdk_nvmf_discovery_log_page_entry *entry)
+{
+ entry->trtype = SPDK_NVMF_TRTYPE_TCP;
+ entry->adrfam = trid->adrfam;
+ entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_REQUIRED;
+
+ spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' ');
+ spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' ');
+
+ entry->tsas.tcp.sectype = SPDK_NVME_TCP_SECURITY_NONE;
+}
+
+static struct spdk_nvmf_transport_poll_group *
+nvmf_tcp_poll_group_create(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_tcp_poll_group *tgroup;
+
+ tgroup = calloc(1, sizeof(*tgroup));
+ if (!tgroup) {
+ return NULL;
+ }
+
+ tgroup->sock_group = spdk_sock_group_create(&tgroup->group);
+ if (!tgroup->sock_group) {
+ goto cleanup;
+ }
+
+ TAILQ_INIT(&tgroup->qpairs);
+ TAILQ_INIT(&tgroup->await_req);
+
+ return &tgroup->group;
+
+cleanup:
+ free(tgroup);
+ return NULL;
+}
+
+static struct spdk_nvmf_transport_poll_group *
+nvmf_tcp_get_optimal_poll_group(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ struct spdk_sock_group *group = NULL;
+ int rc;
+
+ tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair);
+ rc = spdk_sock_get_optimal_sock_group(tqpair->sock, &group);
+ if (!rc && group != NULL) {
+ return spdk_sock_group_get_ctx(group);
+ }
+
+ return NULL;
+}
+
+static void
+nvmf_tcp_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
+{
+ struct spdk_nvmf_tcp_poll_group *tgroup;
+
+ tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group);
+ spdk_sock_group_close(&tgroup->sock_group);
+
+ free(tgroup);
+}
+
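+/* Transition the PDU receive state, moving the qpair between the poll group's
+ * main qpairs list and its await_req list as it enters or leaves AWAIT_REQ. */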
+static void
+nvmf_tcp_qpair_set_recv_state(struct spdk_nvmf_tcp_qpair *tqpair,
+ enum nvme_tcp_pdu_recv_state state)
+{
+ if (tqpair->recv_state == state) {
+		SPDK_ERRLOG("The recv state of tqpair=%p is already the requested state (%d)\n",
+ tqpair, state);
+ return;
+ }
+
+ if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_REQ) {
+ /* When leaving the await req state, move the qpair to the main list */
+ TAILQ_REMOVE(&tqpair->group->await_req, tqpair, link);
+ TAILQ_INSERT_TAIL(&tqpair->group->qpairs, tqpair, link);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair(%p) recv state=%d\n", tqpair, state);
+ tqpair->recv_state = state;
+
+ switch (state) {
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
+ break;
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_REQ:
+ TAILQ_REMOVE(&tqpair->group->qpairs, tqpair, link);
+ TAILQ_INSERT_TAIL(&tqpair->group->await_req, tqpair, link);
+ break;
+ case NVME_TCP_PDU_RECV_STATE_ERROR:
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
+ memset(&tqpair->pdu_in_progress, 0, sizeof(tqpair->pdu_in_progress));
+ break;
+ default:
+ SPDK_ERRLOG("The state(%d) is invalid\n", state);
+ abort();
+ break;
+ }
+}
+
+static int
+nvmf_tcp_qpair_handle_timeout(void *ctx)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair = ctx;
+
+ assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_ERROR);
+
+ SPDK_ERRLOG("No pdu coming for tqpair=%p within %d seconds\n", tqpair,
+ SPDK_NVME_TCP_QPAIR_EXIT_TIMEOUT);
+
+ nvmf_tcp_qpair_disconnect(tqpair);
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+nvmf_tcp_send_c2h_term_req_complete(void *cb_arg)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair = (struct spdk_nvmf_tcp_qpair *)cb_arg;
+
+ if (!tqpair->timeout_poller) {
+ tqpair->timeout_poller = SPDK_POLLER_REGISTER(nvmf_tcp_qpair_handle_timeout, tqpair,
+ SPDK_NVME_TCP_QPAIR_EXIT_TIMEOUT * 1000000);
+ }
+}
+
+static void
+nvmf_tcp_send_c2h_term_req(struct spdk_nvmf_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu,
+ enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset)
+{
+ struct nvme_tcp_pdu *rsp_pdu;
+ struct spdk_nvme_tcp_term_req_hdr *c2h_term_req;
+ uint32_t c2h_term_req_hdr_len = sizeof(*c2h_term_req);
+ uint32_t copy_len;
+
+ rsp_pdu = &tqpair->mgmt_pdu;
+
+ c2h_term_req = &rsp_pdu->hdr.term_req;
+ c2h_term_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ;
+ c2h_term_req->common.hlen = c2h_term_req_hdr_len;
+
+ if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) ||
+ (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) {
+ DSET32(&c2h_term_req->fei, error_offset);
+ }
+
+ copy_len = spdk_min(pdu->hdr.common.hlen, SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
+
+ /* Copy the error info into the buffer */
+ memcpy((uint8_t *)rsp_pdu->hdr.raw + c2h_term_req_hdr_len, pdu->hdr.raw, copy_len);
+ nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + c2h_term_req_hdr_len, copy_len);
+
+	/* The PDU length covers the term req header plus the copied header of the offending PDU */
+ c2h_term_req->common.plen = c2h_term_req->common.hlen + copy_len;
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+ nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_send_c2h_term_req_complete, tqpair);
+}
+
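+/* A capsule command header arrived: take a free request from the qpair. If none is
+ * free but responses are still in flight, wait for one to be returned; otherwise the
+ * host exceeded the queue depth and the qpair is disconnected. */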
+static void
+nvmf_tcp_capsule_cmd_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport,
+ struct spdk_nvmf_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvmf_tcp_req *tcp_req;
+
+ assert(pdu->psh_valid_bytes == pdu->psh_len);
+ assert(pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD);
+
+ tcp_req = nvmf_tcp_req_get(tqpair);
+ if (!tcp_req) {
+		/* Return for now; the allocation is retried once an in-flight response frees a request */
+ if (tqpair->state_cntr[TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST] > 0) {
+ return;
+ }
+
+ /* The host sent more commands than the maximum queue depth. */
+ SPDK_ERRLOG("Cannot allocate tcp_req on tqpair=%p\n", tqpair);
+ nvmf_tcp_qpair_disconnect(tqpair);
+ return;
+ }
+
+ pdu->req = tcp_req;
+ assert(tcp_req->state == TCP_REQUEST_STATE_NEW);
+ nvmf_tcp_req_process(ttransport, tcp_req);
+}
+
+static void
+nvmf_tcp_capsule_cmd_payload_handle(struct spdk_nvmf_tcp_transport *ttransport,
+ struct spdk_nvmf_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvmf_tcp_req *tcp_req;
+ struct spdk_nvme_tcp_cmd *capsule_cmd;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ capsule_cmd = &pdu->hdr.capsule_cmd;
+ tcp_req = pdu->req;
+ assert(tcp_req != NULL);
+ if (capsule_cmd->common.pdo > SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET) {
+		SPDK_ERRLOG("Expected capsule_cmd pdu offset <= %d, got %u\n",
+ SPDK_NVME_TCP_PDU_PDO_MAX_OFFSET, capsule_cmd->common.pdo);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdo);
+ goto err;
+ }
+
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE);
+ nvmf_tcp_req_process(ttransport, tcp_req);
+
+ return;
+err:
+ nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
+}
+
+static int
+nvmf_tcp_find_req_in_state(struct spdk_nvmf_tcp_qpair *tqpair,
+ enum spdk_nvmf_tcp_req_state state,
+ uint16_t cid, uint16_t tag,
+ struct spdk_nvmf_tcp_req **req)
+{
+ struct spdk_nvmf_tcp_req *tcp_req = NULL;
+
+ TAILQ_FOREACH(tcp_req, &tqpair->state_queue[state], state_link) {
+ if (tcp_req->req.cmd->nvme_cmd.cid != cid) {
+ continue;
+ }
+
+ if (tcp_req->ttag == tag) {
+ *req = tcp_req;
+ return 0;
+ }
+
+ *req = NULL;
+ return -1;
+ }
+
+ /* Didn't find it, but not an error */
+ *req = NULL;
+ return 0;
+}
+
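+/* Locate the request matching the H2C_DATA header's cccid/ttag and validate the
+ * data offset and length before receiving the payload into the request's iovs. */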
+static void
+nvmf_tcp_h2c_data_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport,
+ struct spdk_nvmf_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvmf_tcp_req *tcp_req;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes = 0;
+ struct spdk_nvme_tcp_h2c_data_hdr *h2c_data;
+ int rc;
+
+ h2c_data = &pdu->hdr.h2c_data;
+
+	SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair=%p, h2c_data info: datao=%u, datal=%u, cccid=%u, ttag=%u\n",
+ tqpair, h2c_data->datao, h2c_data->datal, h2c_data->cccid, h2c_data->ttag);
+
+ rc = nvmf_tcp_find_req_in_state(tqpair, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER,
+ h2c_data->cccid, h2c_data->ttag, &tcp_req);
+ if (rc == 0 && tcp_req == NULL) {
+ rc = nvmf_tcp_find_req_in_state(tqpair, TCP_REQUEST_STATE_AWAITING_R2T_ACK, h2c_data->cccid,
+ h2c_data->ttag, &tcp_req);
+ }
+
+ if (!tcp_req) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tcp_req is not found for tqpair=%p\n", tqpair);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER;
+ if (rc == 0) {
+ error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, cccid);
+ } else {
+ error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, ttag);
+ }
+ goto err;
+ }
+
+ if (tcp_req->h2c_offset != h2c_data->datao) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP,
+ "tcp_req(%p), tqpair=%p, expected data offset %u, but data offset is %u\n",
+ tcp_req, tqpair, tcp_req->h2c_offset, h2c_data->datao);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
+ goto err;
+ }
+
+ if ((h2c_data->datao + h2c_data->datal) > tcp_req->req.length) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP,
+			      "tcp_req(%p), tqpair=%p, (datao=%u + datal=%u) exceeds requested length=%u\n",
+ tcp_req, tqpair, h2c_data->datao, h2c_data->datal, tcp_req->req.length);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
+ goto err;
+ }
+
+ pdu->req = tcp_req;
+
+ if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) {
+ pdu->dif_ctx = &tcp_req->req.dif.dif_ctx;
+ }
+
+ nvme_tcp_pdu_set_data_buf(pdu, tcp_req->req.iov, tcp_req->req.iovcnt,
+ h2c_data->datao, h2c_data->datal);
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
+ return;
+
+err:
+ nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
+}
+
+static void
+nvmf_tcp_pdu_cmd_complete(void *cb_arg)
+{
+ struct spdk_nvmf_tcp_req *tcp_req = cb_arg;
+ nvmf_tcp_request_free(tcp_req);
+}
+
+static void
+nvmf_tcp_send_capsule_resp_pdu(struct spdk_nvmf_tcp_req *tcp_req,
+ struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ struct nvme_tcp_pdu *rsp_pdu;
+ struct spdk_nvme_tcp_rsp *capsule_resp;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter, tqpair=%p\n", tqpair);
+
+ rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req);
+ assert(rsp_pdu != NULL);
+
+ capsule_resp = &rsp_pdu->hdr.capsule_resp;
+ capsule_resp->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP;
+ capsule_resp->common.plen = capsule_resp->common.hlen = sizeof(*capsule_resp);
+ capsule_resp->rccqe = tcp_req->req.rsp->nvme_cpl;
+ if (tqpair->host_hdgst_enable) {
+ capsule_resp->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF;
+ capsule_resp->common.plen += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_pdu_cmd_complete, tcp_req);
+}
+
+static void
+nvmf_tcp_pdu_c2h_data_complete(void *cb_arg)
+{
+ struct spdk_nvmf_tcp_req *tcp_req = cb_arg;
+ struct spdk_nvmf_tcp_qpair *tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair,
+ struct spdk_nvmf_tcp_qpair, qpair);
+
+ assert(tqpair != NULL);
+ if (tqpair->qpair.transport->opts.c2h_success) {
+ nvmf_tcp_request_free(tcp_req);
+ } else {
+ nvmf_tcp_req_pdu_fini(tcp_req);
+ nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair);
+ }
+}
+
+static void
+nvmf_tcp_r2t_complete(void *cb_arg)
+{
+ struct spdk_nvmf_tcp_req *tcp_req = cb_arg;
+ struct spdk_nvmf_tcp_transport *ttransport;
+
+ nvmf_tcp_req_pdu_fini(tcp_req);
+
+ ttransport = SPDK_CONTAINEROF(tcp_req->req.qpair->transport,
+ struct spdk_nvmf_tcp_transport, transport);
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER);
+
+ if (tcp_req->h2c_offset == tcp_req->req.length) {
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE);
+ nvmf_tcp_req_process(ttransport, tcp_req);
+ }
+}
+
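+/* Build an R2T PDU asking the host to send the request's data starting at the
+ * current h2c offset, and hold the request in AWAITING_R2T_ACK until the R2T
+ * send completes. */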
+static void
+nvmf_tcp_send_r2t_pdu(struct spdk_nvmf_tcp_qpair *tqpair,
+ struct spdk_nvmf_tcp_req *tcp_req)
+{
+ struct nvme_tcp_pdu *rsp_pdu;
+ struct spdk_nvme_tcp_r2t_hdr *r2t;
+
+ rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req);
+ assert(rsp_pdu != NULL);
+
+ r2t = &rsp_pdu->hdr.r2t;
+ r2t->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_R2T;
+ r2t->common.plen = r2t->common.hlen = sizeof(*r2t);
+
+ if (tqpair->host_hdgst_enable) {
+ r2t->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF;
+ r2t->common.plen += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ r2t->cccid = tcp_req->req.cmd->nvme_cmd.cid;
+ r2t->ttag = tcp_req->ttag;
+ r2t->r2to = tcp_req->h2c_offset;
+ r2t->r2tl = tcp_req->req.length;
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_AWAITING_R2T_ACK);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP,
+ "tcp_req(%p) on tqpair(%p), r2t_info: cccid=%u, ttag=%u, r2to=%u, r2tl=%u\n",
+ tcp_req, tqpair, r2t->cccid, r2t->ttag, r2t->r2to, r2t->r2tl);
+ nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_r2t_complete, tcp_req);
+}
+
+static void
+nvmf_tcp_h2c_data_payload_handle(struct spdk_nvmf_tcp_transport *ttransport,
+ struct spdk_nvmf_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvmf_tcp_req *tcp_req;
+
+ tcp_req = pdu->req;
+ assert(tcp_req != NULL);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n");
+
+ tcp_req->h2c_offset += pdu->data_len;
+
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+
+ /* Wait for all of the data to arrive AND for the initial R2T PDU send to be
+ * acknowledged before moving on. */
+ if (tcp_req->h2c_offset == tcp_req->req.length &&
+ tcp_req->state == TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER) {
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE);
+ nvmf_tcp_req_process(ttransport, tcp_req);
+ }
+}
+
+static void
+nvmf_tcp_h2c_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *h2c_term_req)
+{
+ SPDK_ERRLOG("Error info of pdu(%p): %s\n", h2c_term_req,
+ spdk_nvmf_tcp_term_req_fes_str[h2c_term_req->fes]);
+ if ((h2c_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) ||
+ (h2c_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "The offset from the start of the PDU header is %u\n",
+ DGET32(h2c_term_req->fei));
+ }
+}
+
+static void
+nvmf_tcp_h2c_term_req_hdr_handle(struct spdk_nvmf_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvme_tcp_term_req_hdr *h2c_term_req = &pdu->hdr.term_req;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+
+ if (h2c_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) {
+		SPDK_ERRLOG("Fatal Error Status (FES) is unknown for h2c_term_req pdu=%p\n", pdu);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes);
+ goto end;
+ }
+
+ /* set the data buffer */
+ nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + h2c_term_req->common.hlen,
+ h2c_term_req->common.plen - h2c_term_req->common.hlen);
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
+ return;
+end:
+ nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
+}
+
+static void
+nvmf_tcp_h2c_term_req_payload_handle(struct spdk_nvmf_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvme_tcp_term_req_hdr *h2c_term_req = &pdu->hdr.term_req;
+
+ nvmf_tcp_h2c_term_req_dump(h2c_term_req);
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+}
+
+static void
+nvmf_tcp_pdu_payload_handle(struct spdk_nvmf_tcp_qpair *tqpair,
+ struct spdk_nvmf_tcp_transport *ttransport)
+{
+ int rc = 0;
+ struct nvme_tcp_pdu *pdu;
+ uint32_t crc32c, error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
+ pdu = &tqpair->pdu_in_progress;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n");
+	/* check the data digest if needed */
+ if (pdu->ddgst_enable) {
+ crc32c = nvme_tcp_pdu_calc_data_digest(pdu);
+ rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c);
+ if (rc == 0) {
+ SPDK_ERRLOG("Data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR;
+ nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
+ return;
+
+ }
+ }
+
+ switch (pdu->hdr.common.pdu_type) {
+ case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD:
+ nvmf_tcp_capsule_cmd_payload_handle(ttransport, tqpair, pdu);
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA:
+ nvmf_tcp_h2c_data_payload_handle(ttransport, tqpair, pdu);
+ break;
+
+ case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
+ nvmf_tcp_h2c_term_req_payload_handle(tqpair, pdu);
+ break;
+
+ default:
+		/* This point should be unreachable */
+		SPDK_ERRLOG("The code should not reach here\n");
+ break;
+ }
+}
+
+static void
+nvmf_tcp_send_icresp_complete(void *cb_arg)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair = cb_arg;
+
+ tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING;
+}
+
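+/* Handle the ICReq PDU: negotiate header/data digests and CPDA, resize the socket
+ * receive buffer accordingly, and send an ICResp; the qpair moves to RUNNING once
+ * the ICResp write completes. */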
+static void
+nvmf_tcp_icreq_handle(struct spdk_nvmf_tcp_transport *ttransport,
+ struct spdk_nvmf_tcp_qpair *tqpair,
+ struct nvme_tcp_pdu *pdu)
+{
+ struct spdk_nvme_tcp_ic_req *ic_req = &pdu->hdr.ic_req;
+ struct nvme_tcp_pdu *rsp_pdu;
+ struct spdk_nvme_tcp_ic_resp *ic_resp;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ /* Only PFV 0 is defined currently */
+ if (ic_req->pfv != 0) {
+ SPDK_ERRLOG("Expected ICReq PFV %u, got %u\n", 0u, ic_req->pfv);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_ic_req, pfv);
+ goto end;
+ }
+
+ /* MAXR2T is 0's based */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "maxr2t =%u\n", (ic_req->maxr2t + 1u));
+
+ tqpair->host_hdgst_enable = ic_req->dgst.bits.hdgst_enable ? true : false;
+ if (!tqpair->host_hdgst_enable) {
+ tqpair->recv_buf_size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
+ }
+
+ tqpair->host_ddgst_enable = ic_req->dgst.bits.ddgst_enable ? true : false;
+ if (!tqpair->host_ddgst_enable) {
+ tqpair->recv_buf_size -= SPDK_NVME_TCP_DIGEST_LEN * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR;
+ }
+
+ /* Now that we know whether digests are enabled, properly size the receive buffer */
+ if (spdk_sock_set_recvbuf(tqpair->sock, tqpair->recv_buf_size) < 0) {
+ SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n",
+ tqpair,
+ tqpair->recv_buf_size);
+ /* Not fatal. */
+ }
+
+ tqpair->cpda = spdk_min(ic_req->hpda, SPDK_NVME_TCP_CPDA_MAX);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "cpda of tqpair=(%p) is : %u\n", tqpair, tqpair->cpda);
+
+ rsp_pdu = &tqpair->mgmt_pdu;
+
+ ic_resp = &rsp_pdu->hdr.ic_resp;
+ ic_resp->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_RESP;
+ ic_resp->common.hlen = ic_resp->common.plen = sizeof(*ic_resp);
+ ic_resp->pfv = 0;
+ ic_resp->cpda = tqpair->cpda;
+ ic_resp->maxh2cdata = ttransport->transport.opts.max_io_size;
+ ic_resp->dgst.bits.hdgst_enable = tqpair->host_hdgst_enable ? 1 : 0;
+ ic_resp->dgst.bits.ddgst_enable = tqpair->host_ddgst_enable ? 1 : 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "host_hdgst_enable: %u\n", tqpair->host_hdgst_enable);
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable);
+
+ tqpair->state = NVME_TCP_QPAIR_STATE_INITIALIZING;
+ nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_send_icresp_complete, tqpair);
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+ return;
+end:
+ nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
+}
+
+static void
+nvmf_tcp_pdu_psh_handle(struct spdk_nvmf_tcp_qpair *tqpair,
+ struct spdk_nvmf_tcp_transport *ttransport)
+{
+ struct nvme_tcp_pdu *pdu;
+ int rc;
+ uint32_t crc32c, error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+
+ assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH);
+ pdu = &tqpair->pdu_in_progress;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "pdu type of tqpair(%p) is %d\n", tqpair,
+ pdu->hdr.common.pdu_type);
+ /* check header digest if needed */
+ if (pdu->has_hdgst) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Compare the header of pdu=%p on tqpair=%p\n", pdu, tqpair);
+ crc32c = nvme_tcp_pdu_calc_header_digest(pdu);
+ rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c);
+ if (rc == 0) {
+ SPDK_ERRLOG("Header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR;
+ nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
+ return;
+
+ }
+ }
+
+ switch (pdu->hdr.common.pdu_type) {
+ case SPDK_NVME_TCP_PDU_TYPE_IC_REQ:
+ nvmf_tcp_icreq_handle(ttransport, tqpair, pdu);
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD:
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_REQ);
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA:
+ nvmf_tcp_h2c_data_hdr_handle(ttransport, tqpair, pdu);
+ break;
+
+ case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
+ nvmf_tcp_h2c_term_req_hdr_handle(tqpair, pdu);
+ break;
+
+ default:
+ SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->pdu_in_progress.hdr.common.pdu_type);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = 1;
+ nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
+ break;
+ }
+}
+
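+/* Validate the common PDU header (type, hlen, pdo, plen) before reading the
+ * PDU-specific header; send a C2H termination request on any violation. */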
+static void
+nvmf_tcp_pdu_ch_handle(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ struct nvme_tcp_pdu *pdu;
+ uint32_t error_offset = 0;
+ enum spdk_nvme_tcp_term_req_fes fes;
+ uint8_t expected_hlen, pdo;
+ bool plen_error = false, pdo_error = false;
+
+ assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
+ pdu = &tqpair->pdu_in_progress;
+
+ if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_REQ) {
+ if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) {
+ SPDK_ERRLOG("Already received ICreq PDU, and reject this pdu=%p\n", pdu);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR;
+ goto err;
+ }
+ expected_hlen = sizeof(struct spdk_nvme_tcp_ic_req);
+ if (pdu->hdr.common.plen != expected_hlen) {
+ plen_error = true;
+ }
+ } else {
+ if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) {
+			SPDK_ERRLOG("The TCP/IP connection is not negotiated\n");
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR;
+ goto err;
+ }
+
+ switch (pdu->hdr.common.pdu_type) {
+ case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD:
+ expected_hlen = sizeof(struct spdk_nvme_tcp_cmd);
+ pdo = pdu->hdr.common.pdo;
+ if ((tqpair->cpda != 0) && (pdo != ((tqpair->cpda + 1) << 2))) {
+ pdo_error = true;
+ break;
+ }
+
+ if (pdu->hdr.common.plen < expected_hlen) {
+ plen_error = true;
+ }
+ break;
+ case SPDK_NVME_TCP_PDU_TYPE_H2C_DATA:
+ expected_hlen = sizeof(struct spdk_nvme_tcp_h2c_data_hdr);
+ pdo = pdu->hdr.common.pdo;
+ if ((tqpair->cpda != 0) && (pdo != ((tqpair->cpda + 1) << 2))) {
+ pdo_error = true;
+ break;
+ }
+ if (pdu->hdr.common.plen < expected_hlen) {
+ plen_error = true;
+ }
+ break;
+
+ case SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
+ expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr);
+ if ((pdu->hdr.common.plen <= expected_hlen) ||
+ (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) {
+ plen_error = true;
+ }
+ break;
+
+ default:
+ SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", pdu->hdr.common.pdu_type);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type);
+ goto err;
+ }
+ }
+
+ if (pdu->hdr.common.hlen != expected_hlen) {
+		SPDK_ERRLOG("PDU type=0x%02x, Expected header length %u, got %u on tqpair=%p\n",
+ pdu->hdr.common.pdu_type,
+ expected_hlen, pdu->hdr.common.hlen, tqpair);
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen);
+ goto err;
+ } else if (pdo_error) {
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdo);
+ } else if (plen_error) {
+ fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
+ error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen);
+ goto err;
+ } else {
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH);
+ nvme_tcp_pdu_calc_psh_len(&tqpair->pdu_in_progress, tqpair->host_hdgst_enable);
+ return;
+ }
+err:
+ nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
+}
+
+static int
+nvmf_tcp_pdu_payload_insert_dif(struct nvme_tcp_pdu *pdu, uint32_t read_offset,
+ int read_len)
+{
+ int rc;
+
+ rc = spdk_dif_generate_stream(pdu->data_iov, pdu->data_iovcnt,
+ read_offset, read_len, pdu->dif_ctx);
+ if (rc != 0) {
+ SPDK_ERRLOG("DIF generate failed\n");
+ }
+
+ return rc;
+}
+
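+/* Receive state machine for the qpair's socket: read the common header, the
+ * PDU-specific header, and then the payload, looping while the state keeps
+ * changing so several transitions can be handled back-to-back. */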
+static int
+nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
+{
+ int rc = 0;
+ struct nvme_tcp_pdu *pdu;
+ enum nvme_tcp_pdu_recv_state prev_state;
+ uint32_t data_len;
+ struct spdk_nvmf_tcp_transport *ttransport = SPDK_CONTAINEROF(tqpair->qpair.transport,
+ struct spdk_nvmf_tcp_transport, transport);
+
+ /* The loop here is to allow for several back-to-back state changes. */
+ do {
+ prev_state = tqpair->recv_state;
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "tqpair(%p) recv pdu entering state %d\n", tqpair, prev_state);
+
+ pdu = &tqpair->pdu_in_progress;
+ switch (tqpair->recv_state) {
+ /* Wait for the common header */
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY:
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH:
+ if (spdk_unlikely(tqpair->state == NVME_TCP_QPAIR_STATE_INITIALIZING)) {
+ return rc;
+ }
+
+ rc = nvme_tcp_read_data(tqpair->sock,
+ sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes,
+ (void *)&pdu->hdr.common + pdu->ch_valid_bytes);
+ if (rc < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "will disconnect tqpair=%p\n", tqpair);
+ return NVME_TCP_PDU_FATAL;
+ } else if (rc > 0) {
+ pdu->ch_valid_bytes += rc;
+ spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, 0, rc, 0, 0);
+ if (spdk_likely(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY)) {
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH);
+ }
+ }
+
+ if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) {
+ return NVME_TCP_PDU_IN_PROGRESS;
+ }
+
+ /* The command header of this PDU has now been read from the socket. */
+ nvmf_tcp_pdu_ch_handle(tqpair);
+ break;
+ /* Wait for the pdu specific header */
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
+ rc = nvme_tcp_read_data(tqpair->sock,
+ pdu->psh_len - pdu->psh_valid_bytes,
+ (void *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
+ if (rc < 0) {
+ return NVME_TCP_PDU_FATAL;
+ } else if (rc > 0) {
+ spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE,
+ 0, rc, 0, 0);
+ pdu->psh_valid_bytes += rc;
+ }
+
+ if (pdu->psh_valid_bytes < pdu->psh_len) {
+ return NVME_TCP_PDU_IN_PROGRESS;
+ }
+
+			/* All headers (ch, psh, header digest) of this PDU have now been read from the socket. */
+ nvmf_tcp_pdu_psh_handle(tqpair, ttransport);
+ break;
+ /* Wait for the req slot */
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_REQ:
+ nvmf_tcp_capsule_cmd_hdr_handle(ttransport, tqpair, pdu);
+ break;
+ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
+			/* There is nothing to read if the PDU carries no payload */
+ if (!pdu->data_len) {
+ return NVME_TCP_PDU_IN_PROGRESS;
+ }
+
+ data_len = pdu->data_len;
+ /* data digest */
+ if (spdk_unlikely((pdu->hdr.common.pdu_type != SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ) &&
+ tqpair->host_ddgst_enable)) {
+ data_len += SPDK_NVME_TCP_DIGEST_LEN;
+ pdu->ddgst_enable = true;
+ }
+
+ rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);
+ if (rc < 0) {
+ return NVME_TCP_PDU_FATAL;
+ }
+ pdu->readv_offset += rc;
+
+ if (spdk_unlikely(pdu->dif_ctx != NULL)) {
+ rc = nvmf_tcp_pdu_payload_insert_dif(pdu, pdu->readv_offset - rc, rc);
+ if (rc != 0) {
+ return NVME_TCP_PDU_FATAL;
+ }
+ }
+
+ if (pdu->readv_offset < data_len) {
+ return NVME_TCP_PDU_IN_PROGRESS;
+ }
+
+ /* All of this PDU has now been read from the socket. */
+ nvmf_tcp_pdu_payload_handle(tqpair, ttransport);
+ break;
+ case NVME_TCP_PDU_RECV_STATE_ERROR:
+ if (!spdk_sock_is_connected(tqpair->sock)) {
+ return NVME_TCP_PDU_FATAL;
+ }
+ break;
+ default:
+ assert(0);
+			SPDK_ERRLOG("The code should not reach here\n");
+ break;
+ }
+ } while (tqpair->recv_state != prev_state);
+
+ return rc;
+}
+
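+/* Parse the command's SGL descriptor: either reserve buffers from the transport's
+ * shared pool for a transport data block SGL, or point the request at its
+ * in-capsule data buffer for an offset SGL. */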
+static int
+nvmf_tcp_req_parse_sgl(struct spdk_nvmf_tcp_req *tcp_req,
+ struct spdk_nvmf_transport *transport,
+ struct spdk_nvmf_transport_poll_group *group)
+{
+ struct spdk_nvmf_request *req = &tcp_req->req;
+ struct spdk_nvme_cmd *cmd;
+ struct spdk_nvme_cpl *rsp;
+ struct spdk_nvme_sgl_descriptor *sgl;
+ uint32_t length;
+
+ cmd = &req->cmd->nvme_cmd;
+ rsp = &req->rsp->nvme_cpl;
+ sgl = &cmd->dptr.sgl1;
+
+ length = sgl->unkeyed.length;
+
+ if (sgl->generic.type == SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK &&
+ sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_TRANSPORT) {
+ if (length > transport->opts.max_io_size) {
+ SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
+ length, transport->opts.max_io_size);
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return -1;
+ }
+
+ /* fill request length and populate iovs */
+ req->length = length;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Data requested length= 0x%x\n", length);
+
+ if (spdk_unlikely(req->dif.dif_insert_or_strip)) {
+ req->dif.orig_length = length;
+ length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx);
+ req->dif.elba_length = length;
+ }
+
+ if (spdk_nvmf_request_get_buffers(req, group, transport, length)) {
+ /* No available buffers. Queue this request up. */
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "No available large data buffers. Queueing request %p\n",
+ tcp_req);
+ return 0;
+ }
+
+ /* backward compatible */
+ req->data = req->iov[0].iov_base;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Request %p took %d buffer/s from central pool, and data=%p\n",
+ tcp_req, req->iovcnt, req->data);
+
+ return 0;
+ } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
+ sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
+ uint64_t offset = sgl->address;
+ uint32_t max_len = transport->opts.in_capsule_data_size;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
+ offset, length);
+
+ if (offset > max_len) {
+ SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n",
+ offset, max_len);
+ rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET;
+ return -1;
+ }
+ max_len -= (uint32_t)offset;
+
+ if (length > max_len) {
+ SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
+ length, max_len);
+ rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID;
+ return -1;
+ }
+
+ req->data = tcp_req->buf + offset;
+ req->data_from_pool = false;
+ req->length = length;
+
+ if (spdk_unlikely(req->dif.dif_insert_or_strip)) {
+ length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx);
+ req->dif.elba_length = length;
+ }
+
+ req->iov[0].iov_base = req->data;
+ req->iov[0].iov_len = length;
+ req->iovcnt = 1;
+
+ return 0;
+ }
+
+ SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n",
+ sgl->generic.type, sgl->generic.subtype);
+ rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID;
+ return -1;
+}
+
+static inline enum spdk_nvme_media_error_status_code
+nvmf_tcp_dif_error_to_compl_status(uint8_t err_type)
+{
+ enum spdk_nvme_media_error_status_code result;
+
+ switch (err_type) {
+ case SPDK_DIF_REFTAG_ERROR:
+ result = SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR;
+ break;
+ case SPDK_DIF_APPTAG_ERROR:
+ result = SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR;
+ break;
+ case SPDK_DIF_GUARD_ERROR:
+ result = SPDK_NVME_SC_GUARD_CHECK_ERROR;
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ break;
+ }
+
+ return result;
+}
+
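+/*
+ * Build and queue a C2H DATA PDU for a completed read. The common header is
+ * sized first, the header digest flag is added if negotiated, the PDO is
+ * padded out to the host's CPDA alignment, and the data digest flag is set
+ * when data digests are enabled. When DIF insert/strip is active, the
+ * payload is verified before the PDU is written to the socket.
+ */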
+static void
+nvmf_tcp_send_c2h_data(struct spdk_nvmf_tcp_qpair *tqpair,
+ struct spdk_nvmf_tcp_req *tcp_req)
+{
+ struct nvme_tcp_pdu *rsp_pdu;
+ struct spdk_nvme_tcp_c2h_data_hdr *c2h_data;
+ uint32_t plen, pdo, alignment;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n");
+
+ rsp_pdu = nvmf_tcp_req_pdu_init(tcp_req);
+ assert(rsp_pdu != NULL);
+
+ c2h_data = &rsp_pdu->hdr.c2h_data;
+ c2h_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_C2H_DATA;
+ plen = c2h_data->common.hlen = sizeof(*c2h_data);
+
+ if (tqpair->host_hdgst_enable) {
+ plen += SPDK_NVME_TCP_DIGEST_LEN;
+ c2h_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF;
+ }
+
+ /* Set the PDU-specific header (PSH) fields. */
+ c2h_data->cccid = tcp_req->req.cmd->nvme_cmd.cid;
+ c2h_data->datal = tcp_req->req.length;
+ c2h_data->datao = 0;
+
+ /* set the padding */
+ rsp_pdu->padding_len = 0;
+ pdo = plen;
+ if (tqpair->cpda) {
+ alignment = (tqpair->cpda + 1) << 2;
+ if (alignment > plen) {
+ rsp_pdu->padding_len = alignment - plen;
+ pdo = plen = alignment;
+ }
+ }
+
+ c2h_data->common.pdo = pdo;
+ plen += c2h_data->datal;
+ if (tqpair->host_ddgst_enable) {
+ c2h_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF;
+ plen += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ c2h_data->common.plen = plen;
+
+ if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) {
+ rsp_pdu->dif_ctx = &tcp_req->req.dif.dif_ctx;
+ }
+
+ nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->req.iov, tcp_req->req.iovcnt,
+ c2h_data->datao, c2h_data->datal);
+
+ if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) {
+ struct spdk_nvme_cpl *rsp = &tcp_req->req.rsp->nvme_cpl;
+ struct spdk_dif_error err_blk = {};
+
+ rc = spdk_dif_verify_stream(rsp_pdu->data_iov, rsp_pdu->data_iovcnt,
+ 0, rsp_pdu->data_len, rsp_pdu->dif_ctx, &err_blk);
+ if (rc != 0) {
+ SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
+ err_blk.err_type, err_blk.err_offset);
+ rsp->status.sct = SPDK_NVME_SCT_MEDIA_ERROR;
+ rsp->status.sc = nvmf_tcp_dif_error_to_compl_status(err_blk.err_type);
+ nvmf_tcp_req_pdu_fini(tcp_req);
+ nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair);
+ return;
+ }
+ }
+
+ c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
+ if (tqpair->qpair.transport->opts.c2h_success) {
+ c2h_data->common.flags |= SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
+ }
+
+ nvmf_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvmf_tcp_pdu_c2h_data_complete, tcp_req);
+}
+
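+/*
+ * Complete a request back to the host: advance the queue's sq_head (wrapping
+ * at sq_head_max), then either send C2H data for a successful read or send a
+ * capsule response PDU for everything else.
+ */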
+static int
+request_transfer_out(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_tcp_req *tcp_req;
+ struct spdk_nvmf_qpair *qpair;
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ struct spdk_nvme_cpl *rsp;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "enter\n");
+
+ qpair = req->qpair;
+ rsp = &req->rsp->nvme_cpl;
+ tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req);
+
+ /* Advance our sq_head pointer */
+ if (qpair->sq_head == qpair->sq_head_max) {
+ qpair->sq_head = 0;
+ } else {
+ qpair->sq_head++;
+ }
+ rsp->sqhd = qpair->sq_head;
+
+ tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair);
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST);
+ if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) {
+ nvmf_tcp_send_c2h_data(tqpair, tcp_req);
+ } else {
+ nvmf_tcp_send_capsule_resp_pdu(tcp_req, tqpair);
+ }
+
+ return 0;
+}
+
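+/*
+ * A command capsule carries in-capsule data when its total PDU length
+ * differs from the bare header length (plus header digest, if enabled);
+ * flag the request accordingly so the payload is read from the same PDU.
+ */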
+static void
+nvmf_tcp_set_incapsule_data(struct spdk_nvmf_tcp_qpair *tqpair,
+ struct spdk_nvmf_tcp_req *tcp_req)
+{
+ struct nvme_tcp_pdu *pdu;
+ uint32_t plen = 0;
+
+ pdu = &tqpair->pdu_in_progress;
+ plen = pdu->hdr.common.hlen;
+
+ if (tqpair->host_hdgst_enable) {
+ plen += SPDK_NVME_TCP_DIGEST_LEN;
+ }
+
+ if (pdu->hdr.common.plen != plen) {
+ tcp_req->has_incapsule_data = true;
+ }
+}
+
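+/*
+ * Drive the per-request state machine. The loop below keeps advancing the
+ * request through its states (NEW -> NEED_BUFFER -> ... -> COMPLETED) until
+ * a pass makes no further transition; returns true if any progress was made.
+ */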
+static bool
+nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
+ struct spdk_nvmf_tcp_req *tcp_req)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ int rc;
+ enum spdk_nvmf_tcp_req_state prev_state;
+ bool progress = false;
+ struct spdk_nvmf_transport *transport = &ttransport->transport;
+ struct spdk_nvmf_transport_poll_group *group;
+
+ tqpair = SPDK_CONTAINEROF(tcp_req->req.qpair, struct spdk_nvmf_tcp_qpair, qpair);
+ group = &tqpair->group->group;
+ assert(tcp_req->state != TCP_REQUEST_STATE_FREE);
+
+ /* If the qpair is not active, we need to abort the outstanding requests. */
+ if (tqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) {
+ if (tcp_req->state == TCP_REQUEST_STATE_NEED_BUFFER) {
+ STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link);
+ }
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_COMPLETED);
+ }
+
+ /* The loop here is to allow for several back-to-back state changes. */
+ do {
+ prev_state = tcp_req->state;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Request %p entering state %d on tqpair=%p\n", tcp_req, prev_state,
+ tqpair);
+
+ switch (tcp_req->state) {
+ case TCP_REQUEST_STATE_FREE:
+ /* Some external code must kick a request into TCP_REQUEST_STATE_NEW
+ * to escape this state. */
+ break;
+ case TCP_REQUEST_STATE_NEW:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_NEW, 0, 0, (uintptr_t)tcp_req, 0);
+
+ /* copy the cmd from the receive pdu */
+ tcp_req->cmd = tqpair->pdu_in_progress.hdr.capsule_cmd.ccsqe;
+
+ if (spdk_unlikely(spdk_nvmf_request_get_dif_ctx(&tcp_req->req, &tcp_req->req.dif.dif_ctx))) {
+ tcp_req->req.dif.dif_insert_or_strip = true;
+ tqpair->pdu_in_progress.dif_ctx = &tcp_req->req.dif.dif_ctx;
+ }
+
+ /* The next state transition depends on the data transfer needs of this request. */
+ tcp_req->req.xfer = spdk_nvmf_req_get_xfer(&tcp_req->req);
+
+ /* If no data to transfer, ready to execute. */
+ if (tcp_req->req.xfer == SPDK_NVME_DATA_NONE) {
+ /* Reset the tqpair's receiving PDU state */
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE);
+ break;
+ }
+
+ nvmf_tcp_set_incapsule_data(tqpair, tcp_req);
+
+ if (!tcp_req->has_incapsule_data) {
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY);
+ }
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_NEED_BUFFER);
+ STAILQ_INSERT_TAIL(&group->pending_buf_queue, &tcp_req->req, buf_link);
+ break;
+ case TCP_REQUEST_STATE_NEED_BUFFER:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_NEED_BUFFER, 0, 0, (uintptr_t)tcp_req, 0);
+
+ assert(tcp_req->req.xfer != SPDK_NVME_DATA_NONE);
+
+ if (!tcp_req->has_incapsule_data && (&tcp_req->req != STAILQ_FIRST(&group->pending_buf_queue))) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP,
+ "Not the first element to wait for the buf for tcp_req(%p) on tqpair=%p\n",
+ tcp_req, tqpair);
+ /* This request needs to wait in line to obtain a buffer */
+ break;
+ }
+
+ /* Try to get a data buffer */
+ rc = nvmf_tcp_req_parse_sgl(tcp_req, transport, group);
+ if (rc < 0) {
+ STAILQ_REMOVE_HEAD(&group->pending_buf_queue, buf_link);
+ /* Reset the tqpair's receiving PDU state */
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR);
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE);
+ break;
+ }
+
+ if (!tcp_req->req.data) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "No buffer allocated for tcp_req(%p) on tqpair(%p\n)",
+ tcp_req, tqpair);
+ /* No buffers available. */
+ break;
+ }
+
+ STAILQ_REMOVE(&group->pending_buf_queue, &tcp_req->req, spdk_nvmf_request, buf_link);
+
+ /* If the command transfers data from host to controller, the data must be fetched from the host first. */
+ if (tcp_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) {
+ if (tcp_req->req.data_from_pool) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Sending R2T for tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair);
+ nvmf_tcp_send_r2t_pdu(tqpair, tcp_req);
+ } else {
+ struct nvme_tcp_pdu *pdu;
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER);
+
+ pdu = &tqpair->pdu_in_progress;
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Not need to send r2t for tcp_req(%p) on tqpair=%p\n", tcp_req,
+ tqpair);
+ /* No need to send r2t, contained in the capsuled data */
+ nvme_tcp_pdu_set_data_buf(pdu, tcp_req->req.iov, tcp_req->req.iovcnt,
+ 0, tcp_req->req.length);
+ nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
+ }
+ break;
+ }
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_EXECUTE);
+ break;
+ case TCP_REQUEST_STATE_AWAITING_R2T_ACK:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_AWAIT_R2T_ACK, 0, 0, (uintptr_t)tcp_req, 0);
+ /* The R2T completion or incoming H2C data will kick it out of this state. */
+ break;
+ case TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
+
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0,
+ (uintptr_t)tcp_req, 0);
+ /* Some external code must kick a request into TCP_REQUEST_STATE_READY_TO_EXECUTE
+ * to escape this state. */
+ break;
+ case TCP_REQUEST_STATE_READY_TO_EXECUTE:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, (uintptr_t)tcp_req, 0);
+
+ if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) {
+ assert(tcp_req->req.dif.elba_length >= tcp_req->req.length);
+ tcp_req->req.length = tcp_req->req.dif.elba_length;
+ }
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTING);
+ spdk_nvmf_request_exec(&tcp_req->req);
+ break;
+ case TCP_REQUEST_STATE_EXECUTING:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTING, 0, 0, (uintptr_t)tcp_req, 0);
+ /* Some external code must kick a request into TCP_REQUEST_STATE_EXECUTED
+ * to escape this state. */
+ break;
+ case TCP_REQUEST_STATE_EXECUTED:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_EXECUTED, 0, 0, (uintptr_t)tcp_req, 0);
+
+ if (spdk_unlikely(tcp_req->req.dif.dif_insert_or_strip)) {
+ tcp_req->req.length = tcp_req->req.dif.orig_length;
+ }
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_READY_TO_COMPLETE);
+ break;
+ case TCP_REQUEST_STATE_READY_TO_COMPLETE:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, (uintptr_t)tcp_req, 0);
+ rc = request_transfer_out(&tcp_req->req);
+ assert(rc == 0); /* No good way to handle this currently */
+ break;
+ case TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0,
+ (uintptr_t)tcp_req,
+ 0);
+ /* Some external code must kick a request into TCP_REQUEST_STATE_COMPLETED
+ * to escape this state. */
+ break;
+ case TCP_REQUEST_STATE_COMPLETED:
+ spdk_trace_record(TRACE_TCP_REQUEST_STATE_COMPLETED, 0, 0, (uintptr_t)tcp_req, 0);
+ if (tcp_req->req.data_from_pool) {
+ spdk_nvmf_request_free_buffers(&tcp_req->req, group, transport);
+ }
+ tcp_req->req.length = 0;
+ tcp_req->req.iovcnt = 0;
+ tcp_req->req.data = NULL;
+
+ nvmf_tcp_req_pdu_fini(tcp_req);
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_FREE);
+ break;
+ case TCP_REQUEST_NUM_STATES:
+ default:
+ assert(0);
+ break;
+ }
+
+ if (tcp_req->state != prev_state) {
+ progress = true;
+ }
+ } while (tcp_req->state != prev_state);
+
+ return progress;
+}
+
+static void
+nvmf_tcp_sock_cb(void *arg, struct spdk_sock_group *group, struct spdk_sock *sock)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair = arg;
+ int rc;
+
+ assert(tqpair != NULL);
+ rc = nvmf_tcp_sock_process(tqpair);
+
+ /* If there was a new socket error, disconnect */
+ if (rc < 0) {
+ nvmf_tcp_qpair_disconnect(tqpair);
+ }
+}
+
+static int
+nvmf_tcp_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_tcp_poll_group *tgroup;
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ int rc;
+
+ tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group);
+ tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair);
+
+ rc = spdk_sock_group_add_sock(tgroup->sock_group, tqpair->sock,
+ nvmf_tcp_sock_cb, tqpair);
+ if (rc != 0) {
+ SPDK_ERRLOG("Could not add sock to sock_group: %s (%d)\n",
+ spdk_strerror(errno), errno);
+ return -1;
+ }
+
+ rc = nvmf_tcp_qpair_sock_init(tqpair);
+ if (rc != 0) {
+ SPDK_ERRLOG("Cannot set sock opt for tqpair=%p\n", tqpair);
+ return -1;
+ }
+
+ rc = nvmf_tcp_qpair_init(&tqpair->qpair);
+ if (rc < 0) {
+ SPDK_ERRLOG("Cannot init tqpair=%p\n", tqpair);
+ return -1;
+ }
+
+ rc = nvmf_tcp_qpair_init_mem_resource(tqpair);
+ if (rc < 0) {
+ SPDK_ERRLOG("Cannot init memory resource info for tqpair=%p\n", tqpair);
+ return -1;
+ }
+
+ tqpair->group = tgroup;
+ tqpair->state = NVME_TCP_QPAIR_STATE_INVALID;
+ TAILQ_INSERT_TAIL(&tgroup->qpairs, tqpair, link);
+
+ return 0;
+}
+
+static int
+nvmf_tcp_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_tcp_poll_group *tgroup;
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ int rc;
+
+ tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group);
+ tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair);
+
+ assert(tqpair->group == tgroup);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "remove tqpair=%p from the tgroup=%p\n", tqpair, tgroup);
+ if (tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_REQ) {
+ TAILQ_REMOVE(&tgroup->await_req, tqpair, link);
+ } else {
+ TAILQ_REMOVE(&tgroup->qpairs, tqpair, link);
+ }
+
+ rc = spdk_sock_group_remove_sock(tgroup->sock_group, tqpair->sock);
+ if (rc != 0) {
+ SPDK_ERRLOG("Could not remove sock from sock_group: %s (%d)\n",
+ spdk_strerror(errno), errno);
+ }
+
+ return rc;
+}
+
+static int
+nvmf_tcp_req_complete(struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_tcp_transport *ttransport;
+ struct spdk_nvmf_tcp_req *tcp_req;
+
+ ttransport = SPDK_CONTAINEROF(req->qpair->transport, struct spdk_nvmf_tcp_transport, transport);
+ tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req);
+
+ nvmf_tcp_req_set_state(tcp_req, TCP_REQUEST_STATE_EXECUTED);
+ nvmf_tcp_req_process(ttransport, tcp_req);
+
+ return 0;
+}
+
+static void
+nvmf_tcp_close_qpair(struct spdk_nvmf_qpair *qpair)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVMF_TCP, "Qpair: %p\n", qpair);
+
+ tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair);
+ tqpair->state = NVME_TCP_QPAIR_STATE_EXITED;
+ nvmf_tcp_qpair_destroy(tqpair);
+}
+
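+/*
+ * Poll one TCP poll group: retry requests still waiting for data buffers,
+ * poll the socket group for I/O, and then service qpairs that are parked in
+ * the AWAIT_REQ receive state.
+ */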
+static int
+nvmf_tcp_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
+{
+ struct spdk_nvmf_tcp_poll_group *tgroup;
+ int rc;
+ struct spdk_nvmf_request *req, *req_tmp;
+ struct spdk_nvmf_tcp_req *tcp_req;
+ struct spdk_nvmf_tcp_qpair *tqpair, *tqpair_tmp;
+ struct spdk_nvmf_tcp_transport *ttransport = SPDK_CONTAINEROF(group->transport,
+ struct spdk_nvmf_tcp_transport, transport);
+
+ tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group);
+
+ if (spdk_unlikely(TAILQ_EMPTY(&tgroup->qpairs) && TAILQ_EMPTY(&tgroup->await_req))) {
+ return 0;
+ }
+
+ STAILQ_FOREACH_SAFE(req, &group->pending_buf_queue, buf_link, req_tmp) {
+ tcp_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_tcp_req, req);
+ if (nvmf_tcp_req_process(ttransport, tcp_req) == false) {
+ break;
+ }
+ }
+
+ rc = spdk_sock_group_poll(tgroup->sock_group);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to poll sock_group=%p\n", tgroup->sock_group);
+ }
+
+ TAILQ_FOREACH_SAFE(tqpair, &tgroup->await_req, link, tqpair_tmp) {
+ nvmf_tcp_sock_process(tqpair);
+ }
+
+ return rc;
+}
+
+static int
+nvmf_tcp_qpair_get_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid, bool peer)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ uint16_t port;
+
+ tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair);
+ spdk_nvme_trid_populate_transport(trid, SPDK_NVME_TRANSPORT_TCP);
+
+ if (peer) {
+ snprintf(trid->traddr, sizeof(trid->traddr), "%s", tqpair->initiator_addr);
+ port = tqpair->initiator_port;
+ } else {
+ snprintf(trid->traddr, sizeof(trid->traddr), "%s", tqpair->target_addr);
+ port = tqpair->target_port;
+ }
+
+ if (spdk_sock_is_ipv4(tqpair->sock)) {
+ trid->adrfam = SPDK_NVMF_ADRFAM_IPV4;
+ } else if (spdk_sock_is_ipv6(tqpair->sock)) {
+ trid->adrfam = SPDK_NVMF_ADRFAM_IPV6;
+ } else {
+ return -1;
+ }
+
+ snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%d", port);
+ return 0;
+}
+
+static int
+nvmf_tcp_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return nvmf_tcp_qpair_get_trid(qpair, trid, 0);
+}
+
+static int
+nvmf_tcp_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return nvmf_tcp_qpair_get_trid(qpair, trid, 1);
+}
+
+static int
+nvmf_tcp_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return nvmf_tcp_qpair_get_trid(qpair, trid, 0);
+}
+
+static void
+nvmf_tcp_req_set_abort_status(struct spdk_nvmf_request *req,
+ struct spdk_nvmf_tcp_req *tcp_req_to_abort)
+{
+ tcp_req_to_abort->req.rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ tcp_req_to_abort->req.rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST;
+
+ nvmf_tcp_req_set_state(tcp_req_to_abort, TCP_REQUEST_STATE_READY_TO_COMPLETE);
+
+ req->rsp->nvme_cpl.cdw0 &= ~1U; /* Command was successfully aborted. */
+}
+
+static int
+_nvmf_tcp_qpair_abort_request(void *ctx)
+{
+ struct spdk_nvmf_request *req = ctx;
+ struct spdk_nvmf_tcp_req *tcp_req_to_abort = SPDK_CONTAINEROF(req->req_to_abort,
+ struct spdk_nvmf_tcp_req, req);
+ struct spdk_nvmf_tcp_qpair *tqpair = SPDK_CONTAINEROF(req->req_to_abort->qpair,
+ struct spdk_nvmf_tcp_qpair, qpair);
+ int rc;
+
+ spdk_poller_unregister(&req->poller);
+
+ switch (tcp_req_to_abort->state) {
+ case TCP_REQUEST_STATE_EXECUTING:
+ rc = nvmf_ctrlr_abort_request(req);
+ if (rc == SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS) {
+ return SPDK_POLLER_BUSY;
+ }
+ break;
+
+ case TCP_REQUEST_STATE_NEED_BUFFER:
+ STAILQ_REMOVE(&tqpair->group->group.pending_buf_queue,
+ &tcp_req_to_abort->req, spdk_nvmf_request, buf_link);
+
+ nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort);
+ break;
+
+ case TCP_REQUEST_STATE_AWAITING_R2T_ACK:
+ nvmf_tcp_req_set_abort_status(req, tcp_req_to_abort);
+ break;
+
+ case TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER:
+ if (spdk_get_ticks() < req->timeout_tsc) {
+ req->poller = SPDK_POLLER_REGISTER(_nvmf_tcp_qpair_abort_request, req, 0);
+ return SPDK_POLLER_BUSY;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ spdk_nvmf_request_complete(req);
+ return SPDK_POLLER_BUSY;
+}
+
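+/*
+ * Handle an Abort command: look up the outstanding request with the CID from
+ * cdw10 on this qpair, record an abort deadline based on abort_timeout_sec,
+ * and kick the poller-driven abort helper above.
+ */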
+static void
+nvmf_tcp_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvmf_request *req)
+{
+ struct spdk_nvmf_tcp_qpair *tqpair;
+ struct spdk_nvmf_tcp_transport *ttransport;
+ struct spdk_nvmf_transport *transport;
+ uint16_t cid;
+ uint32_t i;
+ struct spdk_nvmf_tcp_req *tcp_req_to_abort = NULL;
+
+ tqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_tcp_qpair, qpair);
+ ttransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_tcp_transport, transport);
+ transport = &ttransport->transport;
+
+ cid = req->cmd->nvme_cmd.cdw10_bits.abort.cid;
+
+ for (i = 0; i < tqpair->resource_count; i++) {
+ tcp_req_to_abort = &tqpair->reqs[i];
+
+ if (tcp_req_to_abort->state != TCP_REQUEST_STATE_FREE &&
+ tcp_req_to_abort->req.cmd->nvme_cmd.cid == cid) {
+ break;
+ }
+
+ /* Clear the pointer on a miss so the NULL check below detects the case
+ * where no outstanding command matches the CID. */
+ tcp_req_to_abort = NULL;
+ }
+
+ if (tcp_req_to_abort == NULL) {
+ spdk_nvmf_request_complete(req);
+ return;
+ }
+
+ req->req_to_abort = &tcp_req_to_abort->req;
+ req->timeout_tsc = spdk_get_ticks() +
+ transport->opts.abort_timeout_sec * spdk_get_ticks_hz();
+ req->poller = NULL;
+
+ _nvmf_tcp_qpair_abort_request(req);
+}
+
+#define SPDK_NVMF_TCP_DEFAULT_MAX_QUEUE_DEPTH 128
+#define SPDK_NVMF_TCP_DEFAULT_AQ_DEPTH 128
+#define SPDK_NVMF_TCP_DEFAULT_MAX_QPAIRS_PER_CTRLR 128
+#define SPDK_NVMF_TCP_DEFAULT_IN_CAPSULE_DATA_SIZE 4096
+#define SPDK_NVMF_TCP_DEFAULT_MAX_IO_SIZE 131072
+#define SPDK_NVMF_TCP_DEFAULT_IO_UNIT_SIZE 131072
+#define SPDK_NVMF_TCP_DEFAULT_NUM_SHARED_BUFFERS 511
+#define SPDK_NVMF_TCP_DEFAULT_BUFFER_CACHE_SIZE 32
+#define SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION true
+#define SPDK_NVMF_TCP_DEFAULT_DIF_INSERT_OR_STRIP false
+#define SPDK_NVMF_TCP_DEFAULT_SOCK_PRIORITY 0
+#define SPDK_NVMF_TCP_DEFAULT_ABORT_TIMEOUT_SEC 1
+
+static void
+nvmf_tcp_opts_init(struct spdk_nvmf_transport_opts *opts)
+{
+ opts->max_queue_depth = SPDK_NVMF_TCP_DEFAULT_MAX_QUEUE_DEPTH;
+ opts->max_qpairs_per_ctrlr = SPDK_NVMF_TCP_DEFAULT_MAX_QPAIRS_PER_CTRLR;
+ opts->in_capsule_data_size = SPDK_NVMF_TCP_DEFAULT_IN_CAPSULE_DATA_SIZE;
+ opts->max_io_size = SPDK_NVMF_TCP_DEFAULT_MAX_IO_SIZE;
+ opts->io_unit_size = SPDK_NVMF_TCP_DEFAULT_IO_UNIT_SIZE;
+ opts->max_aq_depth = SPDK_NVMF_TCP_DEFAULT_AQ_DEPTH;
+ opts->num_shared_buffers = SPDK_NVMF_TCP_DEFAULT_NUM_SHARED_BUFFERS;
+ opts->buf_cache_size = SPDK_NVMF_TCP_DEFAULT_BUFFER_CACHE_SIZE;
+ opts->c2h_success = SPDK_NVMF_TCP_DEFAULT_SUCCESS_OPTIMIZATION;
+ opts->dif_insert_or_strip = SPDK_NVMF_TCP_DEFAULT_DIF_INSERT_OR_STRIP;
+ opts->sock_priority = SPDK_NVMF_TCP_DEFAULT_SOCK_PRIORITY;
+ opts->abort_timeout_sec = SPDK_NVMF_TCP_DEFAULT_ABORT_TIMEOUT_SEC;
+}
+
+const struct spdk_nvmf_transport_ops spdk_nvmf_transport_tcp = {
+ .name = "TCP",
+ .type = SPDK_NVME_TRANSPORT_TCP,
+ .opts_init = nvmf_tcp_opts_init,
+ .create = nvmf_tcp_create,
+ .destroy = nvmf_tcp_destroy,
+
+ .listen = nvmf_tcp_listen,
+ .stop_listen = nvmf_tcp_stop_listen,
+ .accept = nvmf_tcp_accept,
+
+ .listener_discover = nvmf_tcp_discover,
+
+ .poll_group_create = nvmf_tcp_poll_group_create,
+ .get_optimal_poll_group = nvmf_tcp_get_optimal_poll_group,
+ .poll_group_destroy = nvmf_tcp_poll_group_destroy,
+ .poll_group_add = nvmf_tcp_poll_group_add,
+ .poll_group_remove = nvmf_tcp_poll_group_remove,
+ .poll_group_poll = nvmf_tcp_poll_group_poll,
+
+ .req_free = nvmf_tcp_req_free,
+ .req_complete = nvmf_tcp_req_complete,
+
+ .qpair_fini = nvmf_tcp_close_qpair,
+ .qpair_get_local_trid = nvmf_tcp_qpair_get_local_trid,
+ .qpair_get_peer_trid = nvmf_tcp_qpair_get_peer_trid,
+ .qpair_get_listen_trid = nvmf_tcp_qpair_get_listen_trid,
+ .qpair_abort_request = nvmf_tcp_qpair_abort_request,
+};
+
+SPDK_NVMF_TRANSPORT_REGISTER(tcp, &spdk_nvmf_transport_tcp);
+SPDK_LOG_REGISTER_COMPONENT("nvmf_tcp", SPDK_LOG_NVMF_TCP)
diff --git a/src/spdk/lib/nvmf/transport.c b/src/spdk/lib/nvmf/transport.c
new file mode 100644
index 000000000..11bb152df
--- /dev/null
+++ b/src/spdk/lib/nvmf/transport.c
@@ -0,0 +1,572 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "nvmf_internal.h"
+#include "transport.h"
+
+#include "spdk/config.h"
+#include "spdk/log.h"
+#include "spdk/nvmf.h"
+#include "spdk/nvmf_transport.h"
+#include "spdk/queue.h"
+#include "spdk/util.h"
+
+#define MAX_MEMPOOL_NAME_LENGTH 40
+
+struct nvmf_transport_ops_list_element {
+ struct spdk_nvmf_transport_ops ops;
+ TAILQ_ENTRY(nvmf_transport_ops_list_element) link;
+};
+
+TAILQ_HEAD(nvmf_transport_ops_list, nvmf_transport_ops_list_element)
+g_spdk_nvmf_transport_ops = TAILQ_HEAD_INITIALIZER(g_spdk_nvmf_transport_ops);
+
+static inline const struct spdk_nvmf_transport_ops *
+nvmf_get_transport_ops(const char *transport_name)
+{
+ struct nvmf_transport_ops_list_element *ops;
+ TAILQ_FOREACH(ops, &g_spdk_nvmf_transport_ops, link) {
+ if (strcasecmp(transport_name, ops->ops.name) == 0) {
+ return &ops->ops;
+ }
+ }
+ return NULL;
+}
+
+void
+spdk_nvmf_transport_register(const struct spdk_nvmf_transport_ops *ops)
+{
+ struct nvmf_transport_ops_list_element *new_ops;
+
+ if (nvmf_get_transport_ops(ops->name) != NULL) {
+ SPDK_ERRLOG("Double registering nvmf transport type %s.\n", ops->name);
+ assert(false);
+ return;
+ }
+
+ new_ops = calloc(1, sizeof(*new_ops));
+ if (new_ops == NULL) {
+ SPDK_ERRLOG("Unable to allocate memory to register new transport type %s.\n", ops->name);
+ assert(false);
+ return;
+ }
+
+ new_ops->ops = *ops;
+
+ TAILQ_INSERT_TAIL(&g_spdk_nvmf_transport_ops, new_ops, link);
+}
+
+const struct spdk_nvmf_transport_opts *
+spdk_nvmf_get_transport_opts(struct spdk_nvmf_transport *transport)
+{
+ return &transport->opts;
+}
+
+spdk_nvme_transport_type_t
+spdk_nvmf_get_transport_type(struct spdk_nvmf_transport *transport)
+{
+ return transport->ops->type;
+}
+
+const char *
+spdk_nvmf_get_transport_name(struct spdk_nvmf_transport *transport)
+{
+ return transport->ops->name;
+}
+
+struct spdk_nvmf_transport *
+spdk_nvmf_transport_create(const char *transport_name, struct spdk_nvmf_transport_opts *opts)
+{
+ const struct spdk_nvmf_transport_ops *ops = NULL;
+ struct spdk_nvmf_transport *transport;
+ char spdk_mempool_name[MAX_MEMPOOL_NAME_LENGTH];
+ int chars_written;
+
+ ops = nvmf_get_transport_ops(transport_name);
+ if (!ops) {
+ SPDK_ERRLOG("Transport type '%s' unavailable.\n", transport_name);
+ return NULL;
+ }
+
+ if (opts->max_aq_depth < SPDK_NVMF_MIN_ADMIN_MAX_SQ_SIZE) {
+ SPDK_ERRLOG("max_aq_depth %u is less than minimum defined by NVMf spec, use min value\n",
+ opts->max_aq_depth);
+ opts->max_aq_depth = SPDK_NVMF_MIN_ADMIN_MAX_SQ_SIZE;
+ }
+
+ transport = ops->create(opts);
+ if (!transport) {
+ SPDK_ERRLOG("Unable to create new transport of type %s\n", transport_name);
+ return NULL;
+ }
+
+ TAILQ_INIT(&transport->listeners);
+
+ transport->ops = ops;
+ transport->opts = *opts;
+ chars_written = snprintf(spdk_mempool_name, MAX_MEMPOOL_NAME_LENGTH, "%s_%s_%s", "spdk_nvmf",
+ transport_name, "data");
+ if (chars_written < 0) {
+ SPDK_ERRLOG("Unable to generate transport data buffer pool name.\n");
+ ops->destroy(transport);
+ return NULL;
+ }
+
+ transport->data_buf_pool = spdk_mempool_create(spdk_mempool_name,
+ opts->num_shared_buffers,
+ opts->io_unit_size + NVMF_DATA_BUFFER_ALIGNMENT,
+ SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
+ SPDK_ENV_SOCKET_ID_ANY);
+
+ if (!transport->data_buf_pool) {
+ SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n");
+ ops->destroy(transport);
+ return NULL;
+ }
+
+ return transport;
+}
+
+struct spdk_nvmf_transport *
+spdk_nvmf_transport_get_first(struct spdk_nvmf_tgt *tgt)
+{
+ return TAILQ_FIRST(&tgt->transports);
+}
+
+struct spdk_nvmf_transport *
+spdk_nvmf_transport_get_next(struct spdk_nvmf_transport *transport)
+{
+ return TAILQ_NEXT(transport, link);
+}
+
+int
+spdk_nvmf_transport_destroy(struct spdk_nvmf_transport *transport)
+{
+ if (transport->data_buf_pool != NULL) {
+ if (spdk_mempool_count(transport->data_buf_pool) !=
+ transport->opts.num_shared_buffers) {
+ SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n",
+ spdk_mempool_count(transport->data_buf_pool),
+ transport->opts.num_shared_buffers);
+ }
+ }
+
+ spdk_mempool_free(transport->data_buf_pool);
+
+ return transport->ops->destroy(transport);
+}
+
+struct spdk_nvmf_listener *
+nvmf_transport_find_listener(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_listener *listener;
+
+ TAILQ_FOREACH(listener, &transport->listeners, link) {
+ if (spdk_nvme_transport_id_compare(&listener->trid, trid) == 0) {
+ return listener;
+ }
+ }
+
+ return NULL;
+}
+
+int
+spdk_nvmf_transport_listen(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_listener *listener;
+ int rc;
+
+ listener = nvmf_transport_find_listener(transport, trid);
+ if (!listener) {
+ listener = calloc(1, sizeof(*listener));
+ if (!listener) {
+ return -ENOMEM;
+ }
+
+ listener->ref = 1;
+ listener->trid = *trid;
+ TAILQ_INSERT_TAIL(&transport->listeners, listener, link);
+
+ rc = transport->ops->listen(transport, &listener->trid);
+ if (rc != 0) {
+ TAILQ_REMOVE(&transport->listeners, listener, link);
+ free(listener);
+ }
+ return rc;
+ }
+
+ ++listener->ref;
+
+ return 0;
+}
+
+int
+spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport,
+ const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvmf_listener *listener;
+
+ listener = nvmf_transport_find_listener(transport, trid);
+ if (!listener) {
+ return -ENOENT;
+ }
+
+ if (--listener->ref == 0) {
+ TAILQ_REMOVE(&transport->listeners, listener, link);
+ transport->ops->stop_listen(transport, trid);
+ free(listener);
+ }
+
+ return 0;
+}
+
+uint32_t
+nvmf_transport_accept(struct spdk_nvmf_transport *transport)
+{
+ return transport->ops->accept(transport);
+}
+
+void
+nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport,
+ struct spdk_nvme_transport_id *trid,
+ struct spdk_nvmf_discovery_log_page_entry *entry)
+{
+ transport->ops->listener_discover(transport, trid, entry);
+}
+
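+/*
+ * Create a transport-specific poll group and pre-populate its per-group
+ * buffer cache with up to buf_cache_size buffers pulled from the shared data
+ * buffer pool; running short is logged but not fatal.
+ */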
+struct spdk_nvmf_transport_poll_group *
+nvmf_transport_poll_group_create(struct spdk_nvmf_transport *transport)
+{
+ struct spdk_nvmf_transport_poll_group *group;
+ struct spdk_nvmf_transport_pg_cache_buf *buf;
+
+ group = transport->ops->poll_group_create(transport);
+ if (!group) {
+ return NULL;
+ }
+ group->transport = transport;
+
+ STAILQ_INIT(&group->pending_buf_queue);
+ STAILQ_INIT(&group->buf_cache);
+
+ if (transport->opts.buf_cache_size) {
+ group->buf_cache_count = 0;
+ group->buf_cache_size = transport->opts.buf_cache_size;
+ while (group->buf_cache_count < group->buf_cache_size) {
+ buf = (struct spdk_nvmf_transport_pg_cache_buf *)spdk_mempool_get(transport->data_buf_pool);
+ if (!buf) {
+ SPDK_NOTICELOG("Unable to reserve the full number of buffers for the pg buffer cache.\n");
+ break;
+ }
+ STAILQ_INSERT_HEAD(&group->buf_cache, buf, link);
+ group->buf_cache_count++;
+ }
+ }
+ return group;
+}
+
+struct spdk_nvmf_transport_poll_group *
+nvmf_transport_get_optimal_poll_group(struct spdk_nvmf_transport *transport,
+ struct spdk_nvmf_qpair *qpair)
+{
+ if (transport->ops->get_optimal_poll_group) {
+ return transport->ops->get_optimal_poll_group(qpair);
+ } else {
+ return NULL;
+ }
+}
+
+void
+nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group)
+{
+ struct spdk_nvmf_transport_pg_cache_buf *buf, *tmp;
+
+ if (!STAILQ_EMPTY(&group->pending_buf_queue)) {
+ SPDK_ERRLOG("Pending I/O list wasn't empty on poll group destruction\n");
+ }
+
+ STAILQ_FOREACH_SAFE(buf, &group->buf_cache, link, tmp) {
+ STAILQ_REMOVE(&group->buf_cache, buf, spdk_nvmf_transport_pg_cache_buf, link);
+ spdk_mempool_put(group->transport->data_buf_pool, buf);
+ }
+ group->transport->ops->poll_group_destroy(group);
+}
+
+int
+nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair)
+{
+ if (qpair->transport) {
+ assert(qpair->transport == group->transport);
+ if (qpair->transport != group->transport) {
+ return -1;
+ }
+ } else {
+ qpair->transport = group->transport;
+ }
+
+ return group->transport->ops->poll_group_add(group, qpair);
+}
+
+int
+nvmf_transport_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair)
+{
+ int rc = ENOTSUP;
+
+ assert(qpair->transport == group->transport);
+ if (group->transport->ops->poll_group_remove) {
+ rc = group->transport->ops->poll_group_remove(group, qpair);
+ }
+
+ return rc;
+}
+
+int
+nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group)
+{
+ return group->transport->ops->poll_group_poll(group);
+}
+
+int
+nvmf_transport_req_free(struct spdk_nvmf_request *req)
+{
+ return req->qpair->transport->ops->req_free(req);
+}
+
+int
+nvmf_transport_req_complete(struct spdk_nvmf_request *req)
+{
+ return req->qpair->transport->ops->req_complete(req);
+}
+
+void
+nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair)
+{
+ qpair->transport->ops->qpair_fini(qpair);
+}
+
+int
+nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return qpair->transport->ops->qpair_get_peer_trid(qpair, trid);
+}
+
+int
+nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return qpair->transport->ops->qpair_get_local_trid(qpair, trid);
+}
+
+int
+nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid)
+{
+ return qpair->transport->ops->qpair_get_listen_trid(qpair, trid);
+}
+
+void
+nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvmf_request *req)
+{
+ qpair->transport->ops->qpair_abort_request(qpair, req);
+}
+
+bool
+spdk_nvmf_transport_opts_init(const char *transport_name,
+ struct spdk_nvmf_transport_opts *opts)
+{
+ const struct spdk_nvmf_transport_ops *ops;
+
+ ops = nvmf_get_transport_ops(transport_name);
+ if (!ops) {
+ SPDK_ERRLOG("Transport type %s unavailable.\n", transport_name);
+ return false;
+ }
+
+ ops->opts_init(opts);
+ return true;
+}
+
+int
+spdk_nvmf_transport_poll_group_get_stat(struct spdk_nvmf_tgt *tgt,
+ struct spdk_nvmf_transport *transport,
+ struct spdk_nvmf_transport_poll_group_stat **stat)
+{
+ if (transport->ops->poll_group_get_stat) {
+ return transport->ops->poll_group_get_stat(tgt, stat);
+ } else {
+ return -ENOTSUP;
+ }
+}
+
+void
+spdk_nvmf_transport_poll_group_free_stat(struct spdk_nvmf_transport *transport,
+ struct spdk_nvmf_transport_poll_group_stat *stat)
+{
+ if (transport->ops->poll_group_free_stat) {
+ transport->ops->poll_group_free_stat(stat);
+ }
+}
+
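+/*
+ * Return a request's data buffers: each buffer goes back to the poll group's
+ * cache while there is room, otherwise back to the transport's shared
+ * mempool, and the request's iov/buffers entries are cleared.
+ */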
+void
+spdk_nvmf_request_free_buffers(struct spdk_nvmf_request *req,
+ struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_transport *transport)
+{
+ uint32_t i;
+
+ for (i = 0; i < req->iovcnt; i++) {
+ if (group->buf_cache_count < group->buf_cache_size) {
+ STAILQ_INSERT_HEAD(&group->buf_cache,
+ (struct spdk_nvmf_transport_pg_cache_buf *)req->buffers[i],
+ link);
+ group->buf_cache_count++;
+ } else {
+ spdk_mempool_put(transport->data_buf_pool, req->buffers[i]);
+ }
+ req->iov[i].iov_base = NULL;
+ req->buffers[i] = NULL;
+ req->iov[i].iov_len = 0;
+ }
+ req->data_from_pool = false;
+}
+
+static inline int
+nvmf_request_set_buffer(struct spdk_nvmf_request *req, void *buf, uint32_t length,
+ uint32_t io_unit_size)
+{
+ req->buffers[req->iovcnt] = buf;
+ req->iov[req->iovcnt].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) &
+ ~NVMF_DATA_BUFFER_MASK);
+ req->iov[req->iovcnt].iov_len = spdk_min(length, io_unit_size);
+ length -= req->iov[req->iovcnt].iov_len;
+ req->iovcnt++;
+
+ return length;
+}
+
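+/*
+ * Fill the request's iov with enough io_unit_size buffers to cover 'length'.
+ * Buffers are taken from the poll group's cache first and then in bulk from
+ * the transport's shared mempool; -ENOMEM is returned if the pool runs dry
+ * and -EINVAL if the I/O would need more than NVMF_REQ_MAX_BUFFERS entries.
+ */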
+static int
+nvmf_request_get_buffers(struct spdk_nvmf_request *req,
+ struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_transport *transport,
+ uint32_t length)
+{
+ uint32_t io_unit_size = transport->opts.io_unit_size;
+ uint32_t num_buffers;
+ uint32_t i = 0, j;
+ void *buffer, *buffers[NVMF_REQ_MAX_BUFFERS];
+
+ /* If the number of buffers is too large, then we know the I/O is larger than allowed.
+ * Fail it.
+ */
+ num_buffers = SPDK_CEIL_DIV(length, io_unit_size);
+ if (num_buffers + req->iovcnt > NVMF_REQ_MAX_BUFFERS) {
+ return -EINVAL;
+ }
+
+ while (i < num_buffers) {
+ if (!(STAILQ_EMPTY(&group->buf_cache))) {
+ group->buf_cache_count--;
+ buffer = STAILQ_FIRST(&group->buf_cache);
+ STAILQ_REMOVE_HEAD(&group->buf_cache, link);
+ assert(buffer != NULL);
+
+ length = nvmf_request_set_buffer(req, buffer, length, io_unit_size);
+ i++;
+ } else {
+ if (spdk_mempool_get_bulk(transport->data_buf_pool, buffers,
+ num_buffers - i)) {
+ return -ENOMEM;
+ }
+ for (j = 0; j < num_buffers - i; j++) {
+ length = nvmf_request_set_buffer(req, buffers[j], length, io_unit_size);
+ }
+ i += num_buffers - i;
+ }
+ }
+
+ assert(length == 0);
+
+ req->data_from_pool = true;
+ return 0;
+}
+
+int
+spdk_nvmf_request_get_buffers(struct spdk_nvmf_request *req,
+ struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_transport *transport,
+ uint32_t length)
+{
+ int rc;
+
+ req->iovcnt = 0;
+
+ rc = nvmf_request_get_buffers(req, group, transport, length);
+ if (rc == -ENOMEM) {
+ spdk_nvmf_request_free_buffers(req, group, transport);
+ }
+
+ return rc;
+}
+
+int
+spdk_nvmf_request_get_buffers_multi(struct spdk_nvmf_request *req,
+ struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_transport *transport,
+ uint32_t *lengths, uint32_t num_lengths)
+{
+ int rc = 0;
+ uint32_t i;
+
+ req->iovcnt = 0;
+
+ for (i = 0; i < num_lengths; i++) {
+ rc = nvmf_request_get_buffers(req, group, transport, lengths[i]);
+ if (rc != 0) {
+ goto err_exit;
+ }
+ }
+
+ return 0;
+
+err_exit:
+ spdk_nvmf_request_free_buffers(req, group, transport);
+ return rc;
+}
diff --git a/src/spdk/lib/nvmf/transport.h b/src/spdk/lib/nvmf/transport.h
new file mode 100644
index 000000000..38b5d8db3
--- /dev/null
+++ b/src/spdk/lib/nvmf/transport.h
@@ -0,0 +1,82 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_NVMF_TRANSPORT_H
+#define SPDK_NVMF_TRANSPORT_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/nvme.h"
+#include "spdk/nvmf.h"
+#include "spdk/nvmf_transport.h"
+
+uint32_t nvmf_transport_accept(struct spdk_nvmf_transport *transport);
+
+void nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport,
+ struct spdk_nvme_transport_id *trid,
+ struct spdk_nvmf_discovery_log_page_entry *entry);
+
+struct spdk_nvmf_transport_poll_group *nvmf_transport_poll_group_create(
+ struct spdk_nvmf_transport *transport);
+struct spdk_nvmf_transport_poll_group *nvmf_transport_get_optimal_poll_group(
+ struct spdk_nvmf_transport *transport, struct spdk_nvmf_qpair *qpair);
+
+void nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group);
+
+int nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair);
+
+int nvmf_transport_poll_group_remove(struct spdk_nvmf_transport_poll_group *group,
+ struct spdk_nvmf_qpair *qpair);
+
+int nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group);
+
+int nvmf_transport_req_free(struct spdk_nvmf_request *req);
+
+int nvmf_transport_req_complete(struct spdk_nvmf_request *req);
+
+void nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair);
+
+int nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid);
+
+int nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid);
+
+int nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvme_transport_id *trid);
+
+void nvmf_transport_qpair_abort_request(struct spdk_nvmf_qpair *qpair,
+ struct spdk_nvmf_request *req);
+
+#endif /* SPDK_NVMF_TRANSPORT_H */
diff --git a/src/spdk/lib/rdma/Makefile b/src/spdk/lib/rdma/Makefile
new file mode 100644
index 000000000..e6374557d
--- /dev/null
+++ b/src/spdk/lib/rdma/Makefile
@@ -0,0 +1,70 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation. All rights reserved.
+# Copyright (c) Mellanox Technologies LTD. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 1
+SO_MINOR := 0
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_rdma.map)
+
+LIBNAME = rdma
+
+ifeq ($(CONFIG_RDMA_PROV),verbs)
+C_SRCS = rdma_verbs.c
+else ifeq ($(CONFIG_RDMA_PROV),mlx5_dv)
+C_SRCS = rdma_mlx5_dv.c
+LOCAL_SYS_LIBS += -lmlx5
+else
+$(error Wrong RDMA provider specified: $(CONFIG_RDMA_PROV))
+endif
+
+LOCAL_SYS_LIBS += -libverbs -lrdmacm
+# Attach these only on FreeBSD when RDMA is enabled with configure
+ifeq ($(OS),FreeBSD)
+# Mellanox - MLX4 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx4.*)","")
+LOCAL_SYS_LIBS += -lmlx4
+endif
+# Mellanox - MLX5 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx5.*)","")
+LOCAL_SYS_LIBS += -lmlx5
+endif
+# Chelsio HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libcxgb4.*)","")
+LOCAL_SYS_LIBS += -lcxgb4
+endif
+endif
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/rdma/rdma_mlx5_dv.c b/src/spdk/lib/rdma/rdma_mlx5_dv.c
new file mode 100644
index 000000000..bae3afdda
--- /dev/null
+++ b/src/spdk/lib/rdma/rdma_mlx5_dv.c
@@ -0,0 +1,316 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rdma/rdma_cma.h>
+#include <infiniband/mlx5dv.h>
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+#include "spdk/likely.h"
+
+#include "spdk_internal/rdma.h"
+#include "spdk_internal/log.h"
+
+struct spdk_rdma_mlx5_dv_qp {
+ struct spdk_rdma_qp common;
+ struct ibv_qp_ex *qpex;
+};
+
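+/*
+ * Queue pairs created through mlx5dv_create_qp are not transitioned by the
+ * rdma_cm helpers, so this walks the qpair through INIT -> RTR -> RTS by
+ * asking rdma_init_qp_attr for the attributes of each state and applying
+ * them with ibv_modify_qp.
+ */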
+static int
+rdma_mlx5_dv_init_qpair(struct spdk_rdma_mlx5_dv_qp *mlx5_qp)
+{
+ struct ibv_qp_attr qp_attr;
+ int qp_attr_mask, rc;
+
+ qp_attr.qp_state = IBV_QPS_INIT;
+ rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask);
+ if (rc) {
+ SPDK_ERRLOG("Failed to init attr IBV_QPS_INIT, errno %s (%d)\n", spdk_strerror(errno), errno);
+ return rc;
+ }
+
+ rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask);
+ if (rc) {
+ SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_INIT) failed, rc %d\n", rc);
+ return rc;
+ }
+
+ qp_attr.qp_state = IBV_QPS_RTR;
+ rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask);
+ if (rc) {
+ SPDK_ERRLOG("Failed to init attr IBV_QPS_RTR, errno %s (%d)\n", spdk_strerror(errno), errno);
+ return rc;
+ }
+
+ rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask);
+ if (rc) {
+ SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_RTR) failed, rc %d\n", rc);
+ return rc;
+ }
+
+ qp_attr.qp_state = IBV_QPS_RTS;
+ rc = rdma_init_qp_attr(mlx5_qp->common.cm_id, &qp_attr, &qp_attr_mask);
+ if (rc) {
+ SPDK_ERRLOG("Failed to init attr IBV_QPS_RTR, errno %s (%d)\n", spdk_strerror(errno), errno);
+ return rc;
+ }
+
+ rc = ibv_modify_qp(mlx5_qp->common.qp, &qp_attr, qp_attr_mask);
+ if (rc) {
+ SPDK_ERRLOG("ibv_modify_qp(IBV_QPS_RTS) failed, rc %d\n", rc);
+ }
+
+ return rc;
+}
+
+struct spdk_rdma_qp *
+spdk_rdma_qp_create(struct rdma_cm_id *cm_id, struct spdk_rdma_qp_init_attr *qp_attr)
+{
+ assert(cm_id);
+ assert(qp_attr);
+
+ struct ibv_qp *qp;
+ struct spdk_rdma_mlx5_dv_qp *mlx5_qp;
+ struct ibv_qp_init_attr_ex dv_qp_attr = {
+ .qp_context = qp_attr->qp_context,
+ .send_cq = qp_attr->send_cq,
+ .recv_cq = qp_attr->recv_cq,
+ .srq = qp_attr->srq,
+ .cap = qp_attr->cap,
+ .qp_type = IBV_QPT_RC,
+ .comp_mask = IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS,
+ .pd = qp_attr->pd ? qp_attr->pd : cm_id->pd
+ };
+
+ assert(dv_qp_attr.pd);
+
+ mlx5_qp = calloc(1, sizeof(*mlx5_qp));
+ if (!mlx5_qp) {
+ SPDK_ERRLOG("qp memory allocation failed\n");
+ return NULL;
+ }
+
+ qp = mlx5dv_create_qp(cm_id->verbs, &dv_qp_attr, NULL);
+
+ if (!qp) {
+ SPDK_ERRLOG("Failed to create qpair, errno %s (%d)\n", spdk_strerror(errno), errno);
+ free(mlx5_qp);
+ return NULL;
+ }
+
+ mlx5_qp->common.qp = qp;
+ mlx5_qp->common.cm_id = cm_id;
+ mlx5_qp->qpex = ibv_qp_to_qp_ex(qp);
+
+ if (!mlx5_qp->qpex) {
+ spdk_rdma_qp_destroy(&mlx5_qp->common);
+ return NULL;
+ }
+
+ qp_attr->cap = dv_qp_attr.cap;
+
+ return &mlx5_qp->common;
+}
+
+int
+spdk_rdma_qp_accept(struct spdk_rdma_qp *spdk_rdma_qp, struct rdma_conn_param *conn_param)
+{
+ struct spdk_rdma_mlx5_dv_qp *mlx5_qp;
+
+ assert(spdk_rdma_qp != NULL);
+ assert(spdk_rdma_qp->cm_id != NULL);
+
+ mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common);
+
+ /* The NVMe-oF target must move the qpair to the RTS state before accepting. */
+ if (rdma_mlx5_dv_init_qpair(mlx5_qp) != 0) {
+ SPDK_ERRLOG("Failed to initialize qpair\n");
+ /* Set errno to be compliant with rdma_accept behaviour */
+ errno = ECONNABORTED;
+ return -1;
+ }
+
+ return rdma_accept(spdk_rdma_qp->cm_id, conn_param);
+}
+
+int
+spdk_rdma_qp_complete_connect(struct spdk_rdma_qp *spdk_rdma_qp)
+{
+ struct spdk_rdma_mlx5_dv_qp *mlx5_qp;
+ int rc;
+
+ assert(spdk_rdma_qp);
+
+ mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common);
+
+ rc = rdma_mlx5_dv_init_qpair(mlx5_qp);
+ if (rc) {
+ SPDK_ERRLOG("Failed to initialize qpair\n");
+ return rc;
+ }
+
+ rc = rdma_establish(mlx5_qp->common.cm_id);
+ if (rc) {
+ SPDK_ERRLOG("rdma_establish failed, errno %s (%d)\n", spdk_strerror(errno), errno);
+ }
+
+ return rc;
+}
+
+void
+spdk_rdma_qp_destroy(struct spdk_rdma_qp *spdk_rdma_qp)
+{
+ struct spdk_rdma_mlx5_dv_qp *mlx5_qp;
+ int rc;
+
+ assert(spdk_rdma_qp != NULL);
+
+ mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common);
+
+ if (spdk_rdma_qp->send_wrs.first != NULL) {
+ SPDK_WARNLOG("Destroying qpair with queued Work Requests\n");
+ }
+
+ if (mlx5_qp->common.qp) {
+ rc = ibv_destroy_qp(mlx5_qp->common.qp);
+ if (rc) {
+ SPDK_ERRLOG("Failed to destroy ibv qp %p, rc %d\n", mlx5_qp->common.qp, rc);
+ }
+ }
+
+ free(mlx5_qp);
+}
+
+int
+spdk_rdma_qp_disconnect(struct spdk_rdma_qp *spdk_rdma_qp)
+{
+ int rc = 0;
+
+ assert(spdk_rdma_qp != NULL);
+
+ if (spdk_rdma_qp->qp) {
+ struct ibv_qp_attr qp_attr = {.qp_state = IBV_QPS_ERR};
+
+ rc = ibv_modify_qp(spdk_rdma_qp->qp, &qp_attr, IBV_QP_STATE);
+ if (rc) {
+ SPDK_ERRLOG("Failed to modify ibv qp %p state to ERR, rc %d\n", spdk_rdma_qp->qp, rc);
+ return rc;
+ }
+ }
+
+ if (spdk_rdma_qp->cm_id) {
+ rc = rdma_disconnect(spdk_rdma_qp->cm_id);
+ if (rc) {
+ SPDK_ERRLOG("rdma_disconnect failed, errno %s (%d)\n", spdk_strerror(errno), errno);
+ }
+ }
+
+ return rc;
+}
+
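+/*
+ * Translate a chain of ibv_send_wr entries into the mlx5dv extended-QP work
+ * request API: ibv_wr_start() opens a batch on the first queued WR and each
+ * WR is encoded with the matching ibv_wr_* call; the batch is submitted
+ * later by spdk_rdma_qp_flush_send_wrs() via ibv_wr_complete(). Returns true
+ * when this call started a new batch.
+ */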
+bool
+spdk_rdma_qp_queue_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr *first)
+{
+ struct ibv_send_wr *tmp;
+ struct spdk_rdma_mlx5_dv_qp *mlx5_qp;
+ bool is_first;
+
+ assert(spdk_rdma_qp);
+ assert(first);
+
+ is_first = spdk_rdma_qp->send_wrs.first == NULL;
+ mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common);
+
+ if (is_first) {
+ ibv_wr_start(mlx5_qp->qpex);
+ spdk_rdma_qp->send_wrs.first = first;
+ } else {
+ spdk_rdma_qp->send_wrs.last->next = first;
+ }
+
+ for (tmp = first; tmp != NULL; tmp = tmp->next) {
+ mlx5_qp->qpex->wr_id = tmp->wr_id;
+ mlx5_qp->qpex->wr_flags = tmp->send_flags;
+
+ switch (tmp->opcode) {
+ case IBV_WR_SEND:
+ ibv_wr_send(mlx5_qp->qpex);
+ break;
+ case IBV_WR_SEND_WITH_INV:
+ ibv_wr_send_inv(mlx5_qp->qpex, tmp->invalidate_rkey);
+ break;
+ case IBV_WR_RDMA_READ:
+ ibv_wr_rdma_read(mlx5_qp->qpex, tmp->wr.rdma.rkey, tmp->wr.rdma.remote_addr);
+ break;
+ case IBV_WR_RDMA_WRITE:
+ ibv_wr_rdma_write(mlx5_qp->qpex, tmp->wr.rdma.rkey, tmp->wr.rdma.remote_addr);
+ break;
+ default:
+ SPDK_ERRLOG("Unexpected opcode %d\n", tmp->opcode);
+ assert(0);
+ }
+
+ ibv_wr_set_sge_list(mlx5_qp->qpex, tmp->num_sge, tmp->sg_list);
+
+ spdk_rdma_qp->send_wrs.last = tmp;
+ }
+
+ return is_first;
+}
+
+int
+spdk_rdma_qp_flush_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr **bad_wr)
+{
+ struct spdk_rdma_mlx5_dv_qp *mlx5_qp;
+ int rc;
+
+ assert(bad_wr);
+ assert(spdk_rdma_qp);
+
+ mlx5_qp = SPDK_CONTAINEROF(spdk_rdma_qp, struct spdk_rdma_mlx5_dv_qp, common);
+
+ if (spdk_unlikely(spdk_rdma_qp->send_wrs.first == NULL)) {
+ return 0;
+ }
+
+ rc = ibv_wr_complete(mlx5_qp->qpex);
+
+ if (spdk_unlikely(rc)) {
+ /* If ibv_wr_complete reports an error, none of the queued WRs were posted to the NIC. */
+ *bad_wr = spdk_rdma_qp->send_wrs.first;
+ }
+
+ spdk_rdma_qp->send_wrs.first = NULL;
+
+ return rc;
+}
diff --git a/src/spdk/lib/rdma/rdma_verbs.c b/src/spdk/lib/rdma/rdma_verbs.c
new file mode 100644
index 000000000..66be5bf60
--- /dev/null
+++ b/src/spdk/lib/rdma/rdma_verbs.c
@@ -0,0 +1,167 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rdma/rdma_cma.h>
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+#include "spdk/likely.h"
+
+#include "spdk_internal/rdma.h"
+#include "spdk_internal/log.h"
+
+struct spdk_rdma_qp *
+spdk_rdma_qp_create(struct rdma_cm_id *cm_id, struct spdk_rdma_qp_init_attr *qp_attr)
+{
+ struct spdk_rdma_qp *spdk_rdma_qp;
+ int rc;
+ struct ibv_qp_init_attr attr = {
+ .qp_context = qp_attr->qp_context,
+ .send_cq = qp_attr->send_cq,
+ .recv_cq = qp_attr->recv_cq,
+ .srq = qp_attr->srq,
+ .cap = qp_attr->cap,
+ .qp_type = IBV_QPT_RC
+ };
+
+ spdk_rdma_qp = calloc(1, sizeof(*spdk_rdma_qp));
+ if (!spdk_rdma_qp) {
+ SPDK_ERRLOG("qp memory allocation failed\n");
+ return NULL;
+ }
+
+ rc = rdma_create_qp(cm_id, qp_attr->pd, &attr);
+ if (rc) {
+ SPDK_ERRLOG("Failed to create qp, errno %s (%d)\n", spdk_strerror(errno), errno);
+ free(spdk_rdma_qp);
+ return NULL;
+ }
+
+ qp_attr->cap = attr.cap;
+ spdk_rdma_qp->qp = cm_id->qp;
+ spdk_rdma_qp->cm_id = cm_id;
+
+ return spdk_rdma_qp;
+}
+
+int
+spdk_rdma_qp_accept(struct spdk_rdma_qp *spdk_rdma_qp, struct rdma_conn_param *conn_param)
+{
+ assert(spdk_rdma_qp != NULL);
+ assert(spdk_rdma_qp->cm_id != NULL);
+
+ return rdma_accept(spdk_rdma_qp->cm_id, conn_param);
+}
+
+int
+spdk_rdma_qp_complete_connect(struct spdk_rdma_qp *spdk_rdma_qp)
+{
+ /* Nothing to be done for Verbs */
+ return 0;
+}
+
+void
+spdk_rdma_qp_destroy(struct spdk_rdma_qp *spdk_rdma_qp)
+{
+ assert(spdk_rdma_qp != NULL);
+
+ if (spdk_rdma_qp->send_wrs.first != NULL) {
+ SPDK_WARNLOG("Destroying qpair with queued Work Requests\n");
+ }
+
+ if (spdk_rdma_qp->qp) {
+ rdma_destroy_qp(spdk_rdma_qp->cm_id);
+ }
+
+ free(spdk_rdma_qp);
+}
+
+int
+spdk_rdma_qp_disconnect(struct spdk_rdma_qp *spdk_rdma_qp)
+{
+ int rc = 0;
+
+ assert(spdk_rdma_qp != NULL);
+
+ if (spdk_rdma_qp->cm_id) {
+ rc = rdma_disconnect(spdk_rdma_qp->cm_id);
+ if (rc) {
+ SPDK_ERRLOG("rdma_disconnect failed, errno %s (%d)\n", spdk_strerror(errno), errno);
+ }
+ }
+
+ return rc;
+}
+
+bool
+spdk_rdma_qp_queue_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr *first)
+{
+ struct ibv_send_wr *last;
+
+ assert(spdk_rdma_qp);
+ assert(first);
+
+ last = first;
+ while (last->next != NULL) {
+ last = last->next;
+ }
+
+ if (spdk_rdma_qp->send_wrs.first == NULL) {
+ spdk_rdma_qp->send_wrs.first = first;
+ spdk_rdma_qp->send_wrs.last = last;
+ return true;
+ } else {
+ spdk_rdma_qp->send_wrs.last->next = first;
+ spdk_rdma_qp->send_wrs.last = last;
+ return false;
+ }
+}
+
+int
+spdk_rdma_qp_flush_send_wrs(struct spdk_rdma_qp *spdk_rdma_qp, struct ibv_send_wr **bad_wr)
+{
+ int rc;
+
+ assert(spdk_rdma_qp);
+ assert(bad_wr);
+
+ if (spdk_unlikely(!spdk_rdma_qp->send_wrs.first)) {
+ return 0;
+ }
+
+ rc = ibv_post_send(spdk_rdma_qp->qp, spdk_rdma_qp->send_wrs.first, bad_wr);
+
+ spdk_rdma_qp->send_wrs.first = NULL;
+
+ return rc;
+}
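+
+/*
+ * Illustrative usage sketch (hypothetical caller code, not part of this file):
+ * send WRs are batched with spdk_rdma_qp_queue_send_wrs() and then posted to
+ * the NIC in a single ibv_post_send() by spdk_rdma_qp_flush_send_wrs().
+ * "qp", "req1" and "req2" below are assumed to exist in the caller.
+ *
+ *	struct ibv_send_wr *bad_wr = NULL;
+ *
+ *	spdk_rdma_qp_queue_send_wrs(qp, &req1->wr);	// returns true - starts a new batch
+ *	spdk_rdma_qp_queue_send_wrs(qp, &req2->wr);	// returns false - appended to the batch
+ *	if (spdk_rdma_qp_flush_send_wrs(qp, &bad_wr) != 0) {
+ *		// bad_wr points at the first WR that was not posted
+ *	}
+ */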
diff --git a/src/spdk/lib/rdma/spdk_rdma.map b/src/spdk/lib/rdma/spdk_rdma.map
new file mode 100644
index 000000000..9268a2191
--- /dev/null
+++ b/src/spdk/lib/rdma/spdk_rdma.map
@@ -0,0 +1,14 @@
+{
+ global:
+
+ # Public functions
+ spdk_rdma_qp_create;
+ spdk_rdma_qp_accept;
+ spdk_rdma_qp_complete_connect;
+ spdk_rdma_qp_destroy;
+ spdk_rdma_qp_disconnect;
+ spdk_rdma_qp_queue_send_wrs;
+ spdk_rdma_qp_flush_send_wrs;
+
+ local: *;
+};
diff --git a/src/spdk/lib/reduce/Makefile b/src/spdk/lib/reduce/Makefile
new file mode 100644
index 000000000..fb417cd57
--- /dev/null
+++ b/src/spdk/lib/reduce/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = reduce.c
+LIBNAME = reduce
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_reduce.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/reduce/reduce.c b/src/spdk/lib/reduce/reduce.c
new file mode 100644
index 000000000..6188f6c6c
--- /dev/null
+++ b/src/spdk/lib/reduce/reduce.c
@@ -0,0 +1,1625 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/reduce.h"
+#include "spdk/env.h"
+#include "spdk/string.h"
+#include "spdk/bit_array.h"
+#include "spdk/util.h"
+#include "spdk_internal/log.h"
+
+#include "libpmem.h"
+
+/* Always round up the size of the PM region to the nearest cacheline. */
+#define REDUCE_PM_SIZE_ALIGNMENT 64
+
+/* Offset into the backing device where the persistent memory file's path is stored. */
+#define REDUCE_BACKING_DEV_PATH_OFFSET 4096
+
+#define REDUCE_EMPTY_MAP_ENTRY -1ULL
+
+#define REDUCE_NUM_VOL_REQUESTS 256
+
+/* Structure written to offset 0 of both the pm file and the backing device. */
+struct spdk_reduce_vol_superblock {
+ uint8_t signature[8];
+ struct spdk_reduce_vol_params params;
+ uint8_t reserved[4048];
+};
+SPDK_STATIC_ASSERT(sizeof(struct spdk_reduce_vol_superblock) == 4096, "size incorrect");
+
+#define SPDK_REDUCE_SIGNATURE "SPDKREDU"
+/* sizeof() of a string literal includes the null terminator, hence the -1 below */
+SPDK_STATIC_ASSERT(sizeof(SPDK_REDUCE_SIGNATURE) - 1 ==
+ sizeof(((struct spdk_reduce_vol_superblock *)0)->signature), "size incorrect");
+
+#define REDUCE_PATH_MAX 4096
+
+#define REDUCE_ZERO_BUF_SIZE 0x100000
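+
+/*
+ * Metadata layout on the backing device implied by the offsets above (a sketch
+ * based on the definitions in this file, not a normative description):
+ *
+ *	offset 0    : struct spdk_reduce_vol_superblock (4096 bytes)
+ *	offset 4096 : path of the persistent memory file (REDUCE_PATH_MAX bytes)
+ *	remainder   : chunk data, addressed in backing io units
+ */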
+
+/**
+ * Describes a persistent memory file used to hold metadata associated with a
+ * compressed volume.
+ */
+struct spdk_reduce_pm_file {
+ char path[REDUCE_PATH_MAX];
+ void *pm_buf;
+ int pm_is_pmem;
+ uint64_t size;
+};
+
+#define REDUCE_IO_READV 1
+#define REDUCE_IO_WRITEV 2
+
+struct spdk_reduce_chunk_map {
+ uint32_t compressed_size;
+ uint32_t reserved;
+ uint64_t io_unit_index[0];
+};
+
+struct spdk_reduce_vol_request {
+ /**
+ * Scratch buffer used for uncompressed chunk. This is used for:
+ * 1) source buffer for compression operations
+ * 2) destination buffer for decompression operations
+ * 3) data buffer when writing uncompressed chunk to disk
+ * 4) data buffer when reading uncompressed chunk from disk
+ */
+ uint8_t *decomp_buf;
+ struct iovec *decomp_buf_iov;
+
+ /**
+ * These are used to construct the iovecs that are sent to
+ * the decomp engine, they point to a mix of the scratch buffer
+ * and user buffer
+ */
+ struct iovec decomp_iov[REDUCE_MAX_IOVECS + 2];
+ int decomp_iovcnt;
+
+ /**
+ * Scratch buffer used for compressed chunk. This is used for:
+ * 1) destination buffer for compression operations
+ * 2) source buffer for decompression operations
+ * 3) data buffer when writing compressed chunk to disk
+ * 4) data buffer when reading compressed chunk from disk
+ */
+ uint8_t *comp_buf;
+ struct iovec *comp_buf_iov;
+ struct iovec *iov;
+ bool rmw;
+ struct spdk_reduce_vol *vol;
+ int type;
+ int reduce_errno;
+ int iovcnt;
+ int num_backing_ops;
+ uint32_t num_io_units;
+ bool chunk_is_compressed;
+ uint64_t offset;
+ uint64_t logical_map_index;
+ uint64_t length;
+ uint64_t chunk_map_index;
+ struct spdk_reduce_chunk_map *chunk;
+ spdk_reduce_vol_op_complete cb_fn;
+ void *cb_arg;
+ TAILQ_ENTRY(spdk_reduce_vol_request) tailq;
+ struct spdk_reduce_vol_cb_args backing_cb_args;
+};
+
+struct spdk_reduce_vol {
+ struct spdk_reduce_vol_params params;
+ uint32_t backing_io_units_per_chunk;
+ uint32_t backing_lba_per_io_unit;
+ uint32_t logical_blocks_per_chunk;
+ struct spdk_reduce_pm_file pm_file;
+ struct spdk_reduce_backing_dev *backing_dev;
+ struct spdk_reduce_vol_superblock *backing_super;
+ struct spdk_reduce_vol_superblock *pm_super;
+ uint64_t *pm_logical_map;
+ uint64_t *pm_chunk_maps;
+
+ struct spdk_bit_array *allocated_chunk_maps;
+ struct spdk_bit_array *allocated_backing_io_units;
+
+ struct spdk_reduce_vol_request *request_mem;
+ TAILQ_HEAD(, spdk_reduce_vol_request) free_requests;
+ TAILQ_HEAD(, spdk_reduce_vol_request) executing_requests;
+ TAILQ_HEAD(, spdk_reduce_vol_request) queued_requests;
+
+ /* Single contiguous buffer used for all request buffers for this volume. */
+ uint8_t *buf_mem;
+ struct iovec *buf_iov_mem;
+};
+
+static void _start_readv_request(struct spdk_reduce_vol_request *req);
+static void _start_writev_request(struct spdk_reduce_vol_request *req);
+static uint8_t *g_zero_buf;
+static int g_vol_count = 0;
+
+/*
+ * Allocate extra metadata chunks and corresponding backing io units to account for
+ * outstanding IO in worst case scenario where logical map is completely allocated
+ * and no data can be compressed. We need extra chunks in this case to handle
+ * in-flight writes since reduce never writes data in place.
+ */
+#define REDUCE_NUM_EXTRA_CHUNKS 128
+
+static void
+_reduce_persist(struct spdk_reduce_vol *vol, const void *addr, size_t len)
+{
+ if (vol->pm_file.pm_is_pmem) {
+ pmem_persist(addr, len);
+ } else {
+ pmem_msync(addr, len);
+ }
+}
+
+static uint64_t
+_get_pm_logical_map_size(uint64_t vol_size, uint64_t chunk_size)
+{
+ uint64_t chunks_in_logical_map, logical_map_size;
+
+ chunks_in_logical_map = vol_size / chunk_size;
+ logical_map_size = chunks_in_logical_map * sizeof(uint64_t);
+
+ /* Round up to next cacheline. */
+ return spdk_divide_round_up(logical_map_size, REDUCE_PM_SIZE_ALIGNMENT) *
+ REDUCE_PM_SIZE_ALIGNMENT;
+}
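+
+/*
+ * Worked example (hypothetical values): with vol_size = 1,071,644,672 bytes
+ * (1 GiB minus the 128 extra chunks, see _get_vol_size) and chunk_size = 16 KiB,
+ * the logical map holds 65,408 8-byte entries = 523,264 bytes, which is already
+ * a multiple of the 64-byte cacheline alignment.
+ */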
+
+static uint64_t
+_get_total_chunks(uint64_t vol_size, uint64_t chunk_size)
+{
+ uint64_t num_chunks;
+
+ num_chunks = vol_size / chunk_size;
+ num_chunks += REDUCE_NUM_EXTRA_CHUNKS;
+
+ return num_chunks;
+}
+
+static inline uint32_t
+_reduce_vol_get_chunk_struct_size(uint64_t backing_io_units_per_chunk)
+{
+ return sizeof(struct spdk_reduce_chunk_map) + sizeof(uint64_t) * backing_io_units_per_chunk;
+}
+
+static uint64_t
+_get_pm_total_chunks_size(uint64_t vol_size, uint64_t chunk_size, uint64_t backing_io_unit_size)
+{
+ uint64_t io_units_per_chunk, num_chunks, total_chunks_size;
+
+ num_chunks = _get_total_chunks(vol_size, chunk_size);
+ io_units_per_chunk = chunk_size / backing_io_unit_size;
+
+ total_chunks_size = num_chunks * _reduce_vol_get_chunk_struct_size(io_units_per_chunk);
+
+ return spdk_divide_round_up(total_chunks_size, REDUCE_PM_SIZE_ALIGNMENT) *
+ REDUCE_PM_SIZE_ALIGNMENT;
+}
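+
+/*
+ * Worked example (same hypothetical values): with chunk_size = 16 KiB and
+ * backing_io_unit_size = 4 KiB there are 4 io units per chunk, so each chunk
+ * map is 8 + 4 * 8 = 40 bytes. With 65,408 data chunks plus 128 extra chunks,
+ * the chunk maps occupy 65,536 * 40 = 2,621,440 bytes (already 64-byte aligned).
+ */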
+
+static struct spdk_reduce_chunk_map *
+_reduce_vol_get_chunk_map(struct spdk_reduce_vol *vol, uint64_t chunk_map_index)
+{
+ uintptr_t chunk_map_addr;
+
+ assert(chunk_map_index < _get_total_chunks(vol->params.vol_size, vol->params.chunk_size));
+
+ chunk_map_addr = (uintptr_t)vol->pm_chunk_maps;
+ chunk_map_addr += chunk_map_index *
+ _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk);
+
+ return (struct spdk_reduce_chunk_map *)chunk_map_addr;
+}
+
+static int
+_validate_vol_params(struct spdk_reduce_vol_params *params)
+{
+ if (params->vol_size > 0) {
+ /**
+ * The caller must not set vol_size - it is calculated by libreduce from the
+ * other values in this structure plus the size of the backing device.
+ */
+ return -EINVAL;
+ }
+
+ if (params->chunk_size == 0 || params->backing_io_unit_size == 0 ||
+ params->logical_block_size == 0) {
+ return -EINVAL;
+ }
+
+ /* Chunk size must be an even multiple of the backing io unit size. */
+ if ((params->chunk_size % params->backing_io_unit_size) != 0) {
+ return -EINVAL;
+ }
+
+ /* Chunk size must be an even multiple of the logical block size. */
+ if ((params->chunk_size % params->logical_block_size) != 0) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static uint64_t
+_get_vol_size(uint64_t chunk_size, uint64_t backing_dev_size)
+{
+ uint64_t num_chunks;
+
+ num_chunks = backing_dev_size / chunk_size;
+ if (num_chunks <= REDUCE_NUM_EXTRA_CHUNKS) {
+ return 0;
+ }
+
+ num_chunks -= REDUCE_NUM_EXTRA_CHUNKS;
+ return num_chunks * chunk_size;
+}
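+
+/*
+ * Worked example (hypothetical values): for a 1 GiB backing device and a
+ * 16 KiB chunk_size there are 65,536 chunks; reserving REDUCE_NUM_EXTRA_CHUNKS
+ * (128) of them leaves 65,408 chunks, i.e. a vol_size of 1,071,644,672 bytes.
+ */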
+
+static uint64_t
+_get_pm_file_size(struct spdk_reduce_vol_params *params)
+{
+ uint64_t total_pm_size;
+
+ total_pm_size = sizeof(struct spdk_reduce_vol_superblock);
+ total_pm_size += _get_pm_logical_map_size(params->vol_size, params->chunk_size);
+ total_pm_size += _get_pm_total_chunks_size(params->vol_size, params->chunk_size,
+ params->backing_io_unit_size);
+ return total_pm_size;
+}
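+
+/*
+ * Worked example (same hypothetical values): 4,096 bytes of superblock +
+ * 523,264 bytes of logical map + 2,621,440 bytes of chunk maps gives a
+ * persistent memory file of 3,148,800 bytes (about 3 MiB) for the 1 GiB volume.
+ */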
+
+const struct spdk_uuid *
+spdk_reduce_vol_get_uuid(struct spdk_reduce_vol *vol)
+{
+ return &vol->params.uuid;
+}
+
+static void
+_initialize_vol_pm_pointers(struct spdk_reduce_vol *vol)
+{
+ uint64_t logical_map_size;
+
+ /* Superblock is at the beginning of the pm file. */
+ vol->pm_super = (struct spdk_reduce_vol_superblock *)vol->pm_file.pm_buf;
+
+ /* Logical map immediately follows the super block. */
+ vol->pm_logical_map = (uint64_t *)(vol->pm_super + 1);
+
+ /* Chunks maps follow the logical map. */
+ logical_map_size = _get_pm_logical_map_size(vol->params.vol_size, vol->params.chunk_size);
+ vol->pm_chunk_maps = (uint64_t *)((uint8_t *)vol->pm_logical_map + logical_map_size);
+}
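+
+/*
+ * Resulting pm file layout (a sketch based on the pointers set above):
+ *
+ *	pm_super       : struct spdk_reduce_vol_superblock (4096 bytes)
+ *	pm_logical_map : one uint64_t chunk map index per logical chunk
+ *	                 (REDUCE_EMPTY_MAP_ENTRY if unallocated), rounded up
+ *	                 to a 64-byte boundary
+ *	pm_chunk_maps  : one struct spdk_reduce_chunk_map (with
+ *	                 backing_io_units_per_chunk indices) per chunk,
+ *	                 including the extra chunks
+ */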
+
+/* We need 2 iovs during load - one for the superblock, another for the path */
+#define LOAD_IOV_COUNT 2
+
+struct reduce_init_load_ctx {
+ struct spdk_reduce_vol *vol;
+ struct spdk_reduce_vol_cb_args backing_cb_args;
+ spdk_reduce_vol_op_with_handle_complete cb_fn;
+ void *cb_arg;
+ struct iovec iov[LOAD_IOV_COUNT];
+ void *path;
+};
+
+static int
+_allocate_vol_requests(struct spdk_reduce_vol *vol)
+{
+ struct spdk_reduce_vol_request *req;
+ int i;
+
+ /* Allocate 2x since we need buffers for both read/write and compress/decompress
+ * intermediate buffers.
+ */
+ vol->buf_mem = spdk_malloc(2 * REDUCE_NUM_VOL_REQUESTS * vol->params.chunk_size,
+ 64, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (vol->buf_mem == NULL) {
+ return -ENOMEM;
+ }
+
+ vol->request_mem = calloc(REDUCE_NUM_VOL_REQUESTS, sizeof(*req));
+ if (vol->request_mem == NULL) {
+ spdk_free(vol->buf_mem);
+ vol->buf_mem = NULL;
+ return -ENOMEM;
+ }
+
+ /* Allocate 2x since we need iovs for both read/write and compress/decompress intermediate
+ * buffers.
+ */
+ vol->buf_iov_mem = calloc(REDUCE_NUM_VOL_REQUESTS,
+ 2 * sizeof(struct iovec) * vol->backing_io_units_per_chunk);
+ if (vol->buf_iov_mem == NULL) {
+ free(vol->request_mem);
+ spdk_free(vol->buf_mem);
+ vol->request_mem = NULL;
+ vol->buf_mem = NULL;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < REDUCE_NUM_VOL_REQUESTS; i++) {
+ req = &vol->request_mem[i];
+ TAILQ_INSERT_HEAD(&vol->free_requests, req, tailq);
+ req->decomp_buf_iov = &vol->buf_iov_mem[(2 * i) * vol->backing_io_units_per_chunk];
+ req->decomp_buf = vol->buf_mem + (2 * i) * vol->params.chunk_size;
+ req->comp_buf_iov = &vol->buf_iov_mem[(2 * i + 1) * vol->backing_io_units_per_chunk];
+ req->comp_buf = vol->buf_mem + (2 * i + 1) * vol->params.chunk_size;
+ }
+
+ return 0;
+}
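+
+/*
+ * Buffer carving done above, illustrated for request i (a sketch):
+ *
+ *	decomp_buf = buf_mem + (2 * i)     * chunk_size
+ *	comp_buf   = buf_mem + (2 * i + 1) * chunk_size
+ *
+ * i.e. every request owns one chunk-sized scratch buffer for uncompressed data
+ * and one for compressed data, carved out of the single buf_mem allocation;
+ * the iovec arrays in buf_iov_mem are interleaved the same way.
+ */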
+
+static void
+_init_load_cleanup(struct spdk_reduce_vol *vol, struct reduce_init_load_ctx *ctx)
+{
+ if (ctx != NULL) {
+ spdk_free(ctx->path);
+ free(ctx);
+ }
+
+ if (vol != NULL) {
+ if (vol->pm_file.pm_buf != NULL) {
+ pmem_unmap(vol->pm_file.pm_buf, vol->pm_file.size);
+ }
+
+ spdk_free(vol->backing_super);
+ spdk_bit_array_free(&vol->allocated_chunk_maps);
+ spdk_bit_array_free(&vol->allocated_backing_io_units);
+ free(vol->request_mem);
+ free(vol->buf_iov_mem);
+ spdk_free(vol->buf_mem);
+ free(vol);
+ }
+}
+
+static int
+_alloc_zero_buff(void)
+{
+ int rc = 0;
+
+ /* The zero buffer is shared between all volumes and is only used for
+ * reads, so allocate one global instance here if it was not already
+ * allocated when another vol was initialized or loaded.
+ */
+ if (g_vol_count++ == 0) {
+ g_zero_buf = spdk_zmalloc(REDUCE_ZERO_BUF_SIZE,
+ 64, NULL, SPDK_ENV_LCORE_ID_ANY,
+ SPDK_MALLOC_DMA);
+ if (g_zero_buf == NULL) {
+ rc = -ENOMEM;
+ }
+ }
+ return rc;
+}
+
+static void
+_init_write_super_cpl(void *cb_arg, int reduce_errno)
+{
+ struct reduce_init_load_ctx *init_ctx = cb_arg;
+ int rc;
+
+ rc = _allocate_vol_requests(init_ctx->vol);
+ if (rc != 0) {
+ init_ctx->cb_fn(init_ctx->cb_arg, NULL, rc);
+ _init_load_cleanup(init_ctx->vol, init_ctx);
+ return;
+ }
+
+ rc = _alloc_zero_buff();
+ if (rc != 0) {
+ init_ctx->cb_fn(init_ctx->cb_arg, NULL, rc);
+ _init_load_cleanup(init_ctx->vol, init_ctx);
+ return;
+ }
+
+ init_ctx->cb_fn(init_ctx->cb_arg, init_ctx->vol, reduce_errno);
+ /* Only clean up the ctx - the vol has been passed to the application
+ * for use now that initialization was successful.
+ */
+ _init_load_cleanup(NULL, init_ctx);
+}
+
+static void
+_init_write_path_cpl(void *cb_arg, int reduce_errno)
+{
+ struct reduce_init_load_ctx *init_ctx = cb_arg;
+ struct spdk_reduce_vol *vol = init_ctx->vol;
+
+ init_ctx->iov[0].iov_base = vol->backing_super;
+ init_ctx->iov[0].iov_len = sizeof(*vol->backing_super);
+ init_ctx->backing_cb_args.cb_fn = _init_write_super_cpl;
+ init_ctx->backing_cb_args.cb_arg = init_ctx;
+ vol->backing_dev->writev(vol->backing_dev, init_ctx->iov, 1,
+ 0, sizeof(*vol->backing_super) / vol->backing_dev->blocklen,
+ &init_ctx->backing_cb_args);
+}
+
+static int
+_allocate_bit_arrays(struct spdk_reduce_vol *vol)
+{
+ uint64_t total_chunks, total_backing_io_units;
+ uint32_t i, num_metadata_io_units;
+
+ total_chunks = _get_total_chunks(vol->params.vol_size, vol->params.chunk_size);
+ vol->allocated_chunk_maps = spdk_bit_array_create(total_chunks);
+ total_backing_io_units = total_chunks * (vol->params.chunk_size / vol->params.backing_io_unit_size);
+ vol->allocated_backing_io_units = spdk_bit_array_create(total_backing_io_units);
+
+ if (vol->allocated_chunk_maps == NULL || vol->allocated_backing_io_units == NULL) {
+ return -ENOMEM;
+ }
+
+ /* Set backing io unit bits associated with metadata. */
+ num_metadata_io_units = (sizeof(*vol->backing_super) + REDUCE_PATH_MAX) /
+ vol->backing_dev->blocklen;
+ for (i = 0; i < num_metadata_io_units; i++) {
+ spdk_bit_array_set(vol->allocated_backing_io_units, i);
+ }
+
+ return 0;
+}
+
+void
+spdk_reduce_vol_init(struct spdk_reduce_vol_params *params,
+ struct spdk_reduce_backing_dev *backing_dev,
+ const char *pm_file_dir,
+ spdk_reduce_vol_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_reduce_vol *vol;
+ struct reduce_init_load_ctx *init_ctx;
+ uint64_t backing_dev_size;
+ size_t mapped_len;
+ int dir_len, max_dir_len, rc;
+
+ /* We need to append a path separator and the UUID to the supplied
+ * path.
+ */
+ max_dir_len = REDUCE_PATH_MAX - SPDK_UUID_STRING_LEN - 1;
+ dir_len = strnlen(pm_file_dir, max_dir_len);
+ /* Strip trailing slash if the user provided one - we will add it back
+ * later when appending the filename.
+ */
+ if (pm_file_dir[dir_len - 1] == '/') {
+ dir_len--;
+ }
+ if (dir_len == max_dir_len) {
+ SPDK_ERRLOG("pm_file_dir (%s) too long\n", pm_file_dir);
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ rc = _validate_vol_params(params);
+ if (rc != 0) {
+ SPDK_ERRLOG("invalid vol params\n");
+ cb_fn(cb_arg, NULL, rc);
+ return;
+ }
+
+ backing_dev_size = backing_dev->blockcnt * backing_dev->blocklen;
+ params->vol_size = _get_vol_size(params->chunk_size, backing_dev_size);
+ if (params->vol_size == 0) {
+ SPDK_ERRLOG("backing device is too small\n");
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ if (backing_dev->readv == NULL || backing_dev->writev == NULL ||
+ backing_dev->unmap == NULL) {
+ SPDK_ERRLOG("backing_dev function pointer not specified\n");
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ vol = calloc(1, sizeof(*vol));
+ if (vol == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ TAILQ_INIT(&vol->free_requests);
+ TAILQ_INIT(&vol->executing_requests);
+ TAILQ_INIT(&vol->queued_requests);
+
+ vol->backing_super = spdk_zmalloc(sizeof(*vol->backing_super), 0, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (vol->backing_super == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ _init_load_cleanup(vol, NULL);
+ return;
+ }
+
+ init_ctx = calloc(1, sizeof(*init_ctx));
+ if (init_ctx == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ _init_load_cleanup(vol, NULL);
+ return;
+ }
+
+ init_ctx->path = spdk_zmalloc(REDUCE_PATH_MAX, 0, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (init_ctx->path == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ _init_load_cleanup(vol, init_ctx);
+ return;
+ }
+
+ if (spdk_mem_all_zero(&params->uuid, sizeof(params->uuid))) {
+ spdk_uuid_generate(&params->uuid);
+ }
+
+ memcpy(vol->pm_file.path, pm_file_dir, dir_len);
+ vol->pm_file.path[dir_len] = '/';
+ spdk_uuid_fmt_lower(&vol->pm_file.path[dir_len + 1], SPDK_UUID_STRING_LEN,
+ &params->uuid);
+ vol->pm_file.size = _get_pm_file_size(params);
+ vol->pm_file.pm_buf = pmem_map_file(vol->pm_file.path, vol->pm_file.size,
+ PMEM_FILE_CREATE | PMEM_FILE_EXCL, 0600,
+ &mapped_len, &vol->pm_file.pm_is_pmem);
+ if (vol->pm_file.pm_buf == NULL) {
+ SPDK_ERRLOG("could not pmem_map_file(%s): %s\n",
+ vol->pm_file.path, strerror(errno));
+ cb_fn(cb_arg, NULL, -errno);
+ _init_load_cleanup(vol, init_ctx);
+ return;
+ }
+
+ if (vol->pm_file.size != mapped_len) {
+ SPDK_ERRLOG("could not map entire pmem file (size=%" PRIu64 " mapped=%" PRIu64 ")\n",
+ vol->pm_file.size, mapped_len);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ _init_load_cleanup(vol, init_ctx);
+ return;
+ }
+
+ vol->backing_io_units_per_chunk = params->chunk_size / params->backing_io_unit_size;
+ vol->logical_blocks_per_chunk = params->chunk_size / params->logical_block_size;
+ vol->backing_lba_per_io_unit = params->backing_io_unit_size / backing_dev->blocklen;
+ memcpy(&vol->params, params, sizeof(*params));
+
+ vol->backing_dev = backing_dev;
+
+ rc = _allocate_bit_arrays(vol);
+ if (rc != 0) {
+ cb_fn(cb_arg, NULL, rc);
+ _init_load_cleanup(vol, init_ctx);
+ return;
+ }
+
+ memcpy(vol->backing_super->signature, SPDK_REDUCE_SIGNATURE,
+ sizeof(vol->backing_super->signature));
+ memcpy(&vol->backing_super->params, params, sizeof(*params));
+
+ _initialize_vol_pm_pointers(vol);
+
+ memcpy(vol->pm_super, vol->backing_super, sizeof(*vol->backing_super));
+ /* Writing 0xFFs is equivalent to filling it all with REDUCE_EMPTY_MAP_ENTRY.
+ * Note that this writes 0xFF not just to the logical map but to the chunk maps as well.
+ */
+ memset(vol->pm_logical_map, 0xFF, vol->pm_file.size - sizeof(*vol->backing_super));
+ _reduce_persist(vol, vol->pm_file.pm_buf, vol->pm_file.size);
+
+ init_ctx->vol = vol;
+ init_ctx->cb_fn = cb_fn;
+ init_ctx->cb_arg = cb_arg;
+
+ memcpy(init_ctx->path, vol->pm_file.path, REDUCE_PATH_MAX);
+ init_ctx->iov[0].iov_base = init_ctx->path;
+ init_ctx->iov[0].iov_len = REDUCE_PATH_MAX;
+ init_ctx->backing_cb_args.cb_fn = _init_write_path_cpl;
+ init_ctx->backing_cb_args.cb_arg = init_ctx;
+ /* Write the path to offset 4K on the backing device - just after where the super
+ * block will be written. We wait until this write is committed before writing the
+ * super block, to guarantee we never end up with the super block written but not
+ * the path if the system crashes in the middle of a write operation.
+ */
+ vol->backing_dev->writev(vol->backing_dev, init_ctx->iov, 1,
+ REDUCE_BACKING_DEV_PATH_OFFSET / vol->backing_dev->blocklen,
+ REDUCE_PATH_MAX / vol->backing_dev->blocklen,
+ &init_ctx->backing_cb_args);
+}
+
+static void destroy_load_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno);
+
+static void
+_load_read_super_and_path_cpl(void *cb_arg, int reduce_errno)
+{
+ struct reduce_init_load_ctx *load_ctx = cb_arg;
+ struct spdk_reduce_vol *vol = load_ctx->vol;
+ uint64_t backing_dev_size;
+ uint64_t i, num_chunks, logical_map_index;
+ struct spdk_reduce_chunk_map *chunk;
+ size_t mapped_len;
+ uint32_t j;
+ int rc;
+
+ rc = _alloc_zero_buff();
+ if (rc) {
+ goto error;
+ }
+
+ if (memcmp(vol->backing_super->signature,
+ SPDK_REDUCE_SIGNATURE,
+ sizeof(vol->backing_super->signature)) != 0) {
+ /* This backing device isn't a libreduce backing device. */
+ rc = -EILSEQ;
+ goto error;
+ }
+
+ /* If the cb_fn is destroy_load_cb, it means we want to destroy this compress bdev.
+ * So don't bother getting the volume ready to use - invoke the callback immediately
+ * so destroy_load_cb can delete the metadata from the block device and delete the
+ * persistent memory file if it exists.
+ */
+ memcpy(vol->pm_file.path, load_ctx->path, sizeof(vol->pm_file.path));
+ if (load_ctx->cb_fn == (*destroy_load_cb)) {
+ load_ctx->cb_fn(load_ctx->cb_arg, vol, 0);
+ _init_load_cleanup(NULL, load_ctx);
+ return;
+ }
+
+ memcpy(&vol->params, &vol->backing_super->params, sizeof(vol->params));
+ vol->backing_io_units_per_chunk = vol->params.chunk_size / vol->params.backing_io_unit_size;
+ vol->logical_blocks_per_chunk = vol->params.chunk_size / vol->params.logical_block_size;
+ vol->backing_lba_per_io_unit = vol->params.backing_io_unit_size / vol->backing_dev->blocklen;
+
+ rc = _allocate_bit_arrays(vol);
+ if (rc != 0) {
+ goto error;
+ }
+
+ backing_dev_size = vol->backing_dev->blockcnt * vol->backing_dev->blocklen;
+ if (_get_vol_size(vol->params.chunk_size, backing_dev_size) < vol->params.vol_size) {
+ SPDK_ERRLOG("backing device size %" PRIi64 " smaller than expected\n",
+ backing_dev_size);
+ rc = -EILSEQ;
+ goto error;
+ }
+
+ vol->pm_file.size = _get_pm_file_size(&vol->params);
+ vol->pm_file.pm_buf = pmem_map_file(vol->pm_file.path, 0, 0, 0, &mapped_len,
+ &vol->pm_file.pm_is_pmem);
+ if (vol->pm_file.pm_buf == NULL) {
+ SPDK_ERRLOG("could not pmem_map_file(%s): %s\n", vol->pm_file.path, strerror(errno));
+ rc = -errno;
+ goto error;
+ }
+
+ if (vol->pm_file.size != mapped_len) {
+ SPDK_ERRLOG("could not map entire pmem file (size=%" PRIu64 " mapped=%" PRIu64 ")\n",
+ vol->pm_file.size, mapped_len);
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ rc = _allocate_vol_requests(vol);
+ if (rc != 0) {
+ goto error;
+ }
+
+ _initialize_vol_pm_pointers(vol);
+
+ num_chunks = vol->params.vol_size / vol->params.chunk_size;
+ for (i = 0; i < num_chunks; i++) {
+ logical_map_index = vol->pm_logical_map[i];
+ if (logical_map_index == REDUCE_EMPTY_MAP_ENTRY) {
+ continue;
+ }
+ spdk_bit_array_set(vol->allocated_chunk_maps, logical_map_index);
+ chunk = _reduce_vol_get_chunk_map(vol, logical_map_index);
+ for (j = 0; j < vol->backing_io_units_per_chunk; j++) {
+ if (chunk->io_unit_index[j] != REDUCE_EMPTY_MAP_ENTRY) {
+ spdk_bit_array_set(vol->allocated_backing_io_units, chunk->io_unit_index[j]);
+ }
+ }
+ }
+
+ load_ctx->cb_fn(load_ctx->cb_arg, vol, 0);
+ /* Only clean up the ctx - the vol has been passed to the application
+ * for use now that volume load was successful.
+ */
+ _init_load_cleanup(NULL, load_ctx);
+ return;
+
+error:
+ load_ctx->cb_fn(load_ctx->cb_arg, NULL, rc);
+ _init_load_cleanup(vol, load_ctx);
+}
+
+void
+spdk_reduce_vol_load(struct spdk_reduce_backing_dev *backing_dev,
+ spdk_reduce_vol_op_with_handle_complete cb_fn, void *cb_arg)
+{
+ struct spdk_reduce_vol *vol;
+ struct reduce_init_load_ctx *load_ctx;
+
+ if (backing_dev->readv == NULL || backing_dev->writev == NULL ||
+ backing_dev->unmap == NULL) {
+ SPDK_ERRLOG("backing_dev function pointer not specified\n");
+ cb_fn(cb_arg, NULL, -EINVAL);
+ return;
+ }
+
+ vol = calloc(1, sizeof(*vol));
+ if (vol == NULL) {
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ TAILQ_INIT(&vol->free_requests);
+ TAILQ_INIT(&vol->executing_requests);
+ TAILQ_INIT(&vol->queued_requests);
+
+ vol->backing_super = spdk_zmalloc(sizeof(*vol->backing_super), 64, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (vol->backing_super == NULL) {
+ _init_load_cleanup(vol, NULL);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ vol->backing_dev = backing_dev;
+
+ load_ctx = calloc(1, sizeof(*load_ctx));
+ if (load_ctx == NULL) {
+ _init_load_cleanup(vol, NULL);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ load_ctx->path = spdk_zmalloc(REDUCE_PATH_MAX, 64, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (load_ctx->path == NULL) {
+ _init_load_cleanup(vol, load_ctx);
+ cb_fn(cb_arg, NULL, -ENOMEM);
+ return;
+ }
+
+ load_ctx->vol = vol;
+ load_ctx->cb_fn = cb_fn;
+ load_ctx->cb_arg = cb_arg;
+
+ load_ctx->iov[0].iov_base = vol->backing_super;
+ load_ctx->iov[0].iov_len = sizeof(*vol->backing_super);
+ load_ctx->iov[1].iov_base = load_ctx->path;
+ load_ctx->iov[1].iov_len = REDUCE_PATH_MAX;
+ load_ctx->backing_cb_args.cb_fn = _load_read_super_and_path_cpl;
+ load_ctx->backing_cb_args.cb_arg = load_ctx;
+ vol->backing_dev->readv(vol->backing_dev, load_ctx->iov, LOAD_IOV_COUNT, 0,
+ (sizeof(*vol->backing_super) + REDUCE_PATH_MAX) /
+ vol->backing_dev->blocklen,
+ &load_ctx->backing_cb_args);
+}
+
+void
+spdk_reduce_vol_unload(struct spdk_reduce_vol *vol,
+ spdk_reduce_vol_op_complete cb_fn, void *cb_arg)
+{
+ if (vol == NULL) {
+ /* This indicates a programming error. */
+ assert(false);
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ if (--g_vol_count == 0) {
+ spdk_free(g_zero_buf);
+ }
+ assert(g_vol_count >= 0);
+ _init_load_cleanup(vol, NULL);
+ cb_fn(cb_arg, 0);
+}
+
+struct reduce_destroy_ctx {
+ spdk_reduce_vol_op_complete cb_fn;
+ void *cb_arg;
+ struct spdk_reduce_vol *vol;
+ struct spdk_reduce_vol_superblock *super;
+ struct iovec iov;
+ struct spdk_reduce_vol_cb_args backing_cb_args;
+ int reduce_errno;
+ char pm_path[REDUCE_PATH_MAX];
+};
+
+static void
+destroy_unload_cpl(void *cb_arg, int reduce_errno)
+{
+ struct reduce_destroy_ctx *destroy_ctx = cb_arg;
+
+ if (destroy_ctx->reduce_errno == 0) {
+ if (unlink(destroy_ctx->pm_path)) {
+ SPDK_ERRLOG("%s could not be unlinked: %s\n",
+ destroy_ctx->pm_path, strerror(errno));
+ }
+ }
+
+ /* Even if the unload somehow failed, we still pass the destroy_ctx
+ * reduce_errno since that indicates whether or not the volume was
+ * actually destroyed.
+ */
+ destroy_ctx->cb_fn(destroy_ctx->cb_arg, destroy_ctx->reduce_errno);
+ spdk_free(destroy_ctx->super);
+ free(destroy_ctx);
+}
+
+static void
+_destroy_zero_super_cpl(void *cb_arg, int reduce_errno)
+{
+ struct reduce_destroy_ctx *destroy_ctx = cb_arg;
+ struct spdk_reduce_vol *vol = destroy_ctx->vol;
+
+ destroy_ctx->reduce_errno = reduce_errno;
+ spdk_reduce_vol_unload(vol, destroy_unload_cpl, destroy_ctx);
+}
+
+static void
+destroy_load_cb(void *cb_arg, struct spdk_reduce_vol *vol, int reduce_errno)
+{
+ struct reduce_destroy_ctx *destroy_ctx = cb_arg;
+
+ if (reduce_errno != 0) {
+ destroy_ctx->cb_fn(destroy_ctx->cb_arg, reduce_errno);
+ spdk_free(destroy_ctx->super);
+ free(destroy_ctx);
+ return;
+ }
+
+ destroy_ctx->vol = vol;
+ memcpy(destroy_ctx->pm_path, vol->pm_file.path, sizeof(destroy_ctx->pm_path));
+ destroy_ctx->iov.iov_base = destroy_ctx->super;
+ destroy_ctx->iov.iov_len = sizeof(*destroy_ctx->super);
+ destroy_ctx->backing_cb_args.cb_fn = _destroy_zero_super_cpl;
+ destroy_ctx->backing_cb_args.cb_arg = destroy_ctx;
+ vol->backing_dev->writev(vol->backing_dev, &destroy_ctx->iov, 1, 0,
+ sizeof(*destroy_ctx->super) / vol->backing_dev->blocklen,
+ &destroy_ctx->backing_cb_args);
+}
+
+void
+spdk_reduce_vol_destroy(struct spdk_reduce_backing_dev *backing_dev,
+ spdk_reduce_vol_op_complete cb_fn, void *cb_arg)
+{
+ struct reduce_destroy_ctx *destroy_ctx;
+
+ destroy_ctx = calloc(1, sizeof(*destroy_ctx));
+ if (destroy_ctx == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ destroy_ctx->super = spdk_zmalloc(sizeof(*destroy_ctx->super), 64, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (destroy_ctx->super == NULL) {
+ free(destroy_ctx);
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+ destroy_ctx->cb_fn = cb_fn;
+ destroy_ctx->cb_arg = cb_arg;
+ spdk_reduce_vol_load(backing_dev, destroy_load_cb, destroy_ctx);
+}
+
+static bool
+_request_spans_chunk_boundary(struct spdk_reduce_vol *vol, uint64_t offset, uint64_t length)
+{
+ uint64_t start_chunk, end_chunk;
+
+ start_chunk = offset / vol->logical_blocks_per_chunk;
+ end_chunk = (offset + length - 1) / vol->logical_blocks_per_chunk;
+
+ return (start_chunk != end_chunk);
+}
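+
+/*
+ * Example (hypothetical values): with logical_blocks_per_chunk = 32, an I/O at
+ * offset 30 with length 4 touches blocks 30..33, i.e. chunks 0 and 1, so it
+ * spans a chunk boundary and is rejected with -EINVAL by readv/writev below.
+ */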
+
+typedef void (*reduce_request_fn)(void *_req, int reduce_errno);
+
+static void
+_reduce_vol_complete_req(struct spdk_reduce_vol_request *req, int reduce_errno)
+{
+ struct spdk_reduce_vol_request *next_req;
+ struct spdk_reduce_vol *vol = req->vol;
+
+ req->cb_fn(req->cb_arg, reduce_errno);
+ TAILQ_REMOVE(&vol->executing_requests, req, tailq);
+
+ TAILQ_FOREACH(next_req, &vol->queued_requests, tailq) {
+ if (next_req->logical_map_index == req->logical_map_index) {
+ TAILQ_REMOVE(&vol->queued_requests, next_req, tailq);
+ if (next_req->type == REDUCE_IO_READV) {
+ _start_readv_request(next_req);
+ } else {
+ assert(next_req->type == REDUCE_IO_WRITEV);
+ _start_writev_request(next_req);
+ }
+ break;
+ }
+ }
+
+ TAILQ_INSERT_HEAD(&vol->free_requests, req, tailq);
+}
+
+static void
+_write_write_done(void *_req, int reduce_errno)
+{
+ struct spdk_reduce_vol_request *req = _req;
+ struct spdk_reduce_vol *vol = req->vol;
+ uint64_t old_chunk_map_index;
+ struct spdk_reduce_chunk_map *old_chunk;
+ uint32_t i;
+
+ if (reduce_errno != 0) {
+ req->reduce_errno = reduce_errno;
+ }
+
+ assert(req->num_backing_ops > 0);
+ if (--req->num_backing_ops > 0) {
+ return;
+ }
+
+ if (req->reduce_errno != 0) {
+ _reduce_vol_complete_req(req, req->reduce_errno);
+ return;
+ }
+
+ old_chunk_map_index = vol->pm_logical_map[req->logical_map_index];
+ if (old_chunk_map_index != REDUCE_EMPTY_MAP_ENTRY) {
+ old_chunk = _reduce_vol_get_chunk_map(vol, old_chunk_map_index);
+ for (i = 0; i < vol->backing_io_units_per_chunk; i++) {
+ if (old_chunk->io_unit_index[i] == REDUCE_EMPTY_MAP_ENTRY) {
+ break;
+ }
+ assert(spdk_bit_array_get(vol->allocated_backing_io_units, old_chunk->io_unit_index[i]) == true);
+ spdk_bit_array_clear(vol->allocated_backing_io_units, old_chunk->io_unit_index[i]);
+ old_chunk->io_unit_index[i] = REDUCE_EMPTY_MAP_ENTRY;
+ }
+ spdk_bit_array_clear(vol->allocated_chunk_maps, old_chunk_map_index);
+ }
+
+ /*
+ * We don't need to persist the clearing of the old chunk map here. The old chunk map
+ * becomes invalid after we update the logical map, since the old chunk map will no
+ * longer have a reference to it in the logical map.
+ */
+
+ /* Persist the new chunk map. This must be persisted before we update the logical map. */
+ _reduce_persist(vol, req->chunk,
+ _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk));
+
+ vol->pm_logical_map[req->logical_map_index] = req->chunk_map_index;
+
+ _reduce_persist(vol, &vol->pm_logical_map[req->logical_map_index], sizeof(uint64_t));
+
+ _reduce_vol_complete_req(req, 0);
+}
+
+static void
+_issue_backing_ops(struct spdk_reduce_vol_request *req, struct spdk_reduce_vol *vol,
+ reduce_request_fn next_fn, bool is_write)
+{
+ struct iovec *iov;
+ uint8_t *buf;
+ uint32_t i;
+
+ if (req->chunk_is_compressed) {
+ iov = req->comp_buf_iov;
+ buf = req->comp_buf;
+ } else {
+ iov = req->decomp_buf_iov;
+ buf = req->decomp_buf;
+ }
+
+ req->num_backing_ops = req->num_io_units;
+ req->backing_cb_args.cb_fn = next_fn;
+ req->backing_cb_args.cb_arg = req;
+ for (i = 0; i < req->num_io_units; i++) {
+ iov[i].iov_base = buf + i * vol->params.backing_io_unit_size;
+ iov[i].iov_len = vol->params.backing_io_unit_size;
+ if (is_write) {
+ vol->backing_dev->writev(vol->backing_dev, &iov[i], 1,
+ req->chunk->io_unit_index[i] * vol->backing_lba_per_io_unit,
+ vol->backing_lba_per_io_unit, &req->backing_cb_args);
+ } else {
+ vol->backing_dev->readv(vol->backing_dev, &iov[i], 1,
+ req->chunk->io_unit_index[i] * vol->backing_lba_per_io_unit,
+ vol->backing_lba_per_io_unit, &req->backing_cb_args);
+ }
+ }
+}
+
+static void
+_reduce_vol_write_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn,
+ uint32_t compressed_size)
+{
+ struct spdk_reduce_vol *vol = req->vol;
+ uint32_t i;
+ uint64_t chunk_offset, remainder, total_len = 0;
+ uint8_t *buf;
+ int j;
+
+ req->chunk_map_index = spdk_bit_array_find_first_clear(vol->allocated_chunk_maps, 0);
+
+ /* TODO: fail if no chunk map found - but really this should not happen if we
+ * size the number of requests similarly to number of extra chunk maps
+ */
+ assert(req->chunk_map_index != UINT32_MAX);
+ spdk_bit_array_set(vol->allocated_chunk_maps, req->chunk_map_index);
+
+ req->chunk = _reduce_vol_get_chunk_map(vol, req->chunk_map_index);
+ req->num_io_units = spdk_divide_round_up(compressed_size,
+ vol->params.backing_io_unit_size);
+ req->chunk_is_compressed = (req->num_io_units != vol->backing_io_units_per_chunk);
+ req->chunk->compressed_size =
+ req->chunk_is_compressed ? compressed_size : vol->params.chunk_size;
+
+ /* if the chunk is uncompressed we need to copy the data from the host buffers. */
+ if (req->chunk_is_compressed == false) {
+ chunk_offset = req->offset % vol->logical_blocks_per_chunk;
+ buf = req->decomp_buf;
+ total_len = chunk_offset * vol->params.logical_block_size;
+
+ /* zero any offset into chunk */
+ if (req->rmw == false && chunk_offset) {
+ memset(buf, 0, total_len);
+ }
+ buf += total_len;
+
+ /* copy the data */
+ for (j = 0; j < req->iovcnt; j++) {
+ memcpy(buf, req->iov[j].iov_base, req->iov[j].iov_len);
+ buf += req->iov[j].iov_len;
+ total_len += req->iov[j].iov_len;
+ }
+
+ /* zero any remainder */
+ remainder = vol->params.chunk_size - total_len;
+ total_len += remainder;
+ if (req->rmw == false && remainder) {
+ memset(buf, 0, remainder);
+ }
+ assert(total_len == vol->params.chunk_size);
+ }
+
+ for (i = 0; i < req->num_io_units; i++) {
+ req->chunk->io_unit_index[i] = spdk_bit_array_find_first_clear(vol->allocated_backing_io_units, 0);
+ /* TODO: fail if no backing block found - but really this should also not
+ * happen (see comment above).
+ */
+ assert(req->chunk->io_unit_index[i] != UINT32_MAX);
+ spdk_bit_array_set(vol->allocated_backing_io_units, req->chunk->io_unit_index[i]);
+ }
+
+ _issue_backing_ops(req, vol, next_fn, true /* write */);
+}
+
+static void
+_write_compress_done(void *_req, int reduce_errno)
+{
+ struct spdk_reduce_vol_request *req = _req;
+
+ /* Negative reduce_errno indicates failure for compression operations.
+ * Just write the uncompressed data instead. Force this to happen
+ * by just passing the full chunk size to _reduce_vol_write_chunk.
+ * When it sees the data couldn't be compressed, it will just write
+ * the uncompressed buffer to disk.
+ */
+ if (reduce_errno < 0) {
+ reduce_errno = req->vol->params.chunk_size;
+ }
+
+ /* Positive reduce_errno indicates number of bytes in compressed buffer. */
+ _reduce_vol_write_chunk(req, _write_write_done, (uint32_t)reduce_errno);
+}
+
+static void
+_reduce_vol_compress_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn)
+{
+ struct spdk_reduce_vol *vol = req->vol;
+
+ req->backing_cb_args.cb_fn = next_fn;
+ req->backing_cb_args.cb_arg = req;
+ req->comp_buf_iov[0].iov_base = req->comp_buf;
+ req->comp_buf_iov[0].iov_len = vol->params.chunk_size;
+ vol->backing_dev->compress(vol->backing_dev,
+ &req->decomp_iov[0], req->decomp_iovcnt, req->comp_buf_iov, 1,
+ &req->backing_cb_args);
+}
+
+static void
+_reduce_vol_decompress_chunk_scratch(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn)
+{
+ struct spdk_reduce_vol *vol = req->vol;
+
+ req->backing_cb_args.cb_fn = next_fn;
+ req->backing_cb_args.cb_arg = req;
+ req->comp_buf_iov[0].iov_base = req->comp_buf;
+ req->comp_buf_iov[0].iov_len = req->chunk->compressed_size;
+ req->decomp_buf_iov[0].iov_base = req->decomp_buf;
+ req->decomp_buf_iov[0].iov_len = vol->params.chunk_size;
+ vol->backing_dev->decompress(vol->backing_dev,
+ req->comp_buf_iov, 1, req->decomp_buf_iov, 1,
+ &req->backing_cb_args);
+}
+
+static void
+_reduce_vol_decompress_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn)
+{
+ struct spdk_reduce_vol *vol = req->vol;
+ uint64_t chunk_offset, remainder = 0;
+ uint64_t ttl_len = 0;
+ int i;
+
+ req->decomp_iovcnt = 0;
+ chunk_offset = req->offset % vol->logical_blocks_per_chunk;
+
+ if (chunk_offset) {
+ /* the first iov points to our scratch buffer for any offset into the chunk */
+ req->decomp_iov[0].iov_base = req->decomp_buf;
+ req->decomp_iov[0].iov_len = chunk_offset * vol->params.logical_block_size;
+ ttl_len += req->decomp_iov[0].iov_len;
+ req->decomp_iovcnt = 1;
+ }
+
+ /* now the user data iov, direct to the user buffer */
+ for (i = 0; i < req->iovcnt; i++) {
+ req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base;
+ req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len;
+ ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len;
+ }
+ req->decomp_iovcnt += req->iovcnt;
+
+ /* send the rest of the chunk to our scratch buffer */
+ remainder = vol->params.chunk_size - ttl_len;
+ if (remainder) {
+ req->decomp_iov[req->decomp_iovcnt].iov_base = req->decomp_buf + ttl_len;
+ req->decomp_iov[req->decomp_iovcnt].iov_len = remainder;
+ ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len;
+ req->decomp_iovcnt++;
+ }
+ assert(ttl_len == vol->params.chunk_size);
+
+ req->backing_cb_args.cb_fn = next_fn;
+ req->backing_cb_args.cb_arg = req;
+ req->comp_buf_iov[0].iov_base = req->comp_buf;
+ req->comp_buf_iov[0].iov_len = req->chunk->compressed_size;
+ vol->backing_dev->decompress(vol->backing_dev,
+ req->comp_buf_iov, 1, &req->decomp_iov[0], req->decomp_iovcnt,
+ &req->backing_cb_args);
+}
+
+static void
+_write_decompress_done(void *_req, int reduce_errno)
+{
+ struct spdk_reduce_vol_request *req = _req;
+ struct spdk_reduce_vol *vol = req->vol;
+ uint64_t chunk_offset, remainder, ttl_len = 0;
+ int i;
+
+ /* Negative reduce_errno indicates failure for compression operations. */
+ if (reduce_errno < 0) {
+ _reduce_vol_complete_req(req, reduce_errno);
+ return;
+ }
+
+ /* Positive reduce_errno indicates number of bytes in decompressed
+ * buffer. This should equal the chunk size - otherwise that's another
+ * type of failure.
+ */
+ if ((uint32_t)reduce_errno != vol->params.chunk_size) {
+ _reduce_vol_complete_req(req, -EIO);
+ return;
+ }
+
+ req->decomp_iovcnt = 0;
+ chunk_offset = req->offset % vol->logical_blocks_per_chunk;
+
+ if (chunk_offset) {
+ req->decomp_iov[0].iov_base = req->decomp_buf;
+ req->decomp_iov[0].iov_len = chunk_offset * vol->params.logical_block_size;
+ ttl_len += req->decomp_iov[0].iov_len;
+ req->decomp_iovcnt = 1;
+ }
+
+ for (i = 0; i < req->iovcnt; i++) {
+ req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base;
+ req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len;
+ ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len;
+ }
+ req->decomp_iovcnt += req->iovcnt;
+
+ remainder = vol->params.chunk_size - ttl_len;
+ if (remainder) {
+ req->decomp_iov[req->decomp_iovcnt].iov_base = req->decomp_buf + ttl_len;
+ req->decomp_iov[req->decomp_iovcnt].iov_len = remainder;
+ ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len;
+ req->decomp_iovcnt++;
+ }
+ assert(ttl_len == vol->params.chunk_size);
+
+ _reduce_vol_compress_chunk(req, _write_compress_done);
+}
+
+static void
+_write_read_done(void *_req, int reduce_errno)
+{
+ struct spdk_reduce_vol_request *req = _req;
+
+ if (reduce_errno != 0) {
+ req->reduce_errno = reduce_errno;
+ }
+
+ assert(req->num_backing_ops > 0);
+ if (--req->num_backing_ops > 0) {
+ return;
+ }
+
+ if (req->reduce_errno != 0) {
+ _reduce_vol_complete_req(req, req->reduce_errno);
+ return;
+ }
+
+ if (req->chunk_is_compressed) {
+ _reduce_vol_decompress_chunk_scratch(req, _write_decompress_done);
+ } else {
+ _write_decompress_done(req, req->chunk->compressed_size);
+ }
+}
+
+static void
+_read_decompress_done(void *_req, int reduce_errno)
+{
+ struct spdk_reduce_vol_request *req = _req;
+ struct spdk_reduce_vol *vol = req->vol;
+
+ /* Negative reduce_errno indicates failure for compression operations. */
+ if (reduce_errno < 0) {
+ _reduce_vol_complete_req(req, reduce_errno);
+ return;
+ }
+
+ /* Positive reduce_errno indicates number of bytes in decompressed
+ * buffer. This should equal the chunk size - otherwise that's another
+ * type of failure.
+ */
+ if ((uint32_t)reduce_errno != vol->params.chunk_size) {
+ _reduce_vol_complete_req(req, -EIO);
+ return;
+ }
+
+ _reduce_vol_complete_req(req, 0);
+}
+
+static void
+_read_read_done(void *_req, int reduce_errno)
+{
+ struct spdk_reduce_vol_request *req = _req;
+ uint64_t chunk_offset;
+ uint8_t *buf;
+ int i;
+
+ if (reduce_errno != 0) {
+ req->reduce_errno = reduce_errno;
+ }
+
+ assert(req->num_backing_ops > 0);
+ if (--req->num_backing_ops > 0) {
+ return;
+ }
+
+ if (req->reduce_errno != 0) {
+ _reduce_vol_complete_req(req, req->reduce_errno);
+ return;
+ }
+
+ if (req->chunk_is_compressed) {
+ _reduce_vol_decompress_chunk(req, _read_decompress_done);
+ } else {
+ /* If the chunk had been compressed, the data would have been copied to the
+ * host buffers by the decompression operation. Since it is not compressed,
+ * we need to memcpy from the scratch buffer here.
+ */
+ chunk_offset = req->offset % req->vol->logical_blocks_per_chunk;
+ buf = req->decomp_buf + chunk_offset * req->vol->params.logical_block_size;
+ for (i = 0; i < req->iovcnt; i++) {
+ memcpy(req->iov[i].iov_base, buf, req->iov[i].iov_len);
+ buf += req->iov[i].iov_len;
+ }
+
+ _read_decompress_done(req, req->chunk->compressed_size);
+ }
+}
+
+static void
+_reduce_vol_read_chunk(struct spdk_reduce_vol_request *req, reduce_request_fn next_fn)
+{
+ struct spdk_reduce_vol *vol = req->vol;
+
+ req->chunk_map_index = vol->pm_logical_map[req->logical_map_index];
+ assert(req->chunk_map_index != UINT32_MAX);
+
+ req->chunk = _reduce_vol_get_chunk_map(vol, req->chunk_map_index);
+ req->num_io_units = spdk_divide_round_up(req->chunk->compressed_size,
+ vol->params.backing_io_unit_size);
+ req->chunk_is_compressed = (req->num_io_units != vol->backing_io_units_per_chunk);
+
+ _issue_backing_ops(req, vol, next_fn, false /* read */);
+}
+
+static bool
+_iov_array_is_valid(struct spdk_reduce_vol *vol, struct iovec *iov, int iovcnt,
+ uint64_t length)
+{
+ uint64_t size = 0;
+ int i;
+
+ if (iovcnt > REDUCE_MAX_IOVECS) {
+ return false;
+ }
+
+ for (i = 0; i < iovcnt; i++) {
+ size += iov[i].iov_len;
+ }
+
+ return size == (length * vol->params.logical_block_size);
+}
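+
+/*
+ * Example (hypothetical values): length is expressed in logical blocks, so with
+ * logical_block_size = 512 a readv/writev of length 8 must supply iovecs whose
+ * iov_len values sum to exactly 4,096 bytes, using at most REDUCE_MAX_IOVECS
+ * entries.
+ */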
+
+static bool
+_check_overlap(struct spdk_reduce_vol *vol, uint64_t logical_map_index)
+{
+ struct spdk_reduce_vol_request *req;
+
+ TAILQ_FOREACH(req, &vol->executing_requests, tailq) {
+ if (logical_map_index == req->logical_map_index) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void
+_start_readv_request(struct spdk_reduce_vol_request *req)
+{
+ TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq);
+ _reduce_vol_read_chunk(req, _read_read_done);
+}
+
+void
+spdk_reduce_vol_readv(struct spdk_reduce_vol *vol,
+ struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
+ spdk_reduce_vol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_reduce_vol_request *req;
+ uint64_t logical_map_index;
+ bool overlapped;
+ int i;
+
+ if (length == 0) {
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ if (_request_spans_chunk_boundary(vol, offset, length)) {
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ if (!_iov_array_is_valid(vol, iov, iovcnt, length)) {
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ logical_map_index = offset / vol->logical_blocks_per_chunk;
+ overlapped = _check_overlap(vol, logical_map_index);
+
+ if (!overlapped && vol->pm_logical_map[logical_map_index] == REDUCE_EMPTY_MAP_ENTRY) {
+ /*
+ * This chunk hasn't been allocated. So treat the data as all
+ * zeroes for this chunk - do the memset and immediately complete
+ * the operation.
+ */
+ for (i = 0; i < iovcnt; i++) {
+ memset(iov[i].iov_base, 0, iov[i].iov_len);
+ }
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ req = TAILQ_FIRST(&vol->free_requests);
+ if (req == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ TAILQ_REMOVE(&vol->free_requests, req, tailq);
+ req->type = REDUCE_IO_READV;
+ req->vol = vol;
+ req->iov = iov;
+ req->iovcnt = iovcnt;
+ req->offset = offset;
+ req->logical_map_index = logical_map_index;
+ req->length = length;
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ if (!overlapped) {
+ _start_readv_request(req);
+ } else {
+ TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq);
+ }
+}
+
+static void
+_start_writev_request(struct spdk_reduce_vol_request *req)
+{
+ struct spdk_reduce_vol *vol = req->vol;
+ uint64_t chunk_offset, ttl_len = 0;
+ uint64_t remainder = 0;
+ uint32_t lbsize;
+ int i;
+
+ TAILQ_INSERT_TAIL(&req->vol->executing_requests, req, tailq);
+ if (vol->pm_logical_map[req->logical_map_index] != REDUCE_EMPTY_MAP_ENTRY) {
+ if ((req->length * vol->params.logical_block_size) < vol->params.chunk_size) {
+ /* Read old chunk, then overwrite with data from this write
+ * operation.
+ */
+ req->rmw = true;
+ _reduce_vol_read_chunk(req, _write_read_done);
+ return;
+ }
+ }
+
+ lbsize = vol->params.logical_block_size;
+ req->decomp_iovcnt = 0;
+ req->rmw = false;
+
+ /* Note: point to our zero buf for offset into the chunk. */
+ chunk_offset = req->offset % vol->logical_blocks_per_chunk;
+ if (chunk_offset != 0) {
+ ttl_len += chunk_offset * lbsize;
+ req->decomp_iov[0].iov_base = g_zero_buf;
+ req->decomp_iov[0].iov_len = ttl_len;
+ req->decomp_iovcnt = 1;
+ }
+
+ /* now the user data iov, direct from the user buffer */
+ for (i = 0; i < req->iovcnt; i++) {
+ req->decomp_iov[i + req->decomp_iovcnt].iov_base = req->iov[i].iov_base;
+ req->decomp_iov[i + req->decomp_iovcnt].iov_len = req->iov[i].iov_len;
+ ttl_len += req->decomp_iov[i + req->decomp_iovcnt].iov_len;
+ }
+ req->decomp_iovcnt += req->iovcnt;
+
+ remainder = vol->params.chunk_size - ttl_len;
+ if (remainder) {
+ req->decomp_iov[req->decomp_iovcnt].iov_base = g_zero_buf;
+ req->decomp_iov[req->decomp_iovcnt].iov_len = remainder;
+ ttl_len += req->decomp_iov[req->decomp_iovcnt].iov_len;
+ req->decomp_iovcnt++;
+ }
+ assert(ttl_len == req->vol->params.chunk_size);
+
+ _reduce_vol_compress_chunk(req, _write_compress_done);
+}
+
+void
+spdk_reduce_vol_writev(struct spdk_reduce_vol *vol,
+ struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
+ spdk_reduce_vol_op_complete cb_fn, void *cb_arg)
+{
+ struct spdk_reduce_vol_request *req;
+ uint64_t logical_map_index;
+ bool overlapped;
+
+ if (length == 0) {
+ cb_fn(cb_arg, 0);
+ return;
+ }
+
+ if (_request_spans_chunk_boundary(vol, offset, length)) {
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ if (!_iov_array_is_valid(vol, iov, iovcnt, length)) {
+ cb_fn(cb_arg, -EINVAL);
+ return;
+ }
+
+ logical_map_index = offset / vol->logical_blocks_per_chunk;
+ overlapped = _check_overlap(vol, logical_map_index);
+
+ req = TAILQ_FIRST(&vol->free_requests);
+ if (req == NULL) {
+ cb_fn(cb_arg, -ENOMEM);
+ return;
+ }
+
+ TAILQ_REMOVE(&vol->free_requests, req, tailq);
+ req->type = REDUCE_IO_WRITEV;
+ req->vol = vol;
+ req->iov = iov;
+ req->iovcnt = iovcnt;
+ req->offset = offset;
+ req->logical_map_index = logical_map_index;
+ req->length = length;
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+
+ if (!overlapped) {
+ _start_writev_request(req);
+ } else {
+ TAILQ_INSERT_TAIL(&vol->queued_requests, req, tailq);
+ }
+}
+
+const struct spdk_reduce_vol_params *
+spdk_reduce_vol_get_params(struct spdk_reduce_vol *vol)
+{
+ return &vol->params;
+}
+
+void
+spdk_reduce_vol_print_info(struct spdk_reduce_vol *vol)
+{
+ uint64_t logical_map_size, num_chunks, ttl_chunk_sz;
+ uint32_t struct_size;
+ uint64_t chunk_map_size;
+
+ SPDK_NOTICELOG("vol info:\n");
+ SPDK_NOTICELOG("\tvol->params.backing_io_unit_size = 0x%x\n", vol->params.backing_io_unit_size);
+ SPDK_NOTICELOG("\tvol->params.logical_block_size = 0x%x\n", vol->params.logical_block_size);
+ SPDK_NOTICELOG("\tvol->params.chunk_size = 0x%x\n", vol->params.chunk_size);
+ SPDK_NOTICELOG("\tvol->params.vol_size = 0x%" PRIx64 "\n", vol->params.vol_size);
+ num_chunks = _get_total_chunks(vol->params.vol_size, vol->params.chunk_size);
+ SPDK_NOTICELOG("\ttotal chunks (including extra) = 0x%" PRIx64 "\n", num_chunks);
+ SPDK_NOTICELOG("\ttotal chunks (excluding extra) = 0x%" PRIx64 "\n",
+ vol->params.vol_size / vol->params.chunk_size);
+ ttl_chunk_sz = _get_pm_total_chunks_size(vol->params.vol_size, vol->params.chunk_size,
+ vol->params.backing_io_unit_size);
+ SPDK_NOTICELOG("\ttotal_chunks_size = 0x%" PRIx64 "\n", ttl_chunk_sz);
+ struct_size = _reduce_vol_get_chunk_struct_size(vol->backing_io_units_per_chunk);
+ SPDK_NOTICELOG("\tchunk_struct_size = 0x%x\n", struct_size);
+
+ SPDK_NOTICELOG("pmem info:\n");
+ SPDK_NOTICELOG("\tvol->pm_file.size = 0x%" PRIx64 "\n", vol->pm_file.size);
+ SPDK_NOTICELOG("\tvol->pm_file.pm_buf = %p\n", (void *)vol->pm_file.pm_buf);
+ SPDK_NOTICELOG("\tvol->pm_super = %p\n", (void *)vol->pm_super);
+ SPDK_NOTICELOG("\tvol->pm_logical_map = %p\n", (void *)vol->pm_logical_map);
+ logical_map_size = _get_pm_logical_map_size(vol->params.vol_size,
+ vol->params.chunk_size);
+ SPDK_NOTICELOG("\tlogical_map_size = 0x%" PRIx64 "\n", logical_map_size);
+ SPDK_NOTICELOG("\tvol->pm_chunk_maps = %p\n", (void *)vol->pm_chunk_maps);
+ chunk_map_size = _get_pm_total_chunks_size(vol->params.vol_size, vol->params.chunk_size,
+ vol->params.backing_io_unit_size);
+ SPDK_NOTICELOG("\tchunk_map_size = 0x%" PRIx64 "\n", chunk_map_size);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("reduce", SPDK_LOG_REDUCE)
diff --git a/src/spdk/lib/reduce/spdk_reduce.map b/src/spdk/lib/reduce/spdk_reduce.map
new file mode 100644
index 000000000..c53792710
--- /dev/null
+++ b/src/spdk/lib/reduce/spdk_reduce.map
@@ -0,0 +1,16 @@
+{
+ global:
+
+ # public functions
+ spdk_reduce_vol_get_uuid;
+ spdk_reduce_vol_init;
+ spdk_reduce_vol_load;
+ spdk_reduce_vol_unload;
+ spdk_reduce_vol_destroy;
+ spdk_reduce_vol_readv;
+ spdk_reduce_vol_writev;
+ spdk_reduce_vol_get_params;
+ spdk_reduce_vol_print_info;
+
+ local: *;
+};
diff --git a/src/spdk/lib/rocksdb/env_spdk.cc b/src/spdk/lib/rocksdb/env_spdk.cc
new file mode 100644
index 000000000..8695acca6
--- /dev/null
+++ b/src/spdk/lib/rocksdb/env_spdk.cc
@@ -0,0 +1,798 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "rocksdb/env.h"
+#include <set>
+#include <iostream>
+#include <stdexcept>
+
+extern "C" {
+#include "spdk/env.h"
+#include "spdk/event.h"
+#include "spdk/blob.h"
+#include "spdk/blobfs.h"
+#include "spdk/blob_bdev.h"
+#include "spdk/log.h"
+#include "spdk/thread.h"
+#include "spdk/bdev.h"
+
+#include "spdk_internal/thread.h"
+}
+
+namespace rocksdb
+{
+
+struct spdk_filesystem *g_fs = NULL;
+struct spdk_bs_dev *g_bs_dev;
+uint32_t g_lcore = 0;
+std::string g_bdev_name;
+volatile bool g_spdk_ready = false;
+volatile bool g_spdk_start_failure = false;
+
+void SpdkInitializeThread(void);
+
+class SpdkThreadCtx
+{
+public:
+ struct spdk_fs_thread_ctx *channel;
+
+ SpdkThreadCtx(void) : channel(NULL)
+ {
+ SpdkInitializeThread();
+ }
+
+ ~SpdkThreadCtx(void)
+ {
+ if (channel) {
+ spdk_fs_free_thread_ctx(channel);
+ channel = NULL;
+ }
+ }
+
+private:
+ SpdkThreadCtx(const SpdkThreadCtx &);
+ SpdkThreadCtx &operator=(const SpdkThreadCtx &);
+};
+
+thread_local SpdkThreadCtx g_sync_args;
+
+static void
+set_channel()
+{
+ struct spdk_thread *thread;
+
+ if (g_fs != NULL && g_sync_args.channel == NULL) {
+ thread = spdk_thread_create("spdK_rocksdb", NULL);
+ spdk_set_thread(thread);
+ g_sync_args.channel = spdk_fs_alloc_thread_ctx(g_fs);
+ }
+}
+
+static void
+__call_fn(void *arg1, void *arg2)
+{
+ fs_request_fn fn;
+
+ fn = (fs_request_fn)arg1;
+ fn(arg2);
+}
+
+static void
+__send_request(fs_request_fn fn, void *arg)
+{
+ struct spdk_event *event;
+
+ event = spdk_event_allocate(g_lcore, __call_fn, (void *)fn, arg);
+ spdk_event_call(event);
+}
+
+static std::string
+sanitize_path(const std::string &input, const std::string &mount_directory)
+{
+ int index = 0;
+ std::string name;
+ std::string input_tmp;
+
+ input_tmp = input.substr(mount_directory.length(), input.length());
+ for (const char &c : input_tmp) {
+ if (index == 0) {
+ if (c != '/') {
+ name = name.insert(index, 1, '/');
+ index++;
+ }
+ name = name.insert(index, 1, c);
+ index++;
+ } else {
+ if (name[index - 1] == '/' && c == '/') {
+ continue;
+ } else {
+ name = name.insert(index, 1, c);
+ index++;
+ }
+ }
+ }
+
+ if (name[name.size() - 1] == '/') {
+ name = name.erase(name.size() - 1, 1);
+ }
+ return name;
+}
+
+class SpdkSequentialFile : public SequentialFile
+{
+ struct spdk_file *mFile;
+ uint64_t mOffset;
+public:
+ SpdkSequentialFile(struct spdk_file *file) : mFile(file), mOffset(0) {}
+ virtual ~SpdkSequentialFile();
+
+ virtual Status Read(size_t n, Slice *result, char *scratch) override;
+ virtual Status Skip(uint64_t n) override;
+ virtual Status InvalidateCache(size_t offset, size_t length) override;
+};
+
+SpdkSequentialFile::~SpdkSequentialFile(void)
+{
+ set_channel();
+ spdk_file_close(mFile, g_sync_args.channel);
+}
+
+Status
+SpdkSequentialFile::Read(size_t n, Slice *result, char *scratch)
+{
+ int64_t ret;
+
+ set_channel();
+ ret = spdk_file_read(mFile, g_sync_args.channel, scratch, mOffset, n);
+ if (ret >= 0) {
+ mOffset += ret;
+ *result = Slice(scratch, ret);
+ return Status::OK();
+ } else {
+ errno = -ret;
+ return Status::IOError(spdk_file_get_name(mFile), strerror(errno));
+ }
+}
+
+Status
+SpdkSequentialFile::Skip(uint64_t n)
+{
+ mOffset += n;
+ return Status::OK();
+}
+
+Status
+SpdkSequentialFile::InvalidateCache(__attribute__((unused)) size_t offset,
+ __attribute__((unused)) size_t length)
+{
+ return Status::OK();
+}
+
+class SpdkRandomAccessFile : public RandomAccessFile
+{
+ struct spdk_file *mFile;
+public:
+ SpdkRandomAccessFile(struct spdk_file *file) : mFile(file) {}
+ virtual ~SpdkRandomAccessFile();
+
+ virtual Status Read(uint64_t offset, size_t n, Slice *result, char *scratch) const override;
+ virtual Status InvalidateCache(size_t offset, size_t length) override;
+};
+
+SpdkRandomAccessFile::~SpdkRandomAccessFile(void)
+{
+ set_channel();
+ spdk_file_close(mFile, g_sync_args.channel);
+}
+
+Status
+SpdkRandomAccessFile::Read(uint64_t offset, size_t n, Slice *result, char *scratch) const
+{
+ int64_t rc;
+
+ set_channel();
+ rc = spdk_file_read(mFile, g_sync_args.channel, scratch, offset, n);
+ if (rc >= 0) {
+ *result = Slice(scratch, n);
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(spdk_file_get_name(mFile), strerror(errno));
+ }
+}
+
+Status
+SpdkRandomAccessFile::InvalidateCache(__attribute__((unused)) size_t offset,
+ __attribute__((unused)) size_t length)
+{
+ return Status::OK();
+}
+
+class SpdkWritableFile : public WritableFile
+{
+ struct spdk_file *mFile;
+ uint64_t mSize;
+
+public:
+ SpdkWritableFile(struct spdk_file *file) : mFile(file), mSize(0) {}
+ ~SpdkWritableFile()
+ {
+ if (mFile != NULL) {
+ Close();
+ }
+ }
+
+ virtual void SetIOPriority(Env::IOPriority pri)
+ {
+ if (pri == Env::IO_HIGH) {
+ spdk_file_set_priority(mFile, SPDK_FILE_PRIORITY_HIGH);
+ }
+ }
+
+ virtual Status Truncate(uint64_t size) override
+ {
+ int rc;
+
+ set_channel();
+ rc = spdk_file_truncate(mFile, g_sync_args.channel, size);
+ if (!rc) {
+ mSize = size;
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(spdk_file_get_name(mFile), strerror(errno));
+ }
+ }
+ virtual Status Close() override
+ {
+ set_channel();
+ spdk_file_close(mFile, g_sync_args.channel);
+ mFile = NULL;
+ return Status::OK();
+ }
+ virtual Status Append(const Slice &data) override;
+ virtual Status Flush() override
+ {
+ return Status::OK();
+ }
+ virtual Status Sync() override
+ {
+ int rc;
+
+ set_channel();
+ rc = spdk_file_sync(mFile, g_sync_args.channel);
+ if (!rc) {
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(spdk_file_get_name(mFile), strerror(errno));
+ }
+ }
+ virtual Status Fsync() override
+ {
+ int rc;
+
+ set_channel();
+ rc = spdk_file_sync(mFile, g_sync_args.channel);
+ if (!rc) {
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(spdk_file_get_name(mFile), strerror(errno));
+ }
+ }
+ virtual bool IsSyncThreadSafe() const override
+ {
+ return true;
+ }
+ virtual uint64_t GetFileSize() override
+ {
+ return mSize;
+ }
+ virtual Status InvalidateCache(__attribute__((unused)) size_t offset,
+ __attribute__((unused)) size_t length) override
+ {
+ return Status::OK();
+ }
+ virtual Status Allocate(uint64_t offset, uint64_t len) override
+ {
+ int rc;
+
+ set_channel();
+ rc = spdk_file_truncate(mFile, g_sync_args.channel, offset + len);
+ if (!rc) {
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(spdk_file_get_name(mFile), strerror(errno));
+ }
+ }
+ virtual Status RangeSync(__attribute__((unused)) uint64_t offset,
+ __attribute__((unused)) uint64_t nbytes) override
+ {
+ int rc;
+
+ /*
+ * SPDK BlobFS does not have a range sync operation yet, so just sync
+ * the whole file.
+ */
+ set_channel();
+ rc = spdk_file_sync(mFile, g_sync_args.channel);
+ if (!rc) {
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(spdk_file_get_name(mFile), strerror(errno));
+ }
+ }
+ virtual size_t GetUniqueId(char *id, size_t max_size) const override
+ {
+ int rc;
+
+ rc = spdk_file_get_id(mFile, id, max_size);
+ if (rc < 0) {
+ return 0;
+ } else {
+ return rc;
+ }
+ }
+};
+
+Status
+SpdkWritableFile::Append(const Slice &data)
+{
+ int64_t rc;
+
+ set_channel();
+ rc = spdk_file_write(mFile, g_sync_args.channel, (void *)data.data(), mSize, data.size());
+ if (rc >= 0) {
+ mSize += data.size();
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(spdk_file_get_name(mFile), strerror(errno));
+ }
+}
+
+class SpdkDirectory : public Directory
+{
+public:
+ SpdkDirectory() {}
+ ~SpdkDirectory() {}
+ Status Fsync() override
+ {
+ return Status::OK();
+ }
+};
+
+class SpdkAppStartException : public std::runtime_error
+{
+public:
+ SpdkAppStartException(std::string mess): std::runtime_error(mess) {}
+};
+
+class SpdkEnv : public EnvWrapper
+{
+private:
+ pthread_t mSpdkTid;
+ std::string mDirectory;
+ std::string mConfig;
+ std::string mBdev;
+
+public:
+ SpdkEnv(Env *base_env, const std::string &dir, const std::string &conf,
+ const std::string &bdev, uint64_t cache_size_in_mb);
+
+ virtual ~SpdkEnv();
+
+ virtual Status NewSequentialFile(const std::string &fname,
+ unique_ptr<SequentialFile> *result,
+ const EnvOptions &options) override
+ {
+ if (fname.compare(0, mDirectory.length(), mDirectory) == 0) {
+ struct spdk_file *file;
+ int rc;
+
+ std::string name = sanitize_path(fname, mDirectory);
+ set_channel();
+ rc = spdk_fs_open_file(g_fs, g_sync_args.channel,
+ name.c_str(), 0, &file);
+ if (rc == 0) {
+ result->reset(new SpdkSequentialFile(file));
+ return Status::OK();
+ } else {
+ /* The MyRocks engine treats errno (ENOENT) as a
+ * special condition; to support MySQL, set errno
+ * to the correct value here.
+ */
+ errno = -rc;
+ return Status::IOError(name, strerror(errno));
+ }
+ } else {
+ return EnvWrapper::NewSequentialFile(fname, result, options);
+ }
+ }
+
+ virtual Status NewRandomAccessFile(const std::string &fname,
+ unique_ptr<RandomAccessFile> *result,
+ const EnvOptions &options) override
+ {
+ if (fname.compare(0, mDirectory.length(), mDirectory) == 0) {
+ std::string name = sanitize_path(fname, mDirectory);
+ struct spdk_file *file;
+ int rc;
+
+ set_channel();
+ rc = spdk_fs_open_file(g_fs, g_sync_args.channel,
+ name.c_str(), 0, &file);
+ if (rc == 0) {
+ result->reset(new SpdkRandomAccessFile(file));
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(name, strerror(errno));
+ }
+ } else {
+ return EnvWrapper::NewRandomAccessFile(fname, result, options);
+ }
+ }
+
+ virtual Status NewWritableFile(const std::string &fname,
+ unique_ptr<WritableFile> *result,
+ const EnvOptions &options) override
+ {
+ if (fname.compare(0, mDirectory.length(), mDirectory) == 0) {
+ std::string name = sanitize_path(fname, mDirectory);
+ struct spdk_file *file;
+ int rc;
+
+ set_channel();
+ rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name.c_str(),
+ SPDK_BLOBFS_OPEN_CREATE, &file);
+ if (rc == 0) {
+ result->reset(new SpdkWritableFile(file));
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(name, strerror(errno));
+ }
+ } else {
+ return EnvWrapper::NewWritableFile(fname, result, options);
+ }
+ }
+
+ virtual Status ReuseWritableFile(const std::string &fname,
+ const std::string &old_fname,
+ unique_ptr<WritableFile> *result,
+ const EnvOptions &options) override
+ {
+ return EnvWrapper::ReuseWritableFile(fname, old_fname, result, options);
+ }
+
+ virtual Status NewDirectory(__attribute__((unused)) const std::string &name,
+ unique_ptr<Directory> *result) override
+ {
+ result->reset(new SpdkDirectory());
+ return Status::OK();
+ }
+ virtual Status FileExists(const std::string &fname) override
+ {
+ struct spdk_file_stat stat;
+ int rc;
+ std::string name = sanitize_path(fname, mDirectory);
+
+ set_channel();
+ rc = spdk_fs_file_stat(g_fs, g_sync_args.channel, name.c_str(), &stat);
+ if (rc == 0) {
+ return Status::OK();
+ }
+ return EnvWrapper::FileExists(fname);
+ }
+ virtual Status RenameFile(const std::string &src, const std::string &t) override
+ {
+ int rc;
+ std::string src_name = sanitize_path(src, mDirectory);
+ std::string target_name = sanitize_path(t, mDirectory);
+
+ set_channel();
+ rc = spdk_fs_rename_file(g_fs, g_sync_args.channel,
+ src_name.c_str(), target_name.c_str());
+ if (rc == -ENOENT) {
+ return EnvWrapper::RenameFile(src, t);
+ }
+ return Status::OK();
+ }
+ virtual Status LinkFile(__attribute__((unused)) const std::string &src,
+ __attribute__((unused)) const std::string &t) override
+ {
+ return Status::NotSupported("SpdkEnv does not support LinkFile");
+ }
+ virtual Status GetFileSize(const std::string &fname, uint64_t *size) override
+ {
+ struct spdk_file_stat stat;
+ int rc;
+ std::string name = sanitize_path(fname, mDirectory);
+
+ set_channel();
+ rc = spdk_fs_file_stat(g_fs, g_sync_args.channel, name.c_str(), &stat);
+ if (rc == -ENOENT) {
+ return EnvWrapper::GetFileSize(fname, size);
+ }
+ *size = stat.size;
+ return Status::OK();
+ }
+ virtual Status DeleteFile(const std::string &fname) override
+ {
+ int rc;
+ std::string name = sanitize_path(fname, mDirectory);
+
+ set_channel();
+ rc = spdk_fs_delete_file(g_fs, g_sync_args.channel, name.c_str());
+ if (rc == -ENOENT) {
+ return EnvWrapper::DeleteFile(fname);
+ }
+ return Status::OK();
+ }
+ virtual Status LockFile(const std::string &fname, FileLock **lock) override
+ {
+ std::string name = sanitize_path(fname, mDirectory);
+ int64_t rc;
+
+ set_channel();
+ rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name.c_str(),
+ SPDK_BLOBFS_OPEN_CREATE, (struct spdk_file **)lock);
+ if (!rc) {
+ return Status::OK();
+ } else {
+ errno = -rc;
+ return Status::IOError(name, strerror(errno));
+ }
+ }
+ virtual Status UnlockFile(FileLock *lock) override
+ {
+ set_channel();
+ spdk_file_close((struct spdk_file *)lock, g_sync_args.channel);
+ return Status::OK();
+ }
+ virtual Status GetChildren(const std::string &dir,
+ std::vector<std::string> *result) override
+ {
+ std::string::size_type pos;
+ std::set<std::string> dir_and_file_set;
+ std::string full_path;
+ std::string filename;
+ std::string dir_name;
+
+ if (dir.find("archive") != std::string::npos) {
+ return Status::OK();
+ }
+ if (dir.compare(0, mDirectory.length(), mDirectory) == 0) {
+ spdk_fs_iter iter;
+ struct spdk_file *file;
+ dir_name = sanitize_path(dir, mDirectory);
+
+ iter = spdk_fs_iter_first(g_fs);
+ while (iter != NULL) {
+ file = spdk_fs_iter_get_file(iter);
+ full_path = spdk_file_get_name(file);
+ if (strncmp(dir_name.c_str(), full_path.c_str(), dir_name.length())) {
+ iter = spdk_fs_iter_next(iter);
+ continue;
+ }
+ pos = full_path.find("/", dir_name.length() + 1);
+
+ if (pos != std::string::npos) {
+ filename = full_path.substr(dir_name.length() + 1, pos - dir_name.length() - 1);
+ } else {
+ filename = full_path.substr(dir_name.length() + 1);
+ }
+ dir_and_file_set.insert(filename);
+ iter = spdk_fs_iter_next(iter);
+ }
+
+ for (auto &s : dir_and_file_set) {
+ result->push_back(s);
+ }
+
+ result->push_back(".");
+ result->push_back("..");
+
+ return Status::OK();
+ }
+ return EnvWrapper::GetChildren(dir, result);
+ }
+};
+
+/* The thread local constructor doesn't work for the main thread, since
+ * the filesystem hasn't been loaded yet. So we break out this
+ * SpdkInitializeThread function, so that the main thread can explicitly
+ * call it after the filesystem has been loaded.
+ */
+void SpdkInitializeThread(void)
+{
+ struct spdk_thread *thread;
+
+ if (g_fs != NULL) {
+ if (g_sync_args.channel) {
+ spdk_fs_free_thread_ctx(g_sync_args.channel);
+ }
+ thread = spdk_thread_create("spdk_rocksdb", NULL);
+ spdk_set_thread(thread);
+ g_sync_args.channel = spdk_fs_alloc_thread_ctx(g_fs);
+ }
+}
+
+static void
+fs_load_cb(__attribute__((unused)) void *ctx,
+ struct spdk_filesystem *fs, int fserrno)
+{
+ if (fserrno == 0) {
+ g_fs = fs;
+ }
+ g_spdk_ready = true;
+}
+
+static void
+rocksdb_run(__attribute__((unused)) void *arg1)
+{
+ struct spdk_bdev *bdev;
+
+ bdev = spdk_bdev_get_by_name(g_bdev_name.c_str());
+
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev %s not found\n", g_bdev_name.c_str());
+ exit(1);
+ }
+
+ g_lcore = spdk_env_get_first_core();
+
+ g_bs_dev = spdk_bdev_create_bs_dev(bdev, NULL, NULL);
+ printf("using bdev %s\n", g_bdev_name.c_str());
+ spdk_fs_load(g_bs_dev, __send_request, fs_load_cb, NULL);
+}
+
+static void
+fs_unload_cb(__attribute__((unused)) void *ctx,
+ __attribute__((unused)) int fserrno)
+{
+ assert(fserrno == 0);
+
+ spdk_app_stop(0);
+}
+
+static void
+rocksdb_shutdown(void)
+{
+ if (g_fs != NULL) {
+ spdk_fs_unload(g_fs, fs_unload_cb, NULL);
+ } else {
+ fs_unload_cb(NULL, 0);
+ }
+}
+
+static void *
+initialize_spdk(void *arg)
+{
+ struct spdk_app_opts *opts = (struct spdk_app_opts *)arg;
+ int rc;
+
+ rc = spdk_app_start(opts, rocksdb_run, NULL);
+ /*
+ * TODO: Revisit the case of an internal failure of
+ * spdk_app_start() itself. At this time, the application's
+ * only use of spdk_app_stop() passes zero, i.e. there are
+ * no failure (non-zero) cases, so here we assume an
+ * internal failure occurred and flag it so that we can
+ * throw an exception.
+ */
+ if (rc) {
+ g_spdk_start_failure = true;
+ } else {
+ spdk_app_fini();
+ delete opts;
+ }
+ pthread_exit(NULL);
+
+}
+
+SpdkEnv::SpdkEnv(Env *base_env, const std::string &dir, const std::string &conf,
+ const std::string &bdev, uint64_t cache_size_in_mb)
+ : EnvWrapper(base_env), mDirectory(dir), mConfig(conf), mBdev(bdev)
+{
+ struct spdk_app_opts *opts = new struct spdk_app_opts;
+
+ spdk_app_opts_init(opts);
+ opts->name = "rocksdb";
+ opts->config_file = mConfig.c_str();
+ opts->shutdown_cb = rocksdb_shutdown;
+
+ spdk_fs_set_cache_size(cache_size_in_mb);
+ g_bdev_name = mBdev;
+
+ pthread_create(&mSpdkTid, NULL, &initialize_spdk, opts);
+ while (!g_spdk_ready && !g_spdk_start_failure)
+ ;
+ if (g_spdk_start_failure) {
+ delete opts;
+ throw SpdkAppStartException("spdk_app_start() unable to start rocksdb_run()");
+ }
+
+ SpdkInitializeThread();
+}
+
+SpdkEnv::~SpdkEnv()
+{
+ /* This is a workaround for the RocksDB tests: close any files that
+ * RocksDB did not close itself before the test exits.
+ */
+ if (g_fs != NULL) {
+ spdk_fs_iter iter;
+ struct spdk_file *file;
+
+ if (!g_sync_args.channel) {
+ SpdkInitializeThread();
+ }
+
+ iter = spdk_fs_iter_first(g_fs);
+ while (iter != NULL) {
+ file = spdk_fs_iter_get_file(iter);
+ spdk_file_close(file, g_sync_args.channel);
+ iter = spdk_fs_iter_next(iter);
+ }
+ }
+
+ spdk_app_start_shutdown();
+ pthread_join(mSpdkTid, NULL);
+}
+
+Env *NewSpdkEnv(Env *base_env, const std::string &dir, const std::string &conf,
+ const std::string &bdev, uint64_t cache_size_in_mb)
+{
+ try {
+ SpdkEnv *spdk_env = new SpdkEnv(base_env, dir, conf, bdev, cache_size_in_mb);
+ if (g_fs != NULL) {
+ return spdk_env;
+ } else {
+ delete spdk_env;
+ return NULL;
+ }
+ } catch (SpdkAppStartException &e) {
+ SPDK_ERRLOG("NewSpdkEnv: exception caught: %s", e.what());
+ return NULL;
+ } catch (...) {
+ SPDK_ERRLOG("NewSpdkEnv: default exception caught");
+ return NULL;
+ }
+}
+
+} // namespace rocksdb
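[Editor's note] As a rough illustration of how an application is expected to consume this Env, the sketch below wires NewSpdkEnv() (defined above) into a RocksDB instance. It assumes the SPDK fork of RocksDB exposes a matching declaration; the mount prefix, config path, bdev name, and cache size are placeholder values, not taken from this patch.

    #include <cstdint>
    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/env.h"
    #include "rocksdb/options.h"

    namespace rocksdb {
    /* Declaration matching the definition in env_spdk.cc above. */
    Env *NewSpdkEnv(Env *base_env, const std::string &dir, const std::string &conf,
                    const std::string &bdev, uint64_t cache_size_in_mb);
    }

    int main()
    {
            /* Arguments are assumptions: BlobFS mount prefix, SPDK config file,
             * bdev name, and cache size in MB. */
            rocksdb::Env *spdk_env = rocksdb::NewSpdkEnv(rocksdb::Env::Default(),
                            "/spdk_rocksdb", "/usr/local/etc/spdk/rocksdb.conf",
                            "Nvme0n1", 4096);
            if (spdk_env == nullptr) {
                    return 1;   /* SPDK app failed to start or BlobFS not loaded */
            }

            rocksdb::Options options;
            options.env = spdk_env;            /* route RocksDB file I/O through BlobFS */
            options.create_if_missing = true;

            rocksdb::DB *db = nullptr;
            rocksdb::Status s = rocksdb::DB::Open(options, "/spdk_rocksdb/testdb", &db);
            if (s.ok()) {
                    db->Put(rocksdb::WriteOptions(), "key", "value");
                    delete db;
            }
            delete spdk_env;   /* ~SpdkEnv() shuts the SPDK app down and joins its thread */
            return 0;
    }

Note that only paths under the mount prefix ("/spdk_rocksdb" here) are routed to BlobFS; everything else falls through to the wrapped base Env, as the compare(0, mDirectory.length(), ...) checks above show.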
diff --git a/src/spdk/lib/rocksdb/spdk.rocksdb.mk b/src/spdk/lib/rocksdb/spdk.rocksdb.mk
new file mode 100644
index 000000000..fe498cc39
--- /dev/null
+++ b/src/spdk/lib/rocksdb/spdk.rocksdb.mk
@@ -0,0 +1,70 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+# This snippet will be included into the RocksDB Makefile
+
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+include $(SPDK_ROOT_DIR)/mk/spdk.app.mk
+include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk
+
+CXXFLAGS += -I$(SPDK_DIR)/include -Iinclude/
+
+# The SPDK makefiles turn the missing-declarations warning on, but RocksDB
+# won't compile with it, so turn it off after including the SPDK makefiles.
+CXXFLAGS += -Wno-missing-declarations
+
+# The SPDK Makefiles may turn these options on but we do not want to enable
+# them for the RocksDB source files.
+CXXFLAGS += -fno-profile-arcs -fno-test-coverage
+ifeq ($(CONFIG_UBSAN),y)
+CXXFLAGS += -fno-sanitize=undefined
+endif
+ifeq ($(CONFIG_ASAN),y)
+CXXFLAGS += -fno-sanitize=address
+endif
+
+SPDK_LIB_LIST = $(ALL_MODULES_LIST)
+SPDK_LIB_LIST += $(EVENT_BDEV_SUBSYSTEM)
+SPDK_LIB_LIST += bdev accel event util conf trace log jsonrpc json rpc sock thread notify
+SPDK_LIB_LIST += bdev_rpc blobfs_bdev
+
+AM_LINK += $(SPDK_LIB_LINKER_ARGS) $(ENV_LINKER_ARGS)
+AM_LINK += $(SYS_LIBS)
+
+ifeq ($(CONFIG_UBSAN),y)
+AM_LINK += -fsanitize=undefined
+endif
+
+ifeq ($(CONFIG_COVERAGE),y)
+AM_LINK += -fprofile-arcs -ftest-coverage
+endif
diff --git a/src/spdk/lib/rpc/Makefile b/src/spdk/lib/rpc/Makefile
new file mode 100644
index 000000000..ead36f6ba
--- /dev/null
+++ b/src/spdk/lib/rpc/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = rpc.c
+LIBNAME = rpc
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_rpc.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/rpc/rpc.c b/src/spdk/lib/rpc/rpc.c
new file mode 100644
index 000000000..7182f41e9
--- /dev/null
+++ b/src/spdk/lib/rpc/rpc.c
@@ -0,0 +1,392 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/file.h>
+
+#include "spdk/stdinc.h"
+
+#include "spdk/queue.h"
+#include "spdk/rpc.h"
+#include "spdk/env.h"
+#include "spdk/log.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/version.h"
+
+static struct sockaddr_un g_rpc_listen_addr_unix = {};
+static char g_rpc_lock_path[sizeof(g_rpc_listen_addr_unix.sun_path) + sizeof(".lock")];
+static int g_rpc_lock_fd = -1;
+
+static struct spdk_jsonrpc_server *g_jsonrpc_server = NULL;
+static uint32_t g_rpc_state;
+static bool g_rpcs_correct = true;
+
+struct spdk_rpc_method {
+ const char *name;
+ spdk_rpc_method_handler func;
+ SLIST_ENTRY(spdk_rpc_method) slist;
+ uint32_t state_mask;
+ bool is_deprecated;
+ struct spdk_rpc_method *is_alias_of;
+ bool deprecation_warning_printed;
+};
+
+static SLIST_HEAD(, spdk_rpc_method) g_rpc_methods = SLIST_HEAD_INITIALIZER(g_rpc_methods);
+
+void
+spdk_rpc_set_state(uint32_t state)
+{
+ g_rpc_state = state;
+}
+
+uint32_t
+spdk_rpc_get_state(void)
+{
+ return g_rpc_state;
+}
+
+static struct spdk_rpc_method *
+_get_rpc_method(const struct spdk_json_val *method)
+{
+ struct spdk_rpc_method *m;
+
+ SLIST_FOREACH(m, &g_rpc_methods, slist) {
+ if (spdk_json_strequal(method, m->name)) {
+ return m;
+ }
+ }
+
+ return NULL;
+}
+
+static struct spdk_rpc_method *
+_get_rpc_method_raw(const char *method)
+{
+ struct spdk_json_val method_val;
+
+ method_val.type = SPDK_JSON_VAL_STRING;
+ method_val.len = strlen(method);
+ method_val.start = (char *)method;
+
+ return _get_rpc_method(&method_val);
+}
+
+static void
+jsonrpc_handler(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *method,
+ const struct spdk_json_val *params)
+{
+ struct spdk_rpc_method *m;
+
+ assert(method != NULL);
+
+ m = _get_rpc_method(method);
+ if (m == NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_METHOD_NOT_FOUND, "Method not found");
+ return;
+ }
+
+ if (m->is_alias_of != NULL) {
+ if (m->is_deprecated && !m->deprecation_warning_printed) {
+ SPDK_WARNLOG("RPC method %s is deprecated. Use %s instead.\n", m->name, m->is_alias_of->name);
+ m->deprecation_warning_printed = true;
+ }
+ m = m->is_alias_of;
+ }
+
+ if ((m->state_mask & g_rpc_state) == g_rpc_state) {
+ m->func(request, params);
+ } else {
+ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE,
+ "Method is allowed in any state in the mask (%"PRIx32"),"
+ " but current state is (%"PRIx32")",
+ m->state_mask, g_rpc_state);
+ }
+}
+
+int
+spdk_rpc_listen(const char *listen_addr)
+{
+ int rc;
+
+ memset(&g_rpc_listen_addr_unix, 0, sizeof(g_rpc_listen_addr_unix));
+
+ g_rpc_listen_addr_unix.sun_family = AF_UNIX;
+ rc = snprintf(g_rpc_listen_addr_unix.sun_path,
+ sizeof(g_rpc_listen_addr_unix.sun_path),
+ "%s", listen_addr);
+ if (rc < 0 || (size_t)rc >= sizeof(g_rpc_listen_addr_unix.sun_path)) {
+ SPDK_ERRLOG("RPC Listen address Unix socket path too long\n");
+ g_rpc_listen_addr_unix.sun_path[0] = '\0';
+ return -1;
+ }
+
+ rc = snprintf(g_rpc_lock_path, sizeof(g_rpc_lock_path), "%s.lock",
+ g_rpc_listen_addr_unix.sun_path);
+ if (rc < 0 || (size_t)rc >= sizeof(g_rpc_lock_path)) {
+ SPDK_ERRLOG("RPC lock path too long\n");
+ g_rpc_listen_addr_unix.sun_path[0] = '\0';
+ g_rpc_lock_path[0] = '\0';
+ return -1;
+ }
+
+ g_rpc_lock_fd = open(g_rpc_lock_path, O_RDONLY | O_CREAT, 0600);
+ if (g_rpc_lock_fd == -1) {
+ SPDK_ERRLOG("Cannot open lock file %s: %s\n",
+ g_rpc_lock_path, spdk_strerror(errno));
+ g_rpc_listen_addr_unix.sun_path[0] = '\0';
+ g_rpc_lock_path[0] = '\0';
+ return -1;
+ }
+
+ rc = flock(g_rpc_lock_fd, LOCK_EX | LOCK_NB);
+ if (rc != 0) {
+ SPDK_ERRLOG("RPC Unix domain socket path %s in use. Specify another.\n",
+ g_rpc_listen_addr_unix.sun_path);
+ g_rpc_listen_addr_unix.sun_path[0] = '\0';
+ g_rpc_lock_path[0] = '\0';
+ return -1;
+ }
+
+ /*
+ * Since we acquired the lock, it is safe to delete the Unix socket file
+ * if it still exists from a previous process.
+ */
+ unlink(g_rpc_listen_addr_unix.sun_path);
+
+ g_jsonrpc_server = spdk_jsonrpc_server_listen(AF_UNIX, 0,
+ (struct sockaddr *)&g_rpc_listen_addr_unix,
+ sizeof(g_rpc_listen_addr_unix),
+ jsonrpc_handler);
+ if (g_jsonrpc_server == NULL) {
+ SPDK_ERRLOG("spdk_jsonrpc_server_listen() failed\n");
+ close(g_rpc_lock_fd);
+ g_rpc_lock_fd = -1;
+ unlink(g_rpc_lock_path);
+ g_rpc_lock_path[0] = '\0';
+ return -1;
+ }
+
+ return 0;
+}
+
+void
+spdk_rpc_accept(void)
+{
+ spdk_jsonrpc_server_poll(g_jsonrpc_server);
+}
+
+void
+spdk_rpc_register_method(const char *method, spdk_rpc_method_handler func, uint32_t state_mask)
+{
+ struct spdk_rpc_method *m;
+
+ m = _get_rpc_method_raw(method);
+ if (m != NULL) {
+ SPDK_ERRLOG("duplicate RPC %s registered...\n", method);
+ g_rpcs_correct = false;
+ return;
+ }
+
+ m = calloc(1, sizeof(struct spdk_rpc_method));
+ assert(m != NULL);
+
+ m->name = strdup(method);
+ assert(m->name != NULL);
+
+ m->func = func;
+ m->state_mask = state_mask;
+
+ /* TODO: use a hash table or sorted list */
+ SLIST_INSERT_HEAD(&g_rpc_methods, m, slist);
+}
+
+void
+spdk_rpc_register_alias_deprecated(const char *method, const char *alias)
+{
+ struct spdk_rpc_method *m, *base;
+
+ base = _get_rpc_method_raw(method);
+ if (base == NULL) {
+ SPDK_ERRLOG("cannot create alias %s - method %s does not exist\n",
+ alias, method);
+ g_rpcs_correct = false;
+ return;
+ }
+
+ if (base->is_alias_of != NULL) {
+ SPDK_ERRLOG("cannot create alias %s of alias %s\n", alias, method);
+ g_rpcs_correct = false;
+ return;
+ }
+
+ m = calloc(1, sizeof(struct spdk_rpc_method));
+ assert(m != NULL);
+
+ m->name = strdup(alias);
+ assert(m->name != NULL);
+
+ m->is_alias_of = base;
+ m->is_deprecated = true;
+ m->state_mask = base->state_mask;
+
+ /* TODO: use a hash table or sorted list */
+ SLIST_INSERT_HEAD(&g_rpc_methods, m, slist);
+}
+
+bool
+spdk_rpc_verify_methods(void)
+{
+ return g_rpcs_correct;
+}
+
+int
+spdk_rpc_is_method_allowed(const char *method, uint32_t state_mask)
+{
+ struct spdk_rpc_method *m;
+
+ SLIST_FOREACH(m, &g_rpc_methods, slist) {
+ if (strcmp(m->name, method) != 0) {
+ continue;
+ }
+
+ if ((m->state_mask & state_mask) == state_mask) {
+ return 0;
+ } else {
+ return -EPERM;
+ }
+ }
+
+ return -ENOENT;
+}
+
+void
+spdk_rpc_close(void)
+{
+ if (g_jsonrpc_server) {
+ if (g_rpc_listen_addr_unix.sun_path[0]) {
+ /* Delete the Unix socket file */
+ unlink(g_rpc_listen_addr_unix.sun_path);
+ g_rpc_listen_addr_unix.sun_path[0] = '\0';
+ }
+
+ spdk_jsonrpc_server_shutdown(g_jsonrpc_server);
+ g_jsonrpc_server = NULL;
+
+ if (g_rpc_lock_fd != -1) {
+ close(g_rpc_lock_fd);
+ g_rpc_lock_fd = -1;
+ }
+
+ if (g_rpc_lock_path[0]) {
+ unlink(g_rpc_lock_path);
+ g_rpc_lock_path[0] = '\0';
+ }
+ }
+}
+
+struct rpc_get_methods {
+ bool current;
+ bool include_aliases;
+};
+
+static const struct spdk_json_object_decoder rpc_get_methods_decoders[] = {
+ {"current", offsetof(struct rpc_get_methods, current), spdk_json_decode_bool, true},
+ {"include_aliases", offsetof(struct rpc_get_methods, include_aliases), spdk_json_decode_bool, true},
+};
+
+static void
+rpc_get_methods(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
+{
+ struct rpc_get_methods req = {};
+ struct spdk_json_write_ctx *w;
+ struct spdk_rpc_method *m;
+
+ if (params != NULL) {
+ if (spdk_json_decode_object(params, rpc_get_methods_decoders,
+ SPDK_COUNTOF(rpc_get_methods_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+ SLIST_FOREACH(m, &g_rpc_methods, slist) {
+ if (m->is_alias_of != NULL && !req.include_aliases) {
+ continue;
+ }
+ if (req.current && ((m->state_mask & g_rpc_state) != g_rpc_state)) {
+ continue;
+ }
+ spdk_json_write_string(w, m->name);
+ }
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("rpc_get_methods", rpc_get_methods, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(rpc_get_methods, get_rpc_methods)
+
+static void
+rpc_spdk_get_version(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "spdk_get_version method requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string_fmt(w, "version", "%s", SPDK_VERSION_STRING);
+ spdk_json_write_named_object_begin(w, "fields");
+ spdk_json_write_named_uint32(w, "major", SPDK_VERSION_MAJOR);
+ spdk_json_write_named_uint32(w, "minor", SPDK_VERSION_MINOR);
+ spdk_json_write_named_uint32(w, "patch", SPDK_VERSION_PATCH);
+ spdk_json_write_named_string_fmt(w, "suffix", "%s", SPDK_VERSION_SUFFIX);
+#ifdef SPDK_GIT_COMMIT
+ spdk_json_write_named_string_fmt(w, "commit", "%s", SPDK_GIT_COMMIT_STRING);
+#endif
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("spdk_get_version", rpc_spdk_get_version,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(spdk_get_version, get_spdk_version)
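[Editor's note] For comparison, a new RPC is wired up with the same pattern as rpc_get_methods and spdk_get_version above; the method name and reply fields in this sketch are hypothetical and not part of this patch.

    #include "spdk/rpc.h"
    #include "spdk/jsonrpc.h"
    #include "spdk/json.h"

    /* Hypothetical handler: replies with a fixed status object. */
    static void
    rpc_example_get_status(struct spdk_jsonrpc_request *request,
                           const struct spdk_json_val *params)
    {
            struct spdk_json_write_ctx *w;

            if (params != NULL) {
                    spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
                                                     "example_get_status requires no parameters");
                    return;
            }

            w = spdk_jsonrpc_begin_result(request);
            spdk_json_write_object_begin(w);
            spdk_json_write_named_string(w, "status", "ok");
            spdk_json_write_object_end(w);
            spdk_jsonrpc_end_result(request, w);
    }
    /* Allow the method in both the STARTUP and RUNTIME states. */
    SPDK_RPC_REGISTER("example_get_status", rpc_example_get_status,
                      SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)

A JSON-RPC client would then invoke it by sending {"jsonrpc": "2.0", "method": "example_get_status", "id": 1} over the Unix domain socket that spdk_rpc_listen() created.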
diff --git a/src/spdk/lib/rpc/spdk_rpc.map b/src/spdk/lib/rpc/spdk_rpc.map
new file mode 100644
index 000000000..e15ff8b53
--- /dev/null
+++ b/src/spdk/lib/rpc/spdk_rpc.map
@@ -0,0 +1,16 @@
+{
+ global:
+
+ # public functions
+ spdk_rpc_verify_methods;
+ spdk_rpc_listen;
+ spdk_rpc_accept;
+ spdk_rpc_close;
+ spdk_rpc_register_method;
+ spdk_rpc_register_alias_deprecated;
+ spdk_rpc_is_method_allowed;
+ spdk_rpc_set_state;
+ spdk_rpc_get_state;
+
+ local: *;
+};
diff --git a/src/spdk/lib/rte_vhost/Makefile b/src/spdk/lib/rte_vhost/Makefile
new file mode 100644
index 000000000..aa073c6ca
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/Makefile
@@ -0,0 +1,50 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += -I.
+CFLAGS += $(ENV_CFLAGS)
+CFLAGS += -include rte_config.h
+CFLAGS += -Wno-address-of-packed-member
+
+# These are the DPDK vhost files copied (for now) into SPDK
+C_SRCS += fd_man.c socket.c vhost_user.c vhost.c
+
+LIBNAME = rte_vhost
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/rte_vhost/fd_man.c b/src/spdk/lib/rte_vhost/fd_man.c
new file mode 100644
index 000000000..2ceacc9ab
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/fd_man.c
@@ -0,0 +1,300 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+
+#include "fd_man.h"
+
+#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL)
+
+static int
+get_last_valid_idx(struct fdset *pfdset, int last_valid_idx)
+{
+ int i;
+
+ for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--)
+ ;
+
+ return i;
+}
+
+static void
+fdset_move(struct fdset *pfdset, int dst, int src)
+{
+ pfdset->fd[dst] = pfdset->fd[src];
+ pfdset->rwfds[dst] = pfdset->rwfds[src];
+}
+
+static void
+fdset_shrink_nolock(struct fdset *pfdset)
+{
+ int i;
+ int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1);
+
+ for (i = 0; i < last_valid_idx; i++) {
+ if (pfdset->fd[i].fd != -1)
+ continue;
+
+ fdset_move(pfdset, i, last_valid_idx);
+ last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1);
+ }
+ pfdset->num = last_valid_idx + 1;
+}
+
+/*
+ * Find deleted fd entries and remove them
+ */
+static void
+fdset_shrink(struct fdset *pfdset)
+{
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ fdset_shrink_nolock(pfdset);
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+}
+
+/**
+ * Returns the index in the fdset for a given fd.
+ * @return
+ * index for the fd, or -1 if fd isn't in the fdset.
+ */
+static int
+fdset_find_fd(struct fdset *pfdset, int fd)
+{
+ int i;
+
+ for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++)
+ ;
+
+ return i == pfdset->num ? -1 : i;
+}
+
+static void
+fdset_add_fd(struct fdset *pfdset, int idx, int fd,
+ fd_cb rcb, fd_cb wcb, void *dat)
+{
+ struct fdentry *pfdentry = &pfdset->fd[idx];
+ struct pollfd *pfd = &pfdset->rwfds[idx];
+
+ pfdentry->fd = fd;
+ pfdentry->rcb = rcb;
+ pfdentry->wcb = wcb;
+ pfdentry->dat = dat;
+
+ pfd->fd = fd;
+ pfd->events = rcb ? POLLIN : 0;
+ pfd->events |= wcb ? POLLOUT : 0;
+ pfd->revents = 0;
+}
+
+void
+fdset_init(struct fdset *pfdset)
+{
+ int i;
+
+ if (pfdset == NULL)
+ return;
+
+ for (i = 0; i < MAX_FDS; i++) {
+ pfdset->fd[i].fd = -1;
+ pfdset->fd[i].dat = NULL;
+ }
+ pfdset->num = 0;
+}
+
+/**
+ * Register the fd in the fdset with read/write handler and context.
+ */
+int
+fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat)
+{
+ int i;
+
+ if (pfdset == NULL || fd == -1)
+ return -1;
+
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ i = pfdset->num < MAX_FDS ? pfdset->num++ : -1;
+ if (i == -1) {
+ fdset_shrink_nolock(pfdset);
+ i = pfdset->num < MAX_FDS ? pfdset->num++ : -1;
+ if (i == -1) {
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ return -2;
+ }
+ }
+
+ fdset_add_fd(pfdset, i, fd, rcb, wcb, dat);
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ return 0;
+}
+
+/**
+ * Unregister the fd from the fdset.
+ * Returns context of a given fd or NULL.
+ */
+void *
+fdset_del(struct fdset *pfdset, int fd)
+{
+ int i;
+ void *dat = NULL;
+
+ if (pfdset == NULL || fd == -1)
+ return NULL;
+
+ do {
+ pthread_mutex_lock(&pfdset->fd_mutex);
+
+ i = fdset_find_fd(pfdset, fd);
+ if (i != -1 && pfdset->fd[i].busy == 0) {
+ /* busy indicates r/wcb is executing! */
+ dat = pfdset->fd[i].dat;
+ pfdset->fd[i].fd = -1;
+ pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL;
+ pfdset->fd[i].dat = NULL;
+ i = -1;
+ }
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ } while (i != -1);
+
+ return dat;
+}
+
+
+/**
+ * This function runs in an infinite blocking loop until there is no fd in
+ * pfdset. It calls the corresponding read/write handler when there is an
+ * event on an fd.
+ *
+ * Before a callback is invoked, the entry's busy flag is set; if another
+ * thread (currently rte_vhost_driver_unregister) calls fdset_del concurrently,
+ * it waits until the flag is reset to zero (indicating the callback has
+ * finished) and can then free the context after fdset_del returns.
+ */
+void *
+fdset_event_dispatch(void *arg)
+{
+ int i;
+ struct pollfd *pfd;
+ struct fdentry *pfdentry;
+ fd_cb rcb, wcb;
+ void *dat;
+ int fd, numfds;
+ int remove1, remove2;
+ int need_shrink;
+ struct fdset *pfdset = arg;
+
+ if (pfdset == NULL)
+ return NULL;
+
+ while (1) {
+
+ /*
+ * While poll() is blocked, other threads might unregister listen
+ * fds from, or register new listen fds into, the fdset. When
+ * poll() returns, the fdset entries for those listen fds might
+ * have been updated; a spurious callback for a newly added
+ * listen fd is harmless.
+ */
+ pthread_mutex_lock(&pfdset->fd_mutex);
+ numfds = pfdset->num;
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ poll(pfdset->rwfds, numfds, 1000 /* millisecs */);
+
+ need_shrink = 0;
+ for (i = 0; i < numfds; i++) {
+ pthread_mutex_lock(&pfdset->fd_mutex);
+
+ pfdentry = &pfdset->fd[i];
+ fd = pfdentry->fd;
+ pfd = &pfdset->rwfds[i];
+
+ if (fd < 0) {
+ need_shrink = 1;
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ continue;
+ }
+
+ if (!pfd->revents) {
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+ continue;
+ }
+
+ remove1 = remove2 = 0;
+
+ rcb = pfdentry->rcb;
+ wcb = pfdentry->wcb;
+ dat = pfdentry->dat;
+ pfdentry->busy = 1;
+
+ pthread_mutex_unlock(&pfdset->fd_mutex);
+
+ if (rcb && pfd->revents & (POLLIN | FDPOLLERR))
+ rcb(fd, dat, &remove1);
+ if (wcb && pfd->revents & (POLLOUT | FDPOLLERR))
+ wcb(fd, dat, &remove2);
+ pfdentry->busy = 0;
+ /*
+ * fdset_del needs to check the busy flag, so we do not
+ * allow fdset_del to be called directly from a callback.
+ */
+ /*
+ * When cleaning the fd up from the fdset here, the fd has
+ * already been closed in the callback, so its value could
+ * be reused for a new listen fd created in another thread;
+ * therefore we must not call fdset_del.
+ */
+ if (remove1 || remove2) {
+ pfdentry->fd = -1;
+ need_shrink = 1;
+ }
+ }
+
+ if (need_shrink)
+ fdset_shrink(pfdset);
+ }
+
+ return NULL;
+}
diff --git a/src/spdk/lib/rte_vhost/fd_man.h b/src/spdk/lib/rte_vhost/fd_man.h
new file mode 100644
index 000000000..3a9d269b3
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/fd_man.h
@@ -0,0 +1,69 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FD_MAN_H_
+#define _FD_MAN_H_
+#include <stdint.h>
+#include <pthread.h>
+#include <poll.h>
+
+#define MAX_FDS 1024
+
+typedef void (*fd_cb)(int fd, void *dat, int *remove);
+
+struct fdentry {
+ int fd; /* -1 indicates this entry is empty */
+ fd_cb rcb; /* callback when this fd is readable. */
+ fd_cb wcb; /* callback when this fd is writeable. */
+ void *dat; /* fd context */
+ int busy; /* whether this entry is being used in cb. */
+};
+
+struct fdset {
+ struct pollfd rwfds[MAX_FDS];
+ struct fdentry fd[MAX_FDS];
+ pthread_mutex_t fd_mutex;
+ int num; /* current fd number of this fdset */
+};
+
+
+void fdset_init(struct fdset *pfdset);
+
+int fdset_add(struct fdset *pfdset, int fd,
+ fd_cb rcb, fd_cb wcb, void *dat);
+
+void *fdset_del(struct fdset *pfdset, int fd);
+
+void *fdset_event_dispatch(void *arg);
+
+#endif
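[Editor's note] As a usage sketch under stated assumptions, the fdset is initialized once and then polled forever from a dedicated dispatch thread, which is how the vhost-user socket layer drives these helpers; the connection fd and callback below are placeholders, not code from this patch.

    #include <pthread.h>
    #include <unistd.h>
    #include "fd_man.h"

    static struct fdset g_ex_fdset;

    /* Matches the fd_cb signature: invoked by the dispatcher when fd is readable. */
    static void
    example_read_cb(int fd, void *dat, int *remove)
    {
            char buf[64];
            ssize_t n = read(fd, buf, sizeof(buf));

            (void)dat;
            if (n <= 0) {
                    *remove = 1;    /* tell the dispatcher to drop this entry */
            }
    }

    static int
    example_start(int conn_fd)
    {
            pthread_t tid;

            pthread_mutex_init(&g_ex_fdset.fd_mutex, NULL);
            fdset_init(&g_ex_fdset);

            /* Read callback only; no write callback, no per-fd context. */
            if (fdset_add(&g_ex_fdset, conn_fd, example_read_cb, NULL, NULL) < 0) {
                    return -1;
            }

            /* fdset_event_dispatch() polls the set forever, so run it in its own thread. */
            return pthread_create(&tid, NULL, fdset_event_dispatch, &g_ex_fdset);
    }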
diff --git a/src/spdk/lib/rte_vhost/rte_vhost.h b/src/spdk/lib/rte_vhost/rte_vhost.h
new file mode 100644
index 000000000..b1b7f2cd8
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/rte_vhost.h
@@ -0,0 +1,635 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_VHOST_H_
+#define _RTE_VHOST_H_
+
+/**
+ * @file
+ * Interface to vhost-user
+ */
+
+#include <stdint.h>
+#include <linux/vhost.h>
+#include <linux/virtio_ring.h>
+#include <sys/eventfd.h>
+
+#include <rte_config.h>
+#include <rte_memory.h>
+#include <rte_mempool.h>
+
+#define RTE_VHOST_USER_CLIENT (1ULL << 0)
+#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1)
+#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2)
+
+/**
+ * Information relating to memory regions, including offsets to
+ * addresses in QEMU's memory file.
+ */
+struct rte_vhost_mem_region {
+ uint64_t guest_phys_addr;
+ uint64_t guest_user_addr;
+ uint64_t host_user_addr;
+ uint64_t size;
+ void *mmap_addr;
+ uint64_t mmap_size;
+ int fd;
+};
+
+/**
+ * Memory structure includes region and mapping information.
+ */
+struct rte_vhost_memory {
+ uint32_t nregions;
+ struct rte_vhost_mem_region regions[0];
+};
+
+struct rte_vhost_inflight_desc_split {
+ uint8_t inflight;
+ uint8_t padding[5];
+ uint16_t next;
+ uint64_t counter;
+};
+
+struct rte_vhost_inflight_info_split {
+ uint64_t features;
+ uint16_t version;
+ uint16_t desc_num;
+ uint16_t last_inflight_io;
+ uint16_t used_idx;
+ struct rte_vhost_inflight_desc_split desc[0];
+};
+
+struct rte_vhost_resubmit_desc {
+ uint16_t index;
+ uint64_t counter;
+};
+
+struct rte_vhost_resubmit_info {
+ struct rte_vhost_resubmit_desc *resubmit_list;
+ uint16_t resubmit_num;
+};
+
+struct rte_vhost_ring_inflight {
+ struct rte_vhost_inflight_info_split *inflight_split;
+ struct rte_vhost_resubmit_info *resubmit_inflight;
+};
+
+struct rte_vhost_vring {
+ union {
+ struct vring_desc *desc;
+ struct vring_packed_desc *desc_packed;
+ };
+ union {
+ struct vring_avail *avail;
+ struct vring_packed_desc_event *driver_event;
+ };
+ union {
+ struct vring_used *used;
+ struct vring_packed_desc_event *device_event;
+ };
+ uint64_t log_guest_addr;
+
+ int callfd;
+ int kickfd;
+ uint16_t size;
+};
+
+/**
+ * Device and vring operations.
+ */
+struct vhost_device_ops {
+ int (*new_device)(int vid); /**< Add device. */
+ void (*destroy_device)(int vid); /**< Remove device. */
+
+ int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */
+
+ /**
+ * Features could be changed after the feature negotiation.
+ * For example, VHOST_F_LOG_ALL will be set/cleared at the
+ * start/end of live migration, respectively. This callback
+ * is used to inform the application on such change.
+ */
+ int (*features_changed)(int vid, uint64_t features);
+ int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf);
+ int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd);
+ int (*vhost_nvme_set_bar_mr)(int vid, void *bar_addr, uint64_t bar_size);
+ int (*vhost_nvme_get_cap)(int vid, uint64_t *cap);
+
+ int (*new_connection)(int vid);
+ void (*destroy_connection)(int vid);
+
+ int (*get_config)(int vid, uint8_t *config, uint32_t config_len);
+ int (*set_config)(int vid, uint8_t *config, uint32_t offset,
+ uint32_t len, uint32_t flags);
+
+ void *reserved[2]; /**< Reserved for future extension */
+};
+
+/**
+ * Convert guest physical address to host virtual address
+ *
+ * @param mem
+ * the guest memory regions
+ * @param gpa
+ * the guest physical address for querying
+ * @return
+ * the host virtual address on success, 0 on failure
+ */
+static inline uint64_t __attribute__((always_inline))
+rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa)
+{
+ struct rte_vhost_mem_region *reg;
+ uint32_t i;
+
+ for (i = 0; i < mem->nregions; i++) {
+ reg = &mem->regions[i];
+ if (gpa >= reg->guest_phys_addr &&
+ gpa < reg->guest_phys_addr + reg->size) {
+ return gpa - reg->guest_phys_addr +
+ reg->host_user_addr;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Convert guest physical address to host virtual address safely
+ *
+ * This variant of rte_vhost_gpa_to_vva() ensures that the entire
+ * requested length is mapped and contiguous in the process address
+ * space.
+ *
+ * @param mem
+ * the guest memory regions
+ * @param gpa
+ * the guest physical address for querying
+ * @param len
+ * the size of the requested area to map,
+ * updated with actual size mapped
+ * @return
+ * the host virtual address on success, 0 on failure */
+static inline uint64_t
+rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem,
+ uint64_t gpa, uint64_t *len)
+{
+ struct rte_vhost_mem_region *r;
+ uint32_t i;
+
+ for (i = 0; i < mem->nregions; i++) {
+ r = &mem->regions[i];
+ if (gpa >= r->guest_phys_addr &&
+ gpa < r->guest_phys_addr + r->size) {
+
+ if (unlikely(*len > r->guest_phys_addr + r->size - gpa))
+ *len = r->guest_phys_addr + r->size - gpa;
+
+ return gpa - r->guest_phys_addr +
+ r->host_user_addr;
+ }
+ }
+ *len = 0;
+
+ return 0;
+}
+
+#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL))
+
+/**
+ * Log the start of a memory write at the given address.
+ *
+ * This function only needs to be invoked once live migration starts,
+ * so most of the time it does not need to be called at all. To keep
+ * the performance impact to a minimum, it is suggested to do a check
+ * before calling it:
+ *
+ * if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ * rte_vhost_log_write(vid, addr, len);
+ *
+ * @param vid
+ * vhost device ID
+ * @param addr
+ * the starting address for write
+ * @param len
+ * the length to write
+ */
+void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len);
+
+/**
+ * Log the start of a used ring update at the given offset.
+ *
+ * As with rte_vhost_log_write(), it is suggested to do a check before
+ * calling it:
+ *
+ * if (unlikely(RTE_VHOST_NEED_LOG(features)))
+ * rte_vhost_log_used_vring(vid, vring_idx, offset, len);
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * the vring index
+ * @param offset
+ * the offset inside the used ring
+ * @param len
+ * the length to write
+ */
+void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+ uint64_t offset, uint64_t len);
+
+int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
+
+/**
+ * Register a vhost driver. The path can differ per instance to
+ * support multiple instances.
+ */
+int rte_vhost_driver_register(const char *path, uint64_t flags);
+
+/* Unregister a vhost driver. This is only meaningful for vhost-user. */
+int rte_vhost_driver_unregister(const char *path);
+
+/**
+ * Set the feature bits the vhost-user driver supports.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * Supported features
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_set_features(const char *path, uint64_t features);
+
+/**
+ * Enable vhost-user driver features.
+ *
+ * Note that
+ * - the param @features should be a subset of the feature bits provided
+ * by rte_vhost_driver_set_features().
+ * - it must be invoked before vhost-user negotiation starts.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * Features to enable
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_enable_features(const char *path, uint64_t features);
+
+/**
+ * Disable vhost-user driver features.
+ *
+ * The two notes at rte_vhost_driver_enable_features() also apply here.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * Features to disable
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_disable_features(const char *path, uint64_t features);
+
+/**
+ * Get the feature bits before feature negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @param features
+ * A pointer to store the queried feature bits
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_get_features(const char *path, uint64_t *features);
+
+/**
+ * Get the feature bits after negotiation
+ *
+ * @param vid
+ * Vhost device ID
+ * @param features
+ * A pointer to store the queried feature bits
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_negotiated_features(int vid, uint64_t *features);
+
+/* Register callbacks. */
+int rte_vhost_driver_callback_register(const char *path,
+ struct vhost_device_ops const * const ops);
+
+/**
+ *
+ * Start the vhost-user driver.
+ *
+ * This function triggers the vhost-user negotiation.
+ *
+ * @param path
+ * The vhost-user socket file path
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_driver_start(const char *path);
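+
+/*
+ * Illustrative sketch (not part of the API): a typical setup sequence
+ * registers the socket, advertises the supported features and callbacks,
+ * and then starts the driver. The names "my_socket_path", "MY_FEATURES"
+ * and "my_ops" are placeholders assumed for this example only.
+ *
+ * static int start_vhost_target(const char *my_socket_path,
+ *                               uint64_t MY_FEATURES,
+ *                               const struct vhost_device_ops *my_ops)
+ * {
+ *     if (rte_vhost_driver_register(my_socket_path, 0) != 0)
+ *         return -1;
+ *     if (rte_vhost_driver_set_features(my_socket_path, MY_FEATURES) != 0 ||
+ *         rte_vhost_driver_callback_register(my_socket_path, my_ops) != 0 ||
+ *         rte_vhost_driver_start(my_socket_path) != 0) {
+ *         rte_vhost_driver_unregister(my_socket_path);
+ *         return -1;
+ *     }
+ *     return 0;
+ * }
+ */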
+
+/**
+ * Get the MTU value of the device if set in QEMU.
+ *
+ * @param vid
+ * virtio-net device ID
+ * @param mtu
+ * The variable to store the MTU value
+ *
+ * @return
+ * 0: success
+ * -EAGAIN: device not yet started
+ * -ENOTSUP: device does not support MTU feature
+ */
+int rte_vhost_get_mtu(int vid, uint16_t *mtu);
+
+/**
+ * Get the numa node from which the virtio net device's memory
+ * is allocated.
+ *
+ * @param vid
+ * vhost device ID
+ *
+ * @return
+ * The numa node, -1 on failure
+ */
+int rte_vhost_get_numa_node(int vid);
+
+/**
+ * Get the virtio net device's ifname, which is the vhost-user socket
+ * file path.
+ *
+ * @param vid
+ * vhost device ID
+ * @param buf
+ * The buffer to store the queried ifname
+ * @param len
+ * The length of buf
+ *
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_ifname(int vid, char *buf, size_t len);
+
+/**
+ * Get how many avail entries are left in the queue
+ *
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index
+ *
+ * @return
+ * num of avail entries left
+ */
+uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id);
+
+struct rte_mbuf;
+struct rte_mempool;
+/**
+ * This function adds buffers to the virtio device's RX virtqueue. Buffers can
+ * be received from the physical port or from another virtual device. A packet
+ * count is returned to indicate the number of packets that were successfully
+ * added to the RX queue.
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index in mq case
+ * @param pkts
+ * array to contain packets to be enqueued
+ * @param count
+ * packets num to be enqueued
+ * @return
+ * num of packets enqueued
+ */
+uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count);
+
+/**
+ * This function gets guest buffers from the virtio device's TX virtqueue,
+ * constructs host mbufs, copies the guest buffer content into them, and
+ * stores them in pkts to be processed.
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * virtio queue index in mq case
+ * @param mbuf_pool
+ * mbuf_pool where host mbuf is allocated.
+ * @param pkts
+ * array to contain packets to be dequeued
+ * @param count
+ * packets num to be dequeued
+ * @return
+ * num of packets dequeued
+ */
+uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
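+
+/*
+ * Illustrative sketch (a minimal loopback, assuming copy mode, queue index 1
+ * as the guest TX ring and 0 as the guest RX ring, a caller-provided mbuf
+ * pool, and <rte_mbuf.h> included for rte_pktmbuf_free()): packets dequeued
+ * from the guest are enqueued straight back to it; since the enqueue path
+ * copies the data, the application frees all dequeued mbufs afterwards.
+ *
+ * static void loopback_once(int vid, struct rte_mempool *pool)
+ * {
+ *     struct rte_mbuf *pkts[32];
+ *     uint16_t nb_rx, i;
+ *
+ *     nb_rx = rte_vhost_dequeue_burst(vid, 1, pool, pkts, 32);
+ *     (void)rte_vhost_enqueue_burst(vid, 0, pkts, nb_rx);
+ *     for (i = 0; i < nb_rx; i++)
+ *         rte_pktmbuf_free(pkts[i]);
+ * }
+ */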
+
+/**
+ * Get guest mem table: a list of memory regions.
+ *
+ * An rte_vhost_memory object will be allocated internally to hold the
+ * guest memory regions. The application should free it in the
+ * destroy_device() callback.
+ *
+ * @param vid
+ * vhost device ID
+ * @param mem
+ * To store the returned mem regions
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
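+
+/*
+ * Illustrative sketch (assuming <stdio.h> and <inttypes.h> are included):
+ * dump the guest memory layout, e.g. from a new_device() callback. The
+ * table is heap-allocated by the library and released with free() once it
+ * is no longer needed.
+ *
+ * static void dump_mem_table(int vid)
+ * {
+ *     struct rte_vhost_memory *mem;
+ *     uint32_t i;
+ *
+ *     if (rte_vhost_get_mem_table(vid, &mem) != 0)
+ *         return;
+ *     for (i = 0; i < mem->nregions; i++)
+ *         printf("region %u: GPA 0x%" PRIx64 " size 0x%" PRIx64 "\n", i,
+ *                mem->regions[i].guest_phys_addr, mem->regions[i].size);
+ *     free(mem);
+ * }
+ */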
+
+/**
+ * Get guest vring info, including the vring address, vring size, etc.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param vring
+ * the structure to hold the requested vring info
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+ struct rte_vhost_vring *vring);
+
+/**
+ * Set the ids of the last descriptors in the avail and used guest vrings.
+ *
+ * In case the user application operates directly on the buffers, it should
+ * use this function on device destruction so that the same values can later
+ * be retrieved on device creation via
+ * rte_vhost_get_vhost_vring(int, uint16_t, struct rte_vhost_vring *).
+ *
+ * @param vid
+ * vhost device ID
+ * @param queue_id
+ * vring index
+ * @param last_avail_idx
+ * id of the last descriptor in avail ring to be set
+ * @param last_used_idx
+ * id of the last descriptor in used ring to be set
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_set_vring_base(int vid, uint16_t queue_id,
+ uint16_t last_avail_idx, uint16_t last_used_idx);
+
+int rte_vhost_get_vring_base(int vid, uint16_t queue_id,
+ uint16_t *last_avail_idx, uint16_t *last_used_idx);
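+
+/*
+ * Illustrative sketch: an application that processes descriptors directly
+ * can save the ring positions when a device goes away and restore them when
+ * it comes back. The "saved_avail"/"saved_used" variables are placeholders
+ * assumed for this example.
+ *
+ * // in the destroy_device() callback
+ * rte_vhost_get_vring_base(vid, 0, &saved_avail, &saved_used);
+ *
+ * // in the new_device() callback, after reconnect
+ * rte_vhost_set_vring_base(vid, 0, saved_avail, saved_used);
+ */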
+
+/**
+ * Notify the guest that used descriptors have been added to the vring.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @return
+ * 0 on success, -1 on failure
+ */
+int rte_vhost_vring_call(int vid, uint16_t vring_idx);
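+
+/*
+ * Illustrative sketch: a backend that fills the used ring directly would
+ * typically update used->idx, optionally log the write for live migration,
+ * and then interrupt the guest:
+ *
+ * rte_vhost_log_used_vring(vid, vring_idx, offset, len);
+ * rte_vhost_vring_call(vid, vring_idx);
+ */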
+
+/**
+ * Get guest inflight vring info, including inflight ring and resubmit list.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param vring
+ * the structure to hold the requested inflight vring info
+ * @return
+ * 0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
+ struct rte_vhost_ring_inflight *vring);
+
+/**
+ * Set split inflight descriptor.
+ *
+ * This function saves descriptors that have been consumed from the
+ * available ring.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param idx
+ * inflight entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
+ uint16_t idx);
+
+/**
+ * Save the head of the list of the last batch of used descriptors.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param idx
+ * descriptor entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_set_last_inflight_io_split(int vid,
+ uint16_t vring_idx, uint16_t idx);
+
+/**
+ * Clear the split inflight status.
+ *
+ * @param vid
+ * vhost device ID
+ * @param vring_idx
+ * vring index
+ * @param last_used_idx
+ * last used idx of used ring
+ * @param idx
+ * inflight entry index
+ * @return
+ * 0 on success, -1 on failure
+ */
+__rte_experimental
+int
+rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
+ uint16_t last_used_idx, uint16_t idx);
+
+#endif /* _RTE_VHOST_H_ */
diff --git a/src/spdk/lib/rte_vhost/socket.c b/src/spdk/lib/rte_vhost/socket.c
new file mode 100644
index 000000000..ec923518b
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/socket.c
@@ -0,0 +1,841 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/queue.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <rte_log.h>
+
+#include "fd_man.h"
+#include "vhost.h"
+#include "vhost_user.h"
+
+
+TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);
+
+/*
+ * Every time rte_vhost_driver_register() is invoked, an associated
+ * vhost_user_socket struct will be created.
+ */
+struct vhost_user_socket {
+ struct vhost_user_connection_list conn_list;
+ pthread_mutex_t conn_mutex;
+ char *path;
+ int socket_fd;
+ struct sockaddr_un un;
+ bool is_server;
+ bool reconnect;
+ bool dequeue_zero_copy;
+
+ /*
+ * The "supported_features" indicates the feature bits the
+ * vhost driver supports. The "features" indicates the feature
+ * bits after the rte_vhost_driver_features_disable/enable().
+ * It is also the final feature bits used for vhost-user
+ * features negotiation.
+ */
+ uint64_t supported_features;
+ uint64_t features;
+
+ struct vhost_device_ops const *notify_ops;
+};
+
+struct vhost_user_connection {
+ struct vhost_user_socket *vsocket;
+ int connfd;
+ int vid;
+
+ TAILQ_ENTRY(vhost_user_connection) next;
+};
+
+#define MAX_VHOST_SOCKET 1024
+struct vhost_user {
+ struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
+ struct fdset fdset;
+ int vsocket_cnt;
+ pthread_mutex_t mutex;
+};
+
+#define MAX_VIRTIO_BACKLOG 128
+
+static void vhost_user_server_new_connection(int fd, void *data, int *remove);
+static void vhost_user_read_cb(int fd, void *dat, int *remove);
+static int create_unix_socket(struct vhost_user_socket *vsocket);
+static int vhost_user_start_client(struct vhost_user_socket *vsocket);
+
+static struct vhost_user vhost_user = {
+ .fdset = {
+ .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
+ .fd_mutex = PTHREAD_MUTEX_INITIALIZER,
+ .num = 0
+ },
+ .vsocket_cnt = 0,
+ .mutex = PTHREAD_MUTEX_INITIALIZER,
+};
+
+/*
+ * Read a message from a UNIX domain socket, collecting up to fd_num file
+ * descriptors passed as SCM_RIGHTS ancillary data into fds. Returns the
+ * number of bytes read on success, 0 if the peer closed the connection, or
+ * a negative value on failure.
+ */
+int
+read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ int ret;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ ret = recvmsg(sockfd, &msgh, 0);
+ if (ret <= 0) {
+ if (ret)
+ RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed, %s\n", strerror(errno));
+ else
+ RTE_LOG(INFO, VHOST_CONFIG, "peer closed\n");
+ return ret;
+ }
+
+ if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+ RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n");
+ return -1;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(fds, CMSG_DATA(cmsg), fdsize);
+ break;
+ }
+ }
+
+ return ret;
+}
+
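+/*
+ * Send a message over a UNIX domain socket, optionally attaching up to
+ * fd_num file descriptors as SCM_RIGHTS ancillary data. Retries on EINTR
+ * and returns the number of bytes sent, or a negative value on failure.
+ */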
+int
+send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
+{
+
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ int ret;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+
+ if (fds && fd_num > 0) {
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ if (cmsg == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n");
+ errno = EINVAL;
+ return -1;
+ }
+ cmsg->cmsg_len = CMSG_LEN(fdsize);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fdsize);
+ } else {
+ msgh.msg_control = NULL;
+ msgh.msg_controllen = 0;
+ }
+
+ do {
+ ret = sendmsg(sockfd, &msgh, 0);
+ } while (ret < 0 && errno == EINTR);
+
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n");
+ return ret;
+ }
+
+ return ret;
+}
+
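+/*
+ * Create a new vhost device for an established connection and register its
+ * fd in the global fdset so that vhost-user messages get dispatched to
+ * vhost_user_read_cb(). On any failure the connection is closed.
+ */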
+static void
+vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
+{
+ int vid;
+ size_t size;
+ struct vhost_user_connection *conn;
+ int ret;
+
+ conn = malloc(sizeof(*conn));
+ if (conn == NULL) {
+ close(fd);
+ return;
+ }
+
+ vid = vhost_new_device(vsocket->features, vsocket->notify_ops);
+ if (vid == -1) {
+ goto err;
+ }
+
+ size = strnlen(vsocket->path, PATH_MAX);
+ vhost_set_ifname(vid, vsocket->path, size);
+
+ if (vsocket->dequeue_zero_copy)
+ vhost_enable_dequeue_zero_copy(vid);
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid);
+
+ if (vsocket->notify_ops->new_connection) {
+ ret = vsocket->notify_ops->new_connection(vid);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add vhost user connection with fd %d\n",
+ fd);
+ goto err;
+ }
+ }
+
+ conn->connfd = fd;
+ conn->vsocket = vsocket;
+ conn->vid = vid;
+ ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
+ NULL, conn);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add fd %d into vhost server fdset\n",
+ fd);
+
+ if (vsocket->notify_ops->destroy_connection)
+ vsocket->notify_ops->destroy_connection(conn->vid);
+
+ goto err;
+ }
+
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+ return;
+
+err:
+ free(conn);
+ close(fd);
+}
+
+/* call back when there is new vhost-user connection from client */
+static void
+vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
+{
+ struct vhost_user_socket *vsocket = dat;
+
+ fd = accept(fd, NULL, NULL);
+ if (fd < 0)
+ return;
+
+ RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd);
+ vhost_user_add_connection(fd, vsocket);
+}
+
+static void
+vhost_user_read_cb(int connfd, void *dat, int *remove)
+{
+ struct vhost_user_connection *conn = dat;
+ struct vhost_user_socket *vsocket = conn->vsocket;
+ int ret;
+
+ ret = vhost_user_msg_handler(conn->vid, connfd);
+ if (ret < 0) {
+ *remove = 1;
+ vhost_destroy_device(conn->vid);
+
+ if (vsocket->notify_ops->destroy_connection)
+ vsocket->notify_ops->destroy_connection(conn->vid);
+
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ TAILQ_REMOVE(&vsocket->conn_list, conn, next);
+ if (conn->connfd != -1) {
+ close(conn->connfd);
+ conn->connfd = -1;
+ }
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+
+ free(conn);
+
+ if (vsocket->reconnect) {
+ create_unix_socket(vsocket);
+ vhost_user_start_client(vsocket);
+ }
+ }
+}
+
+static int
+create_unix_socket(struct vhost_user_socket *vsocket)
+{
+ int fd;
+ struct sockaddr_un *un = &vsocket->un;
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0)
+ return -1;
+ RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n",
+ vsocket->is_server ? "server" : "client", fd);
+
+ if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost-user: can't set nonblocking mode for socket, fd: "
+ "%d (%s)\n", fd, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ memset(un, 0, sizeof(*un));
+ un->sun_family = AF_UNIX;
+ strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
+ un->sun_path[sizeof(un->sun_path) - 1] = '\0';
+
+ vsocket->socket_fd = fd;
+ return 0;
+}
+
+static int
+vhost_user_start_server(struct vhost_user_socket *vsocket)
+{
+ int ret;
+ int fd = vsocket->socket_fd;
+ const char *path = vsocket->path;
+
+ ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to bind to %s: %s; remove it and try again\n",
+ path, strerror(errno));
+ goto err;
+ }
+ RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path);
+
+ ret = listen(fd, MAX_VIRTIO_BACKLOG);
+ if (ret < 0)
+ goto err;
+
+ ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
+ NULL, vsocket);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to add listen fd %d to vhost server fdset\n",
+ fd);
+ goto err;
+ }
+
+ return 0;
+
+err:
+ close(fd);
+ return -1;
+}
+
+struct vhost_user_reconnect {
+ struct sockaddr_un un;
+ int fd;
+ struct vhost_user_socket *vsocket;
+
+ TAILQ_ENTRY(vhost_user_reconnect) next;
+};
+
+TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
+struct vhost_user_reconnect_list {
+ struct vhost_user_reconnect_tailq_list head;
+ pthread_mutex_t mutex;
+};
+
+static struct vhost_user_reconnect_list reconn_list;
+static pthread_t reconn_tid;
+
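+/*
+ * Attempt to connect; on success the socket is switched back to blocking
+ * mode. Returns 0 on success, -1 if the connect failed and may be retried,
+ * or -2 on an unrecoverable error.
+ */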
+static int
+vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
+{
+ int ret, flags;
+
+ ret = connect(fd, un, sz);
+ if (ret < 0 && errno != EISCONN)
+ return -1;
+
+ flags = fcntl(fd, F_GETFL, 0);
+ if (flags < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "can't get flags for connfd %d\n", fd);
+ return -2;
+ }
+ if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "can't disable nonblocking on fd %d\n", fd);
+ return -2;
+ }
+ return 0;
+}
+
+static void *
+vhost_user_client_reconnect(void *arg __rte_unused)
+{
+ int ret;
+ struct vhost_user_reconnect *reconn, *next;
+
+ while (1) {
+ pthread_mutex_lock(&reconn_list.mutex);
+
+ /*
+ * An equivalent of TAILQ_FOREACH_SAFE,
+ * which does not exist on all platforms.
+ */
+ for (reconn = TAILQ_FIRST(&reconn_list.head);
+ reconn != NULL; reconn = next) {
+ next = TAILQ_NEXT(reconn, next);
+
+ ret = vhost_user_connect_nonblock(reconn->fd,
+ (struct sockaddr *)&reconn->un,
+ sizeof(reconn->un));
+ if (ret == -2) {
+ close(reconn->fd);
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "reconnection for fd %d failed\n",
+ reconn->fd);
+ goto remove_fd;
+ }
+ if (ret == -1)
+ continue;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "%s: connected\n", reconn->vsocket->path);
+ vhost_user_add_connection(reconn->fd, reconn->vsocket);
+remove_fd:
+ TAILQ_REMOVE(&reconn_list.head, reconn, next);
+ free(reconn);
+ }
+
+ pthread_mutex_unlock(&reconn_list.mutex);
+ sleep(1);
+ }
+
+ return NULL;
+}
+
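+/*
+ * Spawn the background thread that periodically retries the client-mode
+ * connections queued on reconn_list.
+ */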
+static int
+vhost_user_reconnect_init(void)
+{
+ int ret;
+
+ pthread_mutex_init(&reconn_list.mutex, NULL);
+ TAILQ_INIT(&reconn_list.head);
+
+ ret = pthread_create(&reconn_tid, NULL,
+ vhost_user_client_reconnect, NULL);
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread");
+
+ return ret;
+}
+
+static int
+vhost_user_start_client(struct vhost_user_socket *vsocket)
+{
+ int ret;
+ int fd = vsocket->socket_fd;
+ const char *path = vsocket->path;
+ struct vhost_user_reconnect *reconn;
+
+ ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
+ sizeof(vsocket->un));
+ if (ret == 0) {
+ vhost_user_add_connection(fd, vsocket);
+ return 0;
+ }
+
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "failed to connect to %s: %s\n",
+ path, strerror(errno));
+
+ if (ret == -2 || !vsocket->reconnect) {
+ close(fd);
+ return -1;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path);
+ reconn = malloc(sizeof(*reconn));
+ if (reconn == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for reconnect\n");
+ close(fd);
+ return -1;
+ }
+ reconn->un = vsocket->un;
+ reconn->fd = fd;
+ reconn->vsocket = vsocket;
+ pthread_mutex_lock(&reconn_list.mutex);
+ TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
+ pthread_mutex_unlock(&reconn_list.mutex);
+
+ return 0;
+}
+
+static struct vhost_user_socket *
+find_vhost_user_socket(const char *path)
+{
+ int i;
+
+ for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+ struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+ if (!strcmp(vsocket->path, path))
+ return vsocket;
+ }
+
+ return NULL;
+}
+
+int
+rte_vhost_driver_disable_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ vsocket->features &= ~features;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_enable_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket) {
+ if ((vsocket->supported_features & features) != features) {
+ /*
+ * trying to enable features the driver doesn't
+ * support.
+ */
+ pthread_mutex_unlock(&vhost_user.mutex);
+ return -1;
+ }
+ vsocket->features |= features;
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_set_features(const char *path, uint64_t features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket) {
+ vsocket->supported_features = features;
+ vsocket->features = features;
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+int
+rte_vhost_driver_get_features(const char *path, uint64_t *features)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ *features = vsocket->features;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ if (!vsocket) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "socket file %s is not registered yet.\n", path);
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Register a new vhost-user socket; here we could act as the server
+ * (the default case), or as the client (when the RTE_VHOST_USER_CLIENT
+ * flag is set).
+ */
+int
+rte_vhost_driver_register(const char *path, uint64_t flags)
+{
+ int ret = -1;
+ struct vhost_user_socket *vsocket;
+
+ if (!path)
+ return -1;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+
+ if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "error: the number of vhost sockets reaches maximum\n");
+ goto out;
+ }
+
+ vsocket = malloc(sizeof(struct vhost_user_socket));
+ if (!vsocket)
+ goto out;
+ memset(vsocket, 0, sizeof(struct vhost_user_socket));
+ vsocket->path = strdup(path);
+ if (!vsocket->path) {
+ free(vsocket);
+ goto out;
+ }
+ TAILQ_INIT(&vsocket->conn_list);
+ vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
+
+ /*
+ * Set the supported features correctly for the builtin vhost-user
+ * net driver.
+ *
+ * Applications know nothing about the features the builtin virtio net
+ * driver (virtio_net.c) supports, thus it's not possible for them
+ * to invoke rte_vhost_driver_set_features(). To work around this, we
+ * set it unconditionally here. If the application wants to implement
+ * another vhost-user driver (say SCSI), it should call
+ * rte_vhost_driver_set_features(), which will overwrite the following
+ * two values.
+ */
+ vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
+ vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES;
+
+ if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
+ vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
+ if (vsocket->reconnect && reconn_tid == 0) {
+ if (vhost_user_reconnect_init() < 0) {
+ free(vsocket->path);
+ free(vsocket);
+ goto out;
+ }
+ }
+ } else {
+ vsocket->is_server = true;
+ }
+ ret = create_unix_socket(vsocket);
+ if (ret < 0) {
+ free(vsocket->path);
+ free(vsocket);
+ goto out;
+ }
+
+ pthread_mutex_init(&vsocket->conn_mutex, NULL);
+ vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
+
+out:
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return ret;
+}
+
+static bool
+vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
+{
+ int found = false;
+ struct vhost_user_reconnect *reconn, *next;
+
+ pthread_mutex_lock(&reconn_list.mutex);
+
+ for (reconn = TAILQ_FIRST(&reconn_list.head);
+ reconn != NULL; reconn = next) {
+ next = TAILQ_NEXT(reconn, next);
+
+ if (reconn->vsocket == vsocket) {
+ TAILQ_REMOVE(&reconn_list.head, reconn, next);
+ close(reconn->fd);
+ free(reconn);
+ found = true;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&reconn_list.mutex);
+ return found;
+}
+
+/**
+ * Unregister the specified vhost socket
+ */
+int
+rte_vhost_driver_unregister(const char *path)
+{
+ int i;
+ int count;
+ struct vhost_user_connection *conn;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+
+ for (i = 0; i < vhost_user.vsocket_cnt; i++) {
+ struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
+
+ if (!strcmp(vsocket->path, path)) {
+ if (vsocket->is_server) {
+ fdset_del(&vhost_user.fdset, vsocket->socket_fd);
+ close(vsocket->socket_fd);
+ unlink(path);
+ } else if (vsocket->reconnect) {
+ vhost_user_remove_reconnect(vsocket);
+ }
+
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ TAILQ_FOREACH(conn, &vsocket->conn_list, next) {
+ close(conn->connfd);
+ conn->connfd = -1;
+ }
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+
+ do {
+ pthread_mutex_lock(&vsocket->conn_mutex);
+ conn = TAILQ_FIRST(&vsocket->conn_list);
+ pthread_mutex_unlock(&vsocket->conn_mutex);
+ } while (conn != NULL);
+
+ free(vsocket->path);
+ free(vsocket);
+
+ count = --vhost_user.vsocket_cnt;
+ vhost_user.vsockets[i] = vhost_user.vsockets[count];
+ vhost_user.vsockets[count] = NULL;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return 0;
+ }
+ }
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return -1;
+}
+
+/*
+ * Register ops so that we can add/remove device to data core.
+ */
+int
+rte_vhost_driver_callback_register(const char *path,
+ struct vhost_device_ops const * const ops)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ if (vsocket)
+ vsocket->notify_ops = ops;
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? 0 : -1;
+}
+
+struct vhost_device_ops const *
+vhost_driver_callback_get(const char *path)
+{
+ struct vhost_user_socket *vsocket;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ return vsocket ? vsocket->notify_ops : NULL;
+}
+
+int
+rte_vhost_driver_start(const char *path)
+{
+ struct vhost_user_socket *vsocket;
+ static pthread_t fdset_tid;
+
+ pthread_mutex_lock(&vhost_user.mutex);
+ vsocket = find_vhost_user_socket(path);
+ pthread_mutex_unlock(&vhost_user.mutex);
+
+ if (!vsocket)
+ return -1;
+
+ if (fdset_tid == 0) {
+ rte_cpuset_t orig_cpuset;
+ rte_cpuset_t tmp_cpuset;
+ long num_cores, i;
+ int ret;
+
+ CPU_ZERO(&tmp_cpuset);
+ num_cores = sysconf(_SC_NPROCESSORS_CONF);
+ /* Create a mask containing all CPUs */
+ for (i = 0; i < num_cores; i++) {
+ CPU_SET(i, &tmp_cpuset);
+ }
+
+ rte_thread_get_affinity(&orig_cpuset);
+ rte_thread_set_affinity(&tmp_cpuset);
+ ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch,
+ &vhost_user.fdset);
+ rte_thread_set_affinity(&orig_cpuset);
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to create fdset handling thread");
+ }
+
+ if (vsocket->is_server)
+ return vhost_user_start_server(vsocket);
+ else
+ return vhost_user_start_client(vsocket);
+}
diff --git a/src/spdk/lib/rte_vhost/vhost.c b/src/spdk/lib/rte_vhost/vhost.c
new file mode 100644
index 000000000..8e875c585
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/vhost.c
@@ -0,0 +1,565 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_vhost.h>
+
+#include "vhost.h"
+
+struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+struct virtio_net *
+get_device(int vid)
+{
+ struct virtio_net *dev = vhost_devices[vid];
+
+ if (unlikely(!dev)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) device not found.\n", vid);
+ }
+
+ return dev;
+}
+
+static void
+cleanup_vq(struct vhost_virtqueue *vq, int destroy)
+{
+ if ((vq->callfd >= 0) && (destroy != 0))
+ close(vq->callfd);
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+}
+
+/*
+ * Unmap any memory, close any file descriptors and
+ * free any memory owned by a device.
+ */
+void
+cleanup_device(struct virtio_net *dev, int destroy)
+{
+ uint32_t i;
+
+ vhost_backend_cleanup(dev);
+
+ for (i = 0; i < dev->nr_vring; i++)
+ cleanup_vq(dev->virtqueue[i], destroy);
+}
+
+/*
+ * Release virtqueues and device memory.
+ */
+static void
+free_device(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct vhost_virtqueue *vq;
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ vq = dev->virtqueue[i];
+
+ rte_free(vq->shadow_used_ring);
+
+ rte_free(vq);
+ }
+
+ rte_free(dev);
+}
+
+static void
+init_vring_queue(struct vhost_virtqueue *vq)
+{
+ memset(vq, 0, sizeof(struct vhost_virtqueue));
+
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+ vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ /* Backends are set to -1 indicating an inactive device. */
+ vq->backend = -1;
+
+ /*
+ * Always set the vq to enabled; this is to keep compatibility
+ * with old QEMU versions, which have no SET_VRING_ENABLE message.
+ */
+ vq->enabled = 1;
+
+ TAILQ_INIT(&vq->zmbuf_list);
+}
+
+static void
+reset_vring_queue(struct vhost_virtqueue *vq)
+{
+ int callfd;
+
+ callfd = vq->callfd;
+ init_vring_queue(vq);
+ vq->callfd = callfd;
+}
+
+int
+alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
+{
+ struct vhost_virtqueue *vq;
+
+ vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0);
+ if (vq == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory for vring:%u.\n", vring_idx);
+ return -1;
+ }
+
+ dev->virtqueue[vring_idx] = vq;
+ init_vring_queue(vq);
+
+ dev->nr_vring += 1;
+
+ return 0;
+}
+
+/*
+ * Reset some variables in the device structure, while keeping a few
+ * others untouched, such as vid, ifname and nr_vring: they
+ * should remain the same unless the device is removed.
+ */
+void
+reset_device(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ dev->negotiated_features = 0;
+ dev->protocol_features = 0;
+ dev->flags = 0;
+
+ for (i = 0; i < dev->nr_vring; i++)
+ reset_vring_queue(dev->virtqueue[i]);
+}
+
+/*
+ * Invoked when there is a new vhost-user connection established (when
+ * there is a new virtio device being attached).
+ */
+int
+vhost_new_device(uint64_t features, struct vhost_device_ops const *ops)
+{
+ struct virtio_net *dev;
+ int i;
+
+ dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0);
+ if (dev == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to allocate memory for new dev.\n");
+ return -1;
+ }
+
+ for (i = 0; i < MAX_VHOST_DEVICE; i++) {
+ if (vhost_devices[i] == NULL)
+ break;
+ }
+ if (i == MAX_VHOST_DEVICE) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Failed to find a free slot for new device.\n");
+ rte_free(dev);
+ return -1;
+ }
+
+ vhost_devices[i] = dev;
+ dev->vid = i;
+ dev->features = features;
+ dev->notify_ops = ops;
+
+ return i;
+}
+
+/*
+ * Invoked when the vhost-user connection is broken (when
+ * the virtio device is being detached).
+ */
+void
+vhost_destroy_device(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(vid);
+ }
+
+ cleanup_device(dev, 1);
+ free_device(dev);
+
+ vhost_devices[vid] = NULL;
+}
+
+void
+vhost_set_ifname(int vid, const char *if_name, unsigned int if_len)
+{
+ struct virtio_net *dev;
+ unsigned int len;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return;
+
+ len = if_len > sizeof(dev->ifname) ?
+ sizeof(dev->ifname) : if_len;
+
+ strncpy(dev->ifname, if_name, len);
+ dev->ifname[sizeof(dev->ifname) - 1] = '\0';
+}
+
+void
+vhost_enable_dequeue_zero_copy(int vid)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ dev->dequeue_zero_copy = 1;
+}
+
+int
+rte_vhost_get_mtu(int vid, uint16_t *mtu)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (!dev)
+ return -ENODEV;
+
+ if (!(dev->flags & VIRTIO_DEV_READY))
+ return -EAGAIN;
+
+ if (!(dev->negotiated_features & VIRTIO_NET_F_MTU))
+ return -ENOTSUP;
+
+ *mtu = dev->mtu;
+
+ return 0;
+}
+
+int
+rte_vhost_get_numa_node(int vid)
+{
+#ifdef RTE_LIBRTE_VHOST_NUMA
+ struct virtio_net *dev = get_device(vid);
+ int numa_node;
+ int ret;
+
+ if (dev == NULL)
+ return -1;
+
+ ret = get_mempolicy(&numa_node, NULL, 0, dev,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to query numa node: %d\n", vid, ret);
+ return -1;
+ }
+
+ return numa_node;
+#else
+ RTE_SET_USED(vid);
+ return -1;
+#endif
+}
+
+int
+rte_vhost_get_ifname(int vid, char *buf, size_t len)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+
+ len = RTE_MIN(len, sizeof(dev->ifname));
+
+ strncpy(buf, dev->ifname, len);
+ buf[len - 1] = '\0';
+
+ return 0;
+}
+
+int
+rte_vhost_get_negotiated_features(int vid, uint64_t *features)
+{
+ struct virtio_net *dev;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ *features = dev->negotiated_features;
+ return 0;
+}
+
+int
+rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem)
+{
+ struct virtio_net *dev;
+ struct rte_vhost_memory *m;
+ size_t size;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region);
+ m = malloc(sizeof(struct rte_vhost_memory) + size);
+ if (!m)
+ return -1;
+
+ m->nregions = dev->mem->nregions;
+ memcpy(m->regions, dev->mem->regions, size);
+ *mem = m;
+
+ return 0;
+}
+
+int
+rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx,
+ struct rte_vhost_vring *vring)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ vring->desc = vq->desc;
+ vring->avail = vq->avail;
+ vring->used = vq->used;
+ vring->log_guest_addr = vq->log_guest_addr;
+
+ vring->callfd = vq->callfd;
+ vring->kickfd = vq->kickfd;
+ vring->size = vq->size;
+
+ return 0;
+}
+
+uint16_t
+rte_vhost_avail_entries(int vid, uint16_t queue_id)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ vq = dev->virtqueue[queue_id];
+ if (!vq->enabled)
+ return 0;
+
+ return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx;
+}
+
+int
+rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return -1;
+
+ if (enable) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "guest notification isn't supported.\n");
+ return -1;
+ }
+
+ dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY;
+ return 0;
+}
+
+void
+rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
+{
+ struct virtio_net *dev = get_device(vid);
+
+ if (dev == NULL)
+ return;
+
+ vhost_log_write(dev, addr, len);
+}
+
+void
+rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
+ uint64_t offset, uint64_t len)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return;
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return;
+
+ vhost_log_used_vring(dev, vq, offset, len);
+}
+
+int
+rte_vhost_set_vring_base(int vid, uint16_t vring_idx,
+ uint16_t last_avail_idx, uint16_t last_used_idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ vq->last_avail_idx = last_avail_idx;
+ vq->last_used_idx = last_used_idx;
+
+ return 0;
+}
+
+int
+rte_vhost_get_vring_base(int vid, uint16_t vring_idx,
+ uint16_t *last_avail_idx, uint16_t *last_used_idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if (!dev)
+ return -1;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ *last_avail_idx = vq->last_avail_idx;
+ *last_used_idx = vq->last_used_idx;
+
+ return 0;
+}
+
+int
+rte_vhost_vring_call(int vid, uint16_t vring_idx)
+{
+ struct virtio_net *dev;
+ struct vhost_virtqueue *vq;
+
+ dev = get_device(vid);
+ if(!dev)
+ return -1;
+
+ if (vring_idx >= VHOST_MAX_VRING)
+ return -1;
+
+ vq = dev->virtqueue[vring_idx];
+ if (!vq)
+ return -1;
+
+ /* Ensure all our used ring changes are visible to the guest at the time
+ * of interrupt.
+ * TODO: this is currently an sfence on x86. For other architectures we
+ * will most likely need an smp_mb(), but smp_mb() is an overkill for x86.
+ */
+ rte_wmb();
+
+ if (vq->callfd != -1) {
+ eventfd_write(vq->callfd, (eventfd_t)1);
+ return 0;
+ }
+
+ return -1;
+}
+
+int
+rte_vhost_set_last_inflight_io_split(int vid, uint16_t vring_idx,
+ uint16_t idx)
+{
+ return 0;
+}
+
+int
+rte_vhost_clr_inflight_desc_split(int vid, uint16_t vring_idx,
+ uint16_t last_used_idx, uint16_t idx)
+{
+ return 0;
+}
+
+int
+rte_vhost_set_inflight_desc_split(int vid, uint16_t vring_idx,
+ uint16_t idx)
+{
+ return 0;
+}
+
+int
+rte_vhost_get_vhost_ring_inflight(int vid, uint16_t vring_idx,
+ struct rte_vhost_ring_inflight *vring)
+{
+ return 0;
+}
diff --git a/src/spdk/lib/rte_vhost/vhost.h b/src/spdk/lib/rte_vhost/vhost.h
new file mode 100644
index 000000000..d738dba7f
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/vhost.h
@@ -0,0 +1,330 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_CDEV_H_
+#define _VHOST_NET_CDEV_H_
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <unistd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+#include <sys/socket.h>
+#include <linux/if.h>
+
+#include <rte_log.h>
+#include <rte_ether.h>
+
+#include "rte_vhost.h"
+#include "vhost_user.h"
+
+/* Used to indicate that the device is running on a data core */
+#define VIRTIO_DEV_RUNNING 1
+/* Used to indicate that the device is ready to operate */
+#define VIRTIO_DEV_READY 2
+
+/* Backend value set by guest. */
+#define VIRTIO_DEV_STOPPED -1
+
+#define BUF_VECTOR_MAX 256
+
+/**
+ * Structure contains buffer address, length and descriptor index
+ * from vring to do scatter RX.
+ */
+struct buf_vector {
+ uint64_t buf_addr;
+ uint32_t buf_len;
+ uint32_t desc_idx;
+};
+
+/*
+ * A structure to hold some fields needed in zero copy code path,
+ * mainly for associating an mbuf with the right desc_idx.
+ */
+struct zcopy_mbuf {
+ struct rte_mbuf *mbuf;
+ uint32_t desc_idx;
+ uint16_t in_use;
+
+ TAILQ_ENTRY(zcopy_mbuf) next;
+};
+TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf);
+
+/**
+ * Structure contains variables relevant to RX/TX virtqueues.
+ */
+struct vhost_virtqueue {
+ struct vring_desc *desc;
+ struct vring_avail *avail;
+ struct vring_used *used;
+ uint32_t size;
+
+ uint16_t last_avail_idx;
+ uint16_t last_used_idx;
+#define VIRTIO_INVALID_EVENTFD (-1)
+#define VIRTIO_UNINITIALIZED_EVENTFD (-2)
+
+ /* Backend value to determine if the device should be started/stopped */
+ int backend;
+ /* Used to notify the guest (trigger interrupt) */
+ int callfd;
+ /* Currently unused as polling mode is enabled */
+ int kickfd;
+ int enabled;
+
+ /* Physical address of used ring, for logging */
+ uint64_t log_guest_addr;
+
+ uint16_t nr_zmbuf;
+ uint16_t zmbuf_size;
+ uint16_t last_zmbuf_idx;
+ struct zcopy_mbuf *zmbufs;
+ struct zcopy_mbuf_list zmbuf_list;
+
+ struct vring_used_elem *shadow_used_ring;
+ uint16_t shadow_used_idx;
+} __rte_cache_aligned;
+
+/* Old kernels have no such macros defined */
+#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE
+ #define VIRTIO_NET_F_GUEST_ANNOUNCE 21
+#endif
+
+#ifndef VIRTIO_NET_F_MQ
+ #define VIRTIO_NET_F_MQ 22
+#endif
+
+#define VHOST_MAX_VRING 0x100
+#define VHOST_MAX_QUEUE_PAIRS 0x80
+
+#ifndef VIRTIO_NET_F_MTU
+ #define VIRTIO_NET_F_MTU 3
+#endif
+
+/*
+ * Define virtio 1.0 for older kernels
+ */
+#ifndef VIRTIO_F_VERSION_1
+ #define VIRTIO_F_VERSION_1 32
+#endif
+
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+
+/* Features supported by this builtin vhost-user net driver. */
+#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \
+ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+ (1ULL << VIRTIO_NET_F_CTRL_RX) | \
+ (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
+ (1ULL << VIRTIO_NET_F_MQ) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VHOST_F_LOG_ALL) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) | \
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) | \
+ (1ULL << VIRTIO_NET_F_CSUM) | \
+ (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
+ (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
+ (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
+ (1ULL << VIRTIO_NET_F_MTU))
+
+
+struct guest_page {
+ uint64_t guest_phys_addr;
+ uint64_t host_phys_addr;
+ uint64_t size;
+};
+
+/* struct ether_addr was renamed to struct rte_ether_addr at one point */
+#ifdef RTE_ETHER_ADDR_LEN
+struct ether_addr {
+ uint8_t addr_bytes[RTE_ETHER_ADDR_LEN];
+} __attribute__((__packed__));
+#endif
+
+/**
+ * Device structure contains all configuration information relating
+ * to the device.
+ */
+struct virtio_net {
+ /* Frontend (QEMU) memory and memory region information */
+ struct rte_vhost_memory *mem;
+ uint64_t features;
+ uint64_t negotiated_features;
+ uint64_t protocol_features;
+ int vid;
+ uint32_t is_nvme;
+ uint32_t flags;
+ uint16_t vhost_hlen;
+ /* to tell if we need to broadcast a RARP packet */
+ rte_atomic16_t broadcast_rarp;
+ uint32_t nr_vring;
+ int dequeue_zero_copy;
+ struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
+#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
+ char ifname[IF_NAME_SZ];
+ uint64_t log_size;
+ uint64_t log_base;
+ uint64_t log_addr;
+ struct ether_addr mac;
+ uint16_t mtu;
+
+ struct vhost_device_ops const *notify_ops;
+
+ uint32_t nr_guest_pages;
+ uint32_t max_guest_pages;
+ struct guest_page *guest_pages;
+ int has_new_mem_table;
+ void *bar_addr;
+ uint64_t bar_size;
+ struct VhostUserMemory mem_table;
+ int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS];
+} __rte_cache_aligned;
+
+
+#define VHOST_LOG_PAGE 4096
+
+static inline void __attribute__((always_inline))
+vhost_log_page(uint8_t *log_base, uint64_t page)
+{
+ log_base[page / 8] |= 1 << (page % 8);
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
+{
+ uint64_t page;
+
+ if (likely(((dev->negotiated_features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
+ !dev->log_base || !len))
+ return;
+
+ if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
+ return;
+
+ /* To make sure guest memory updates are committed before logging */
+ rte_smp_wmb();
+
+ page = addr / VHOST_LOG_PAGE;
+ while (page * VHOST_LOG_PAGE < addr + len) {
+ vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
+ page += 1;
+ }
+}
+
+static inline void __attribute__((always_inline))
+vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ uint64_t offset, uint64_t len)
+{
+ vhost_log_write(dev, vq->log_guest_addr + offset, len);
+}
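+
+/*
+ * Illustrative sketch (mirroring how a data path typically logs used-ring
+ * updates for live migration; "used_idx" is a placeholder for the slot just
+ * written):
+ *
+ * vhost_log_used_vring(dev, vq,
+ *     offsetof(struct vring_used, ring[used_idx]),
+ *     sizeof(vq->used->ring[used_idx]));
+ * vhost_log_used_vring(dev, vq,
+ *     offsetof(struct vring_used, idx),
+ *     sizeof(vq->used->idx));
+ */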
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
+#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+#define VHOST_MAX_PRINT_BUFF 6072
+#define VHOST_LOG_LEVEL RTE_LOG_DEBUG
+#define VHOST_LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args)
+#define PRINT_PACKET(device, addr, size, header) do { \
+ char *pkt_addr = (char *)(addr); \
+ unsigned int index; \
+ char packet[VHOST_MAX_PRINT_BUFF]; \
+ \
+ if ((header)) \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \
+ else \
+ snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \
+ for (index = 0; index < (size); index++) { \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \
+ "%02hhx ", pkt_addr[index]); \
+ } \
+ snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \
+ \
+ VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \
+} while (0)
+#else
+#define VHOST_LOG_LEVEL RTE_LOG_INFO
+#define VHOST_LOG_DEBUG(log_type, fmt, args...) do {} while (0)
+#define PRINT_PACKET(device, addr, size, header) do {} while (0)
+#endif
+
+extern uint64_t VHOST_FEATURES;
+#define MAX_VHOST_DEVICE 1024
+extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE];
+
+/* Convert guest physical address to host physical address */
+static inline phys_addr_t __attribute__((always_inline))
+gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
+{
+ uint32_t i;
+ struct guest_page *page;
+
+ for (i = 0; i < dev->nr_guest_pages; i++) {
+ page = &dev->guest_pages[i];
+
+ if (gpa >= page->guest_phys_addr &&
+ gpa + size < page->guest_phys_addr + page->size) {
+ return gpa - page->guest_phys_addr +
+ page->host_phys_addr;
+ }
+ }
+
+ return 0;
+}
+
+struct virtio_net *get_device(int vid);
+
+int vhost_new_device(uint64_t features, struct vhost_device_ops const *ops);
+void cleanup_device(struct virtio_net *dev, int destroy);
+void reset_device(struct virtio_net *dev);
+void vhost_destroy_device(int);
+
+int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx);
+
+void vhost_set_ifname(int, const char *if_name, unsigned int if_len);
+void vhost_enable_dequeue_zero_copy(int vid);
+
+struct vhost_device_ops const *vhost_driver_callback_get(const char *path);
+
+/*
+ * Backend-specific cleanup.
+ *
+ * TODO: fix it; we have one backend now
+ */
+void vhost_backend_cleanup(struct virtio_net *dev);
+
+#endif /* _VHOST_NET_CDEV_H_ */
diff --git a/src/spdk/lib/rte_vhost/vhost_user.c b/src/spdk/lib/rte_vhost/vhost_user.c
new file mode 100644
index 000000000..a07483fcf
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/vhost_user.c
@@ -0,0 +1,1426 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <asm/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <assert.h>
+#ifdef RTE_LIBRTE_VHOST_NUMA
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_log.h>
+
+#include "vhost.h"
+#include "vhost_user.h"
+
+#define VIRTIO_MIN_MTU 68
+#define VIRTIO_MAX_MTU 65535
+
+static const char *vhost_message_str[VHOST_USER_MAX] = {
+ [VHOST_USER_NONE] = "VHOST_USER_NONE",
+ [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
+ [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
+ [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
+ [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
+ [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
+ [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
+ [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
+ [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
+ [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
+ [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
+ [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
+ [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
+ [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
+ [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR",
+ [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
+ [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
+ [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
+ [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE",
+ [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP",
+ [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU",
+ [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
+ [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
+ [VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN",
+ [VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL",
+ [VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP",
+ [VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP",
+ [VHOST_USER_NVME_SET_BAR_MR] = "VHOST_USER_NVME_SET_BAR_MR"
+};
+
+static uint64_t
+get_blk_size(int fd)
+{
+ struct stat stat;
+ int ret;
+
+ ret = fstat(fd, &stat);
+ return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+static void
+free_mem_region(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct rte_vhost_mem_region *reg;
+
+ if (!dev || !dev->mem)
+ return;
+
+ for (i = 0; i < dev->mem->nregions; i++) {
+ reg = &dev->mem->regions[i];
+ if (reg->host_user_addr) {
+ munmap(reg->mmap_addr, reg->mmap_size);
+ close(reg->fd);
+ }
+ }
+}
+
+void
+vhost_backend_cleanup(struct virtio_net *dev)
+{
+ uint32_t i;
+
+ if (dev->has_new_mem_table) {
+ for (i = 0; i < dev->mem_table.nregions; i++) {
+ close(dev->mem_table_fds[i]);
+ }
+ dev->has_new_mem_table = 0;
+ }
+ if (dev->mem) {
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ }
+
+ free(dev->guest_pages);
+ dev->guest_pages = NULL;
+
+ if (dev->log_addr) {
+ munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+ dev->log_addr = 0;
+ }
+ if (dev->bar_addr) {
+ munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
+ dev->bar_addr = NULL;
+ dev->bar_size = 0;
+ }
+}
+
+/*
+ * This function just returns success at the moment; there is nothing
+ * to do as long as the device has been initialised.
+ */
+static int
+vhost_user_set_owner(void)
+{
+ return 0;
+}
+
+static int
+vhost_user_reset_owner(struct virtio_net *dev)
+{
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ cleanup_device(dev, 0);
+ reset_device(dev);
+ return 0;
+}
+
+/*
+ * Return the features that we support.
+ */
+static uint64_t
+vhost_user_get_features(struct virtio_net *dev)
+{
+ return dev->features;
+}
+
+/*
+ * We receive the negotiated features supported by us and the virtio device.
+ */
+static int
+vhost_user_set_features(struct virtio_net *dev, uint64_t features)
+{
+ uint64_t vhost_features = 0;
+
+ vhost_features = vhost_user_get_features(dev);
+ if (features & ~vhost_features) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) received invalid negotiated features.\n",
+ dev->vid);
+ return -1;
+ }
+
+ if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->negotiated_features != features) {
+ if (dev->notify_ops->features_changed) {
+ dev->notify_ops->features_changed(dev->vid, features);
+ } else {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+ }
+
+ dev->negotiated_features = features;
+ if (dev->negotiated_features &
+ ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) {
+ dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ } else {
+ dev->vhost_hlen = sizeof(struct virtio_net_hdr);
+ }
+ VHOST_LOG_DEBUG(VHOST_CONFIG,
+ "(%d) mergeable RX buffers %s, virtio 1 %s\n",
+ dev->vid,
+ (dev->negotiated_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
+ (dev->negotiated_features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
+
+ return 0;
+}
+
+/*
+ * The virtio device sends us the size of the descriptor ring.
+ */
+static int
+vhost_user_set_vring_num(struct virtio_net *dev,
+ VhostUserMsg *msg)
+{
+ struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+
+ vq->size = msg->payload.state.num;
+
+ if (dev->dequeue_zero_copy) {
+ vq->nr_zmbuf = 0;
+ vq->last_zmbuf_idx = 0;
+ vq->zmbuf_size = vq->size;
+ vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size *
+ sizeof(struct zcopy_mbuf), 0);
+ if (vq->zmbufs == NULL) {
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "failed to allocate mem for zero copy; "
+ "zero copy is force disabled\n");
+ dev->dequeue_zero_copy = 0;
+ }
+ }
+
+ vq->shadow_used_ring = rte_malloc(NULL,
+ vq->size * sizeof(struct vring_used_elem),
+ RTE_CACHE_LINE_SIZE);
+ if (!vq->shadow_used_ring) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to allocate memory for shadow used ring.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Reallocate the virtio_dev and vhost_virtqueue data structures so that they
+ * reside on the same NUMA node as the memory backing the vring descriptors.
+ */
+#ifdef RTE_LIBRTE_VHOST_NUMA
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index)
+{
+ int oldnode, newnode;
+ struct virtio_net *old_dev;
+ struct vhost_virtqueue *old_vq, *vq;
+ int ret;
+
+ old_dev = dev;
+ vq = old_vq = dev->virtqueue[index];
+
+ ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc,
+ MPOL_F_NODE | MPOL_F_ADDR);
+
+ /* check if we need to reallocate vq */
+ ret |= get_mempolicy(&oldnode, NULL, 0, old_vq,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Unable to get vq numa information.\n");
+ return dev;
+ }
+ if (oldnode != newnode) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "reallocate vq from %d to %d node\n", oldnode, newnode);
+ vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode);
+ if (!vq)
+ return dev;
+
+ memcpy(vq, old_vq, sizeof(*vq));
+ rte_free(old_vq);
+ }
+
+ /* check if we need to reallocate dev */
+ ret = get_mempolicy(&oldnode, NULL, 0, old_dev,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "Unable to get dev numa information.\n");
+ goto out;
+ }
+ if (oldnode != newnode) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "reallocate dev from %d to %d node\n",
+ oldnode, newnode);
+ dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode);
+ if (!dev) {
+ dev = old_dev;
+ goto out;
+ }
+
+ memcpy(dev, old_dev, sizeof(*dev));
+ rte_free(old_dev);
+ }
+
+out:
+ dev->virtqueue[index] = vq;
+ vhost_devices[dev->vid] = dev;
+
+ return dev;
+}
+#else
+static struct virtio_net*
+numa_realloc(struct virtio_net *dev, int index __rte_unused)
+{
+ return dev;
+}
+#endif
+
+/*
+ * Convert a QEMU virtual address to a vhost virtual address. This function
+ * is used to translate the ring addresses into our address space.
+ */
+static uint64_t
+qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
+{
+ struct rte_vhost_mem_region *reg;
+ uint32_t i;
+
+ /* Find the region where the address lives. */
+ for (i = 0; i < dev->mem->nregions; i++) {
+ reg = &dev->mem->regions[i];
+
+ if (qva >= reg->guest_user_addr &&
+ qva < reg->guest_user_addr + reg->size) {
+
+ if (unlikely(*len > reg->guest_user_addr + reg->size - qva))
+ *len = reg->guest_user_addr + reg->size - qva;
+
+ return qva - reg->guest_user_addr +
+ reg->host_user_addr;
+ }
+ }
+
+ return 0;
+}
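+
+/*
+ * A quick qva_to_vva() illustration with arbitrary values: for a region with
+ * guest_user_addr = 0x7f0000000000, host_user_addr = 0x400000000000 and
+ * size = 0x40000000, a qva of 0x7f0000001000 falls inside the region and
+ * translates to qva - guest_user_addr + host_user_addr = 0x400000001000,
+ * while *len is clamped so the mapping never extends past the region end.
+ */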
+
+static int vhost_setup_mem_table(struct virtio_net *dev);
+
+/*
+ * The virtio device sends us the desc, used and avail ring addresses.
+ * This function then converts these to our address space.
+ */
+static int
+vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg)
+{
+ struct vhost_virtqueue *vq;
+ uint64_t len;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ if (dev->has_new_mem_table) {
+ vhost_setup_mem_table(dev);
+ dev->has_new_mem_table = 0;
+ }
+
+ if (dev->mem == NULL)
+ return -1;
+
+ /* addr->index refers to the queue index. The txq is 1, rxq is 0. */
+ vq = dev->virtqueue[msg->payload.addr.index];
+
+ /* The addresses are converted from QEMU virtual to Vhost virtual. */
+ len = sizeof(struct vring_desc) * vq->size;
+ vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev,
+ msg->payload.addr.desc_user_addr, &len);
+ if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to map desc ring.\n",
+ dev->vid);
+ return -1;
+ }
+
+ dev = numa_realloc(dev, msg->payload.addr.index);
+ vq = dev->virtqueue[msg->payload.addr.index];
+
+ len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
+ vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev,
+ msg->payload.addr.avail_user_addr, &len);
+ if (vq->avail == 0 ||
+ len != sizeof(struct vring_avail)
+ + sizeof(uint16_t) * vq->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to find avail ring address.\n",
+ dev->vid);
+ return -1;
+ }
+
+ len = sizeof(struct vring_used) +
+ sizeof(struct vring_used_elem) * vq->size;
+ vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev,
+ msg->payload.addr.used_user_addr, &len);
+ if (vq->used == 0 || len != sizeof(struct vring_used) +
+ sizeof(struct vring_used_elem) * vq->size) {
+
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to find used ring address.\n",
+ dev->vid);
+ return -1;
+ }
+
+ if (vq->last_used_idx != vq->used->idx) {
+ RTE_LOG(WARNING, VHOST_CONFIG,
+ "last_used_idx (%u) and vq->used->idx (%u) mismatches; "
+ "some packets maybe resent for Tx and dropped for Rx\n",
+ vq->last_used_idx, vq->used->idx);
+ vq->last_used_idx = vq->used->idx;
+ vq->last_avail_idx = vq->used->idx;
+ }
+
+ vq->log_guest_addr = msg->payload.addr.log_guest_addr;
+
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n",
+ dev->vid, vq->desc);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n",
+ dev->vid, vq->avail);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n",
+ dev->vid, vq->used);
+ VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n",
+ dev->vid, vq->log_guest_addr);
+
+ return 0;
+}
+
+/*
+ * The virtio device sends us the last used index of the available ring.
+ */
+static int
+vhost_user_set_vring_base(struct virtio_net *dev,
+ VhostUserMsg *msg)
+{
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num;
+ dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num;
+
+ return 0;
+}
+
+static void
+add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
+ uint64_t host_phys_addr, uint64_t size)
+{
+ struct guest_page *page, *last_page;
+
+ if (dev->nr_guest_pages == dev->max_guest_pages) {
+ dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2);
+ dev->guest_pages = realloc(dev->guest_pages,
+ dev->max_guest_pages * sizeof(*page));
+ }
+
+ if (dev->nr_guest_pages > 0) {
+ last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
+ /* merge if the two pages are contiguous */
+ if (host_phys_addr == last_page->host_phys_addr +
+ last_page->size) {
+ last_page->size += size;
+ return;
+ }
+ }
+
+ page = &dev->guest_pages[dev->nr_guest_pages++];
+ page->guest_phys_addr = guest_phys_addr;
+ page->host_phys_addr = host_phys_addr;
+ page->size = size;
+}
+
+static void
+add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
+ uint64_t page_size)
+{
+ uint64_t reg_size = reg->size;
+ uint64_t host_user_addr = reg->host_user_addr;
+ uint64_t guest_phys_addr = reg->guest_phys_addr;
+ uint64_t host_phys_addr;
+ uint64_t size;
+
+ host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr);
+ size = page_size - (guest_phys_addr & (page_size - 1));
+ size = RTE_MIN(size, reg_size);
+
+ add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+ host_user_addr += size;
+ guest_phys_addr += size;
+ reg_size -= size;
+
+ while (reg_size > 0) {
+ size = RTE_MIN(reg_size, page_size);
+ host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)
+ host_user_addr);
+ add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size);
+
+ host_user_addr += size;
+ guest_phys_addr += size;
+ reg_size -= size;
+ }
+}
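+
+/*
+ * Worked add_guest_pages() example (illustrative numbers): with a 4 KiB
+ * page size and a region whose guest_phys_addr is 0x10000800, the first
+ * chunk covers page_size - (0x10000800 & 0xfff) = 0x800 bytes, so every
+ * following chunk starts page-aligned; add_one_guest_page() then merges
+ * chunks whose host physical addresses happen to be contiguous.
+ */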
+
+#ifdef RTE_LIBRTE_VHOST_DEBUG
+/* TODO: enable it only in debug mode? */
+static void
+dump_guest_pages(struct virtio_net *dev)
+{
+ uint32_t i;
+ struct guest_page *page;
+
+ for (i = 0; i < dev->nr_guest_pages; i++) {
+ page = &dev->guest_pages[i];
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "guest physical page region %u\n"
+ "\t guest_phys_addr: %" PRIx64 "\n"
+ "\t host_phys_addr : %" PRIx64 "\n"
+ "\t size : %" PRIx64 "\n",
+ i,
+ page->guest_phys_addr,
+ page->host_phys_addr,
+ page->size);
+ }
+}
+#else
+#define dump_guest_pages(dev)
+#endif
+
+static int
+vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+ uint32_t i;
+
+ if (dev->has_new_mem_table) {
+ /*
+ * The previous mem table was not consumed, so close the
+ * file descriptors from that mem table before copying
+ * the new one.
+ */
+ for (i = 0; i < dev->mem_table.nregions; i++) {
+ close(dev->mem_table_fds[i]);
+ }
+ }
+
+ memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table));
+ memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds));
+ dev->has_new_mem_table = 1;
+ /* vhost-user-nvme does not send the set vring addr message,
+ * so enable the memory address table now.
+ */
+ if (dev->has_new_mem_table && dev->is_nvme) {
+ vhost_setup_mem_table(dev);
+ dev->has_new_mem_table = 0;
+ }
+
+ return 0;
+}
+
+static int
+vhost_setup_mem_table(struct virtio_net *dev)
+{
+ struct VhostUserMemory memory = dev->mem_table;
+ struct rte_vhost_mem_region *reg;
+ struct vhost_virtqueue *vq;
+ void *mmap_addr;
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+ uint64_t alignment;
+ uint32_t i;
+ int fd;
+
+ if (dev->mem) {
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ }
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ vq = dev->virtqueue[i];
+ /* These addresses will no longer be valid in the host address space
+ * after the new memory table is set. The initiator needs to resend
+ * them.
+ */
+ vq->desc = NULL;
+ vq->avail = NULL;
+ vq->used = NULL;
+ }
+
+ dev->nr_guest_pages = 0;
+ if (!dev->guest_pages) {
+ dev->max_guest_pages = 8;
+ dev->guest_pages = malloc(dev->max_guest_pages *
+ sizeof(struct guest_page));
+ }
+
+ dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) +
+ sizeof(struct rte_vhost_mem_region) * memory.nregions, 0);
+ if (dev->mem == NULL) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "(%d) failed to allocate memory for dev->mem\n",
+ dev->vid);
+ return -1;
+ }
+ dev->mem->nregions = memory.nregions;
+
+ for (i = 0; i < memory.nregions; i++) {
+ fd = dev->mem_table_fds[i];
+ reg = &dev->mem->regions[i];
+
+ reg->guest_phys_addr = memory.regions[i].guest_phys_addr;
+ reg->guest_user_addr = memory.regions[i].userspace_addr;
+ reg->size = memory.regions[i].memory_size;
+ reg->fd = fd;
+
+ mmap_offset = memory.regions[i].mmap_offset;
+ mmap_size = reg->size + mmap_offset;
+
+ /* On older long-term Linux kernels (e.g. 2.6.32 and 3.2.72), mmap()
+ * without MAP_ANONYMOUS must be called with a length aligned to the
+ * hugepage size, or it will fail with EINVAL.
+ *
+ * To avoid that failure, make sure the length passed in by the caller
+ * stays aligned.
+ */
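+ /* Worked example (illustrative numbers only): with 2 MiB hugepages
+ * (alignment = 0x200000), a region of size 0x3ff00000 and mmap_offset
+ * 0x1000 gives mmap_size = 0x3ff01000, which RTE_ALIGN_CEIL rounds up
+ * to 0x40000000 before the mmap() call below.
+ */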
+ alignment = get_blk_size(fd);
+ if (alignment == (uint64_t)-1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "couldn't get hugepage size through fstat\n");
+ goto err_mmap;
+ }
+ mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+
+ mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+
+ if (mmap_addr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap region %u failed.\n", i);
+ goto err_mmap;
+ }
+
+ if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "MADV_DONTDUMP advice setting failed.\n");
+ }
+
+ reg->mmap_addr = mmap_addr;
+ reg->mmap_size = mmap_size;
+ reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
+ mmap_offset;
+
+ if (dev->dequeue_zero_copy)
+ add_guest_pages(dev, reg, alignment);
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "guest memory region %u, size: 0x%" PRIx64 "\n"
+ "\t guest physical addr: 0x%" PRIx64 "\n"
+ "\t guest virtual addr: 0x%" PRIx64 "\n"
+ "\t host virtual addr: 0x%" PRIx64 "\n"
+ "\t mmap addr : 0x%" PRIx64 "\n"
+ "\t mmap size : 0x%" PRIx64 "\n"
+ "\t mmap align: 0x%" PRIx64 "\n"
+ "\t mmap off : 0x%" PRIx64 "\n",
+ i, reg->size,
+ reg->guest_phys_addr,
+ reg->guest_user_addr,
+ reg->host_user_addr,
+ (uint64_t)(uintptr_t)mmap_addr,
+ mmap_size,
+ alignment,
+ mmap_offset);
+ }
+
+ dump_guest_pages(dev);
+
+ return 0;
+
+err_mmap:
+ free_mem_region(dev);
+ rte_free(dev->mem);
+ dev->mem = NULL;
+ return -1;
+}
+
+static int
+vq_is_ready(struct vhost_virtqueue *vq)
+{
+ return vq && vq->desc &&
+ vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+ vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD &&
+ vq->kickfd != VIRTIO_INVALID_EVENTFD &&
+ vq->callfd != VIRTIO_INVALID_EVENTFD;
+}
+
+static int
+virtio_is_ready(struct virtio_net *dev)
+{
+ struct vhost_virtqueue *vq;
+ uint32_t i;
+
+ if (dev->nr_vring == 0)
+ return 0;
+
+ for (i = 0; i < dev->nr_vring; i++) {
+ vq = dev->virtqueue[i];
+
+ if (vq_is_ready(vq)) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "virtio is now ready for processing.\n");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static void
+vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+ struct vhost_vring_file file;
+ struct vhost_virtqueue *vq;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.fd = VIRTIO_INVALID_EVENTFD;
+ else
+ file.fd = pmsg->fds[0];
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring call idx:%d file:%d\n", file.index, file.fd);
+
+ vq = dev->virtqueue[file.index];
+ if (vq->callfd >= 0)
+ close(vq->callfd);
+
+ vq->callfd = file.fd;
+}
+
+static void
+vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+ struct vhost_vring_file file;
+ struct vhost_virtqueue *vq;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
+ file.fd = VIRTIO_INVALID_EVENTFD;
+ else
+ file.fd = pmsg->fds[0];
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring kick idx:%d file:%d\n", file.index, file.fd);
+
+ vq = dev->virtqueue[file.index];
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+ vq->kickfd = file.fd;
+}
+
+static void
+free_zmbufs(struct vhost_virtqueue *vq)
+{
+ struct zcopy_mbuf *zmbuf, *next;
+
+ for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
+ zmbuf != NULL; zmbuf = next) {
+ next = TAILQ_NEXT(zmbuf, next);
+
+ rte_pktmbuf_free(zmbuf->mbuf);
+ TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
+ }
+
+ rte_free(vq->zmbufs);
+}
+
+/*
+ * When virtio is stopped, QEMU sends us the GET_VRING_BASE message.
+ */
+static int
+vhost_user_get_vring_base(struct virtio_net *dev,
+ VhostUserMsg *msg)
+{
+ struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
+
+ /* We have to stop the queue (virtio) if it is running. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ dev->flags &= ~VIRTIO_DEV_READY;
+
+ /* Here we are safe to get the last used index */
+ msg->payload.state.num = vq->last_used_idx;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num);
+ /*
+ * In the current QEMU vhost-user implementation, this message is
+ * sent only from vhost_vring_stop.
+ * TODO: clean up the vring; it is not usable from this point on.
+ */
+ if (vq->kickfd >= 0)
+ close(vq->kickfd);
+
+ vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ if (vq->callfd >= 0)
+ close(vq->callfd);
+
+ vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
+
+ if (dev->dequeue_zero_copy)
+ free_zmbufs(vq);
+ rte_free(vq->shadow_used_ring);
+ vq->shadow_used_ring = NULL;
+
+ return 0;
+}
+
+/*
+ * When the virtio queues are ready to work, QEMU sends us a message to
+ * enable the virtio queue pair.
+ */
+static int
+vhost_user_set_vring_enable(struct virtio_net *dev,
+ VhostUserMsg *msg)
+{
+ int enable = (int)msg->payload.state.num;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "set queue enable: %d to qp idx: %d\n",
+ enable, msg->payload.state.index);
+
+ if (dev->notify_ops->vring_state_changed)
+ dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable);
+
+ dev->virtqueue[msg->payload.state.index]->enabled = enable;
+
+ return 0;
+}
+
+static void
+vhost_user_set_protocol_features(struct virtio_net *dev,
+ uint64_t protocol_features)
+{
+ if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES)
+ return;
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ dev->protocol_features = protocol_features;
+}
+
+static int
+vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+ int fd = msg->fds[0];
+ uint64_t size, off;
+ void *addr;
+
+ if (fd < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd);
+ return -1;
+ }
+
+ if (msg->size != sizeof(VhostUserLog)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid log base msg size: %"PRId32" != %d\n",
+ msg->size, (int)sizeof(VhostUserLog));
+ return -1;
+ }
+
+ /* Remove from the data plane. */
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+
+ size = msg->payload.log.mmap_size;
+ off = msg->payload.log.mmap_offset;
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "log mmap size: %"PRId64", offset: %"PRId64"\n",
+ size, off);
+
+ /*
+ * mmap from offset 0 to work around a hugepage mmap bug: mmap will
+ * fail when the offset is not page-size aligned.
+ */
+ addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n");
+ return -1;
+ }
+
+ /*
+ * Free previously mapped log memory in case VHOST_USER_SET_LOG_BASE
+ * is received more than once.
+ */
+ if (dev->log_addr) {
+ munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
+ }
+ dev->log_addr = (uint64_t)(uintptr_t)addr;
+ dev->log_base = dev->log_addr + off;
+ dev->log_size = size;
+
+ return 0;
+}
+
+/*
+ * A RARP packet is constructed and broadcast to notify switches about
+ * the new location of the migrated VM, so that packets from outside are
+ * not lost after migration.
+ *
+ * However, we don't actually "send" a RARP packet here; instead, we set
+ * the 'broadcast_rarp' flag to let rte_vhost_dequeue_burst() inject it.
+ */
+static int
+vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+ uint8_t *mac = (uint8_t *)&msg->payload.u64;
+
+ RTE_LOG(DEBUG, VHOST_CONFIG,
+ ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
+ mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+ memcpy(dev->mac.addr_bytes, mac, 6);
+
+ /*
+ * Set the flag to inject a RARP broadcast packet at
+ * rte_vhost_dequeue_burst().
+ *
+ * rte_smp_wmb() ensures the MAC address is copied before the flag
+ * is set.
+ */
+ rte_smp_wmb();
+ rte_atomic16_set(&dev->broadcast_rarp, 1);
+
+ return 0;
+}
+
+static int
+vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg)
+{
+ if (msg->payload.u64 < VIRTIO_MIN_MTU ||
+ msg->payload.u64 > VIRTIO_MAX_MTU) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n",
+ msg->payload.u64);
+
+ return -1;
+ }
+
+ dev->mtu = msg->payload.u64;
+
+ return 0;
+}
+
+/* Return the number of bytes read on success, or a negative value on failure. */
+static int
+read_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+ int ret;
+
+ ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
+ msg->fds, VHOST_MEMORY_MAX_NREGIONS);
+ if (ret <= 0)
+ return ret;
+
+ if (msg && msg->size) {
+ if (msg->size > sizeof(msg->payload)) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid msg size: %d\n", msg->size);
+ return -1;
+ }
+ ret = read(sockfd, &msg->payload, msg->size);
+ if (ret <= 0)
+ return ret;
+ if (ret != (int)msg->size) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "read control message failed\n");
+ return -1;
+ }
+ }
+
+ return ret;
+}
+
+static int
+send_vhost_message(int sockfd, struct VhostUserMsg *msg)
+{
+ int ret;
+
+ if (!msg)
+ return 0;
+
+ msg->flags &= ~VHOST_USER_VERSION_MASK;
+ msg->flags &= ~VHOST_USER_NEED_REPLY;
+ msg->flags |= VHOST_USER_VERSION;
+ msg->flags |= VHOST_USER_REPLY_MASK;
+
+ ret = send_fd_message(sockfd, (char *)msg,
+ VHOST_USER_HDR_SIZE + msg->size, NULL, 0);
+
+ return ret;
+}
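+
+/*
+ * For reference, with the flag bits defined in vhost_user.h every reply
+ * sent here carries flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK
+ * (0x1 | 0x4 = 0x5), with VHOST_USER_NEED_REPLY cleared since a reply
+ * never requests another reply.
+ */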
+
+/*
+ * Allocate a queue pair if it hasn't been allocated yet
+ */
+static int
+vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
+{
+ uint16_t vring_idx;
+
+ switch (msg->request) {
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_VRING_ERR:
+ vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ break;
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_ENABLE:
+ vring_idx = msg->payload.state.index;
+ break;
+ case VHOST_USER_SET_VRING_ADDR:
+ vring_idx = msg->payload.addr.index;
+ break;
+ default:
+ return 0;
+ }
+
+ if (vring_idx >= VHOST_MAX_VRING) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "invalid vring index: %u\n", vring_idx);
+ return -1;
+ }
+
+ if (dev->virtqueue[vring_idx])
+ return 0;
+
+ return alloc_vring_queue(dev, vring_idx);
+}
+
+static int
+vhost_user_nvme_admin_passthrough(struct virtio_net *dev,
+ void *cmd, void *cqe, void *buf)
+{
+ if (dev->notify_ops->vhost_nvme_admin_passthrough) {
+ return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf);
+ }
+
+ return -1;
+}
+
+static int
+vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd)
+{
+ if (dev->notify_ops->vhost_nvme_set_cq_call) {
+ return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd);
+ }
+
+ return -1;
+}
+
+static int
+vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap)
+{
+ if (dev->notify_ops->vhost_nvme_get_cap) {
+ return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap);
+ }
+
+ return -1;
+}
+
+static int
+vhost_user_nvme_set_bar_mr(struct virtio_net *dev, struct VhostUserMsg *pmsg)
+{
+ struct VhostUserMemory mem_table;
+ int fd = pmsg->fds[0];
+ void *mmap_addr;
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+ uint64_t alignment;
+ struct rte_vhost_mem_region reg;
+ int ret = 0;
+
+ memcpy(&mem_table, &pmsg->payload.memory, sizeof(mem_table));
+
+ reg.guest_phys_addr = mem_table.regions[0].guest_phys_addr;
+ reg.guest_user_addr = mem_table.regions[0].userspace_addr;
+ reg.size = mem_table.regions[0].memory_size;
+ reg.fd = fd;
+ mmap_offset = mem_table.regions[0].mmap_offset;
+ mmap_size = reg.size + mmap_offset;
+
+ alignment = get_blk_size(fd);
+ if (alignment == (uint64_t)-1) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "couldn't get hugepage size through fstat\n");
+ return -1;
+ }
+ mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
+
+ mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+
+ if (mmap_addr == MAP_FAILED) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "mmap region failed.\n");
+ return -1;
+ }
+
+ if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "MADV_DONTDUMP advice setting failed.\n");
+ }
+
+ reg.mmap_addr = mmap_addr;
+ reg.mmap_size = mmap_size;
+ reg.host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
+ mmap_offset;
+
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "BAR memory region %u, size: 0x%" PRIx64 "\n"
+ "\t guest physical addr: 0x%" PRIx64 "\n"
+ "\t guest virtual addr: 0x%" PRIx64 "\n"
+ "\t host virtual addr: 0x%" PRIx64 "\n"
+ "\t mmap addr : 0x%" PRIx64 "\n"
+ "\t mmap size : 0x%" PRIx64 "\n"
+ "\t mmap align: 0x%" PRIx64 "\n"
+ "\t mmap off : 0x%" PRIx64 "\n",
+ 0, reg.size,
+ reg.guest_phys_addr,
+ reg.guest_user_addr,
+ reg.host_user_addr,
+ (uint64_t)(uintptr_t)mmap_addr,
+ mmap_size,
+ alignment,
+ mmap_offset);
+
+ if (dev->bar_addr) {
+ munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
+ }
+ dev->bar_addr = (void *)(uintptr_t)reg.host_user_addr;
+ dev->bar_size = reg.mmap_size;
+
+ if (dev->notify_ops->vhost_nvme_set_bar_mr) {
+ ret = dev->notify_ops->vhost_nvme_set_bar_mr(dev->vid, dev->bar_addr, dev->bar_size);
+ if (ret) {
+ munmap((void *)(uintptr_t)dev->bar_addr, dev->bar_size);
+ dev->bar_addr = NULL;
+ dev->bar_size = 0;
+ }
+ }
+
+ return ret;
+}
+
+int
+vhost_user_msg_handler(int vid, int fd)
+{
+ struct virtio_net *dev;
+ struct VhostUserMsg msg;
+ struct vhost_vring_file file;
+ int ret;
+ uint64_t cap;
+ uint64_t enable;
+ uint8_t cqe[16];
+ uint8_t cmd[64];
+ uint8_t buf[4096];
+
+ dev = get_device(vid);
+ if (dev == NULL)
+ return -1;
+
+ ret = read_vhost_message(fd, &msg);
+ if (ret <= 0 || msg.request >= VHOST_USER_MAX) {
+ if (ret < 0)
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost read message failed\n");
+ else if (ret == 0)
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "vhost peer closed\n");
+ else
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "vhost read incorrect message\n");
+
+ return -1;
+ }
+
+ RTE_LOG(INFO, VHOST_CONFIG, "%s: read message %s\n",
+ dev->ifname, vhost_message_str[msg.request]);
+
+ ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG,
+ "failed to alloc queue\n");
+ return -1;
+ }
+
+ switch (msg.request) {
+ case VHOST_USER_GET_CONFIG:
+ if (dev->notify_ops->get_config(dev->vid,
+ msg.payload.config.region,
+ msg.payload.config.size) != 0) {
+ msg.size = sizeof(uint64_t);
+ }
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_CONFIG:
+ if ((dev->notify_ops->set_config(dev->vid,
+ msg.payload.config.region,
+ msg.payload.config.offset,
+ msg.payload.config.size,
+ msg.payload.config.flags)) != 0) {
+ ret = 1;
+ } else {
+ ret = 0;
+ }
+ break;
+ case VHOST_USER_NVME_ADMIN:
+ if (!dev->is_nvme) {
+ dev->is_nvme = 1;
+ }
+ memcpy(cmd, msg.payload.nvme.cmd.req, sizeof(cmd));
+ ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf);
+ memcpy(msg.payload.nvme.cmd.cqe, cqe, sizeof(cqe));
+ msg.size = sizeof(cqe);
+ /* NVMe Identify Command */
+ if (cmd[0] == 0x06) {
+ memcpy(msg.payload.nvme.buf, &buf, 4096);
+ msg.size += 4096;
+ }
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_NVME_SET_CQ_CALL:
+ file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
+ file.fd = msg.fds[0];
+ ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd);
+ break;
+ case VHOST_USER_NVME_GET_CAP:
+ ret = vhost_user_nvme_get_cap(dev, &cap);
+ if (!ret)
+ msg.payload.u64 = cap;
+ else
+ msg.payload.u64 = 0;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_NVME_START_STOP:
+ enable = msg.payload.u64;
+ /* device must be started before set cq call */
+ if (enable) {
+ if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+ if (dev->notify_ops->new_device(dev->vid) == 0)
+ dev->flags |= VIRTIO_DEV_RUNNING;
+ }
+ } else {
+ if (dev->flags & VIRTIO_DEV_RUNNING) {
+ dev->flags &= ~VIRTIO_DEV_RUNNING;
+ dev->notify_ops->destroy_device(dev->vid);
+ }
+ }
+ break;
+ case VHOST_USER_NVME_SET_BAR_MR:
+ ret = vhost_user_nvme_set_bar_mr(dev, &msg);
+ break;
+ case VHOST_USER_GET_FEATURES:
+ msg.payload.u64 = vhost_user_get_features(dev);
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_FEATURES:
+ vhost_user_set_features(dev, msg.payload.u64);
+ break;
+
+ case VHOST_USER_GET_PROTOCOL_FEATURES:
+ msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ vhost_user_set_protocol_features(dev, msg.payload.u64);
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ vhost_user_set_owner();
+ break;
+ case VHOST_USER_RESET_OWNER:
+ vhost_user_reset_owner(dev);
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ ret = vhost_user_set_mem_table(dev, &msg);
+ break;
+
+ case VHOST_USER_SET_LOG_BASE:
+ vhost_user_set_log_base(dev, &msg);
+
+ /* it needs a reply */
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+ case VHOST_USER_SET_LOG_FD:
+ close(msg.fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n");
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ vhost_user_set_vring_num(dev, &msg);
+ break;
+ case VHOST_USER_SET_VRING_ADDR:
+ vhost_user_set_vring_addr(dev, &msg);
+ break;
+ case VHOST_USER_SET_VRING_BASE:
+ vhost_user_set_vring_base(dev, &msg);
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ vhost_user_get_vring_base(dev, &msg);
+ msg.size = sizeof(msg.payload.state);
+ send_vhost_message(fd, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ vhost_user_set_vring_kick(dev, &msg);
+ break;
+ case VHOST_USER_SET_VRING_CALL:
+ vhost_user_set_vring_call(dev, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_ERR:
+ if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
+ close(msg.fds[0]);
+ RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n");
+ break;
+
+ case VHOST_USER_GET_QUEUE_NUM:
+ msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ break;
+
+ case VHOST_USER_SET_VRING_ENABLE:
+ vhost_user_set_vring_enable(dev, &msg);
+ break;
+ case VHOST_USER_SEND_RARP:
+ vhost_user_send_rarp(dev, &msg);
+ break;
+
+ case VHOST_USER_NET_SET_MTU:
+ ret = vhost_user_net_set_mtu(dev, &msg);
+ break;
+
+ default:
+ ret = -1;
+ break;
+
+ }
+
+ if (msg.flags & VHOST_USER_NEED_REPLY) {
+ msg.payload.u64 = !!ret;
+ msg.size = sizeof(msg.payload.u64);
+ send_vhost_message(fd, &msg);
+ }
+
+ if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) {
+ dev->flags |= VIRTIO_DEV_READY;
+
+ if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
+ if (dev->dequeue_zero_copy) {
+ RTE_LOG(INFO, VHOST_CONFIG,
+ "dequeue zero copy is enabled\n");
+ }
+
+ if (dev->notify_ops->new_device(dev->vid) == 0)
+ dev->flags |= VIRTIO_DEV_RUNNING;
+ }
+ }
+
+ return 0;
+}
diff --git a/src/spdk/lib/rte_vhost/vhost_user.h b/src/spdk/lib/rte_vhost/vhost_user.h
new file mode 100644
index 000000000..d20574b64
--- /dev/null
+++ b/src/spdk/lib/rte_vhost/vhost_user.h
@@ -0,0 +1,171 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_NET_USER_H
+#define _VHOST_NET_USER_H
+
+#include <stdint.h>
+#include <linux/vhost.h>
+
+#include "rte_vhost.h"
+
+/* refer to hw/virtio/vhost-user.c */
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+/*
+ * Maximum size of virtio device config space
+ */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
+#define VHOST_USER_PROTOCOL_F_MQ 0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1
+#define VHOST_USER_PROTOCOL_F_RARP 2
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
+#define VHOST_USER_PROTOCOL_F_NET_MTU 4
+#define VHOST_USER_PROTOCOL_F_CONFIG 9
+
+#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
+ (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))
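+
+/*
+ * With the bit definitions above this mask evaluates to 0x21f: MQ,
+ * LOG_SHMFD, RARP, REPLY_ACK and NET_MTU occupy bits 0-4 (0x1f) and
+ * CONFIG adds bit 9 (0x200).
+ */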
+
+typedef enum VhostUserRequest {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_SEND_RARP = 19,
+ VHOST_USER_NET_SET_MTU = 20,
+ VHOST_USER_GET_CONFIG = 24,
+ VHOST_USER_SET_CONFIG = 25,
+ VHOST_USER_NVME_ADMIN = 80,
+ VHOST_USER_NVME_SET_CQ_CALL = 81,
+ VHOST_USER_NVME_GET_CAP = 82,
+ VHOST_USER_NVME_START_STOP = 83,
+ VHOST_USER_NVME_IO_CMD = 84,
+ VHOST_USER_NVME_SET_BAR_MR = 85,
+ VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef enum VhostUserSlaveRequest {
+ VHOST_USER_SLAVE_NONE = 0,
+ VHOST_USER_SLAVE_IOTLB_MSG = 1,
+ VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
+ VHOST_USER_SLAVE_MAX
+} VhostUserSlaveRequest;
+
+typedef struct VhostUserMemoryRegion {
+ uint64_t guest_phys_addr;
+ uint64_t memory_size;
+ uint64_t userspace_addr;
+ uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+ uint32_t nregions;
+ uint32_t padding;
+ VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserLog {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+} VhostUserLog;
+
+typedef struct VhostUserConfig {
+ uint32_t offset;
+ uint32_t size;
+ uint32_t flags;
+ uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];
+} VhostUserConfig;
+
+typedef struct VhostUserMsg {
+ VhostUserRequest request;
+
+#define VHOST_USER_VERSION_MASK 0x3
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+#define VHOST_USER_NEED_REPLY (0x1 << 3)
+ uint32_t flags;
+ uint32_t size; /* the following payload size */
+ union {
+#define VHOST_USER_VRING_IDX_MASK 0xff
+#define VHOST_USER_VRING_NOFD_MASK (0x1<<8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ VhostUserMemory memory;
+ VhostUserLog log;
+ VhostUserConfig config;
+ struct nvme {
+ union {
+ uint8_t req[64];
+ uint8_t cqe[16];
+ } cmd;
+ uint8_t buf[4096];
+ } nvme;
+ } payload;
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
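+
+/*
+ * Because VhostUserMsg is packed, the header covers only request, flags
+ * and size; on the usual ABI where the request enum is 4 bytes wide that
+ * works out to 4 + 4 + 4 = 12 bytes, and 'size' gives the number of
+ * payload bytes that follow on the wire.
+ */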
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION 0x1
+
+
+/* vhost_user.c */
+int vhost_user_msg_handler(int vid, int fd);
+
+/* socket.c */
+int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num);
+
+#endif
diff --git a/src/spdk/lib/scsi/Makefile b/src/spdk/lib/scsi/Makefile
new file mode 100644
index 000000000..8f8a8c326
--- /dev/null
+++ b/src/spdk/lib/scsi/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+C_SRCS = dev.c lun.c port.c scsi.c scsi_bdev.c scsi_pr.c scsi_rpc.c task.c
+LIBNAME = scsi
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_scsi.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/scsi/dev.c b/src/spdk/lib/scsi/dev.c
new file mode 100644
index 000000000..6d3cfdf31
--- /dev/null
+++ b/src/spdk/lib/scsi/dev.c
@@ -0,0 +1,436 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "scsi_internal.h"
+
+static struct spdk_scsi_dev g_devs[SPDK_SCSI_MAX_DEVS];
+
+struct spdk_scsi_dev *
+scsi_dev_get_list(void)
+{
+ return g_devs;
+}
+
+static struct spdk_scsi_dev *
+allocate_dev(void)
+{
+ struct spdk_scsi_dev *dev;
+ int i;
+
+ for (i = 0; i < SPDK_SCSI_MAX_DEVS; i++) {
+ dev = &g_devs[i];
+ if (!dev->is_allocated) {
+ memset(dev, 0, sizeof(*dev));
+ dev->id = i;
+ dev->is_allocated = 1;
+ return dev;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+free_dev(struct spdk_scsi_dev *dev)
+{
+ assert(dev->is_allocated == 1);
+ assert(dev->removed == true);
+
+ dev->is_allocated = 0;
+
+ if (dev->remove_cb) {
+ dev->remove_cb(dev->remove_ctx, 0);
+ dev->remove_cb = NULL;
+ }
+}
+
+void
+spdk_scsi_dev_destruct(struct spdk_scsi_dev *dev,
+ spdk_scsi_dev_destruct_cb_t cb_fn, void *cb_arg)
+{
+ int lun_cnt;
+ int i;
+
+ if (dev == NULL) {
+ if (cb_fn) {
+ cb_fn(cb_arg, -EINVAL);
+ }
+ return;
+ }
+
+ if (dev->removed) {
+ if (cb_fn) {
+ cb_fn(cb_arg, -EINVAL);
+ }
+ return;
+ }
+
+ dev->removed = true;
+ dev->remove_cb = cb_fn;
+ dev->remove_ctx = cb_arg;
+ lun_cnt = 0;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ if (dev->lun[i] == NULL) {
+ continue;
+ }
+
+ /*
+ * The LUN will remove itself from this dev when all outstanding IO
+ * is done. When no LUNs remain, the dev will be deleted.
+ */
+ scsi_lun_destruct(dev->lun[i]);
+ lun_cnt++;
+ }
+
+ if (lun_cnt == 0) {
+ free_dev(dev);
+ return;
+ }
+}
+
+static int
+scsi_dev_find_lowest_free_lun_id(struct spdk_scsi_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ if (dev->lun[i] == NULL) {
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+int
+spdk_scsi_dev_add_lun(struct spdk_scsi_dev *dev, const char *bdev_name, int lun_id,
+ void (*hotremove_cb)(const struct spdk_scsi_lun *, void *),
+ void *hotremove_ctx)
+{
+ struct spdk_bdev *bdev;
+ struct spdk_scsi_lun *lun;
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("device %s: cannot find bdev '%s' (target %d)\n",
+ dev->name, bdev_name, lun_id);
+ return -1;
+ }
+
+ /* Search for the lowest free LUN ID if the LUN ID is the default. */
+ if (lun_id == -1) {
+ lun_id = scsi_dev_find_lowest_free_lun_id(dev);
+ if (lun_id == -1) {
+ SPDK_ERRLOG("Free LUN ID is not found\n");
+ return -1;
+ }
+ }
+
+ lun = scsi_lun_construct(bdev, hotremove_cb, hotremove_ctx);
+ if (lun == NULL) {
+ return -1;
+ }
+
+ lun->id = lun_id;
+ lun->dev = dev;
+ dev->lun[lun_id] = lun;
+ return 0;
+}
+
+void
+spdk_scsi_dev_delete_lun(struct spdk_scsi_dev *dev,
+ struct spdk_scsi_lun *lun)
+{
+ int lun_cnt = 0;
+ int i;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ if (dev->lun[i] == lun) {
+ dev->lun[i] = NULL;
+ }
+
+ if (dev->lun[i]) {
+ lun_cnt++;
+ }
+ }
+
+ if (dev->removed == true && lun_cnt == 0) {
+ free_dev(dev);
+ }
+}
+
+struct spdk_scsi_dev *spdk_scsi_dev_construct(const char *name, const char *bdev_name_list[],
+ int *lun_id_list, int num_luns, uint8_t protocol_id,
+ void (*hotremove_cb)(const struct spdk_scsi_lun *, void *),
+ void *hotremove_ctx)
+{
+ struct spdk_scsi_dev *dev;
+ size_t name_len;
+ bool found_lun_0;
+ int i, rc;
+
+ name_len = strlen(name);
+ if (name_len > sizeof(dev->name) - 1) {
+ SPDK_ERRLOG("device %s: name longer than maximum allowed length %zu\n",
+ name, sizeof(dev->name) - 1);
+ return NULL;
+ }
+
+ if (num_luns == 0) {
+ SPDK_ERRLOG("device %s: no LUNs specified\n", name);
+ return NULL;
+ }
+
+ found_lun_0 = false;
+ for (i = 0; i < num_luns; i++) {
+ if (lun_id_list[i] == 0) {
+ found_lun_0 = true;
+ break;
+ }
+ }
+
+ if (!found_lun_0) {
+ SPDK_ERRLOG("device %s: no LUN 0 specified\n", name);
+ return NULL;
+ }
+
+ for (i = 0; i < num_luns; i++) {
+ if (bdev_name_list[i] == NULL) {
+ SPDK_ERRLOG("NULL spdk_scsi_lun for LUN %d\n",
+ lun_id_list[i]);
+ return NULL;
+ }
+ }
+
+ dev = allocate_dev();
+ if (dev == NULL) {
+ return NULL;
+ }
+
+ memcpy(dev->name, name, name_len + 1);
+
+ dev->num_ports = 0;
+ dev->protocol_id = protocol_id;
+
+ for (i = 0; i < num_luns; i++) {
+ rc = spdk_scsi_dev_add_lun(dev, bdev_name_list[i], lun_id_list[i],
+ hotremove_cb, hotremove_ctx);
+ if (rc < 0) {
+ spdk_scsi_dev_destruct(dev, NULL, NULL);
+ return NULL;
+ }
+ }
+
+ return dev;
+}
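+
+/*
+ * Typical construction call (a minimal sketch; "Malloc0" is only a
+ * placeholder bdev name and 0x05 is the SPC protocol identifier for
+ * iSCSI):
+ *
+ *   const char *bdevs[] = { "Malloc0" };
+ *   int lun_ids[] = { 0 };
+ *   struct spdk_scsi_dev *d = spdk_scsi_dev_construct("Dev0", bdevs,
+ *           lun_ids, 1, 0x05, NULL, NULL);
+ *
+ * LUN 0 must always be present and every entry in bdev_name_list must
+ * name an existing bdev, otherwise construction fails and NULL is
+ * returned.
+ */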
+
+void
+spdk_scsi_dev_queue_mgmt_task(struct spdk_scsi_dev *dev,
+ struct spdk_scsi_task *task)
+{
+ assert(task != NULL);
+
+ scsi_lun_execute_mgmt_task(task->lun, task);
+}
+
+void
+spdk_scsi_dev_queue_task(struct spdk_scsi_dev *dev,
+ struct spdk_scsi_task *task)
+{
+ assert(task != NULL);
+
+ scsi_lun_execute_task(task->lun, task);
+}
+
+static struct spdk_scsi_port *
+scsi_dev_find_free_port(struct spdk_scsi_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) {
+ if (!dev->port[i].is_used) {
+ return &dev->port[i];
+ }
+ }
+
+ return NULL;
+}
+
+int
+spdk_scsi_dev_add_port(struct spdk_scsi_dev *dev, uint64_t id, const char *name)
+{
+ struct spdk_scsi_port *port;
+ int rc;
+
+ if (dev->num_ports == SPDK_SCSI_DEV_MAX_PORTS) {
+ SPDK_ERRLOG("device already has %d ports\n", SPDK_SCSI_DEV_MAX_PORTS);
+ return -1;
+ }
+
+ port = spdk_scsi_dev_find_port_by_id(dev, id);
+ if (port != NULL) {
+ SPDK_ERRLOG("device already has port(%" PRIu64 ")\n", id);
+ return -1;
+ }
+
+ port = scsi_dev_find_free_port(dev);
+ if (port == NULL) {
+ assert(false);
+ return -1;
+ }
+
+ rc = scsi_port_construct(port, id, dev->num_ports, name);
+ if (rc != 0) {
+ return rc;
+ }
+
+ dev->num_ports++;
+ return 0;
+}
+
+int
+spdk_scsi_dev_delete_port(struct spdk_scsi_dev *dev, uint64_t id)
+{
+ struct spdk_scsi_port *port;
+
+ port = spdk_scsi_dev_find_port_by_id(dev, id);
+ if (port == NULL) {
+ SPDK_ERRLOG("device does not have specified port(%" PRIu64 ")\n", id);
+ return -1;
+ }
+
+ scsi_port_destruct(port);
+
+ dev->num_ports--;
+
+ return 0;
+}
+
+struct spdk_scsi_port *
+spdk_scsi_dev_find_port_by_id(struct spdk_scsi_dev *dev, uint64_t id)
+{
+ int i;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) {
+ if (!dev->port[i].is_used) {
+ continue;
+ }
+ if (dev->port[i].id == id) {
+ return &dev->port[i];
+ }
+ }
+
+ /* No matching port found. */
+ return NULL;
+}
+
+void
+spdk_scsi_dev_free_io_channels(struct spdk_scsi_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ if (dev->lun[i] == NULL) {
+ continue;
+ }
+ scsi_lun_free_io_channel(dev->lun[i]);
+ }
+}
+
+int
+spdk_scsi_dev_allocate_io_channels(struct spdk_scsi_dev *dev)
+{
+ int i, rc;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ if (dev->lun[i] == NULL) {
+ continue;
+ }
+ rc = scsi_lun_allocate_io_channel(dev->lun[i]);
+ if (rc < 0) {
+ spdk_scsi_dev_free_io_channels(dev);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+const char *
+spdk_scsi_dev_get_name(const struct spdk_scsi_dev *dev)
+{
+ return dev->name;
+}
+
+int
+spdk_scsi_dev_get_id(const struct spdk_scsi_dev *dev)
+{
+ return dev->id;
+}
+
+struct spdk_scsi_lun *
+spdk_scsi_dev_get_lun(struct spdk_scsi_dev *dev, int lun_id)
+{
+ struct spdk_scsi_lun *lun;
+
+ if (lun_id < 0 || lun_id >= SPDK_SCSI_DEV_MAX_LUN) {
+ return NULL;
+ }
+
+ lun = dev->lun[lun_id];
+
+ if (lun != NULL && !spdk_scsi_lun_is_removing(lun)) {
+ return lun;
+ } else {
+ return NULL;
+ }
+}
+
+bool
+spdk_scsi_dev_has_pending_tasks(const struct spdk_scsi_dev *dev,
+ const struct spdk_scsi_port *initiator_port)
+{
+ int i;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; ++i) {
+ if (dev->lun[i] &&
+ (scsi_lun_has_pending_tasks(dev->lun[i], initiator_port) ||
+ scsi_lun_has_pending_mgmt_tasks(dev->lun[i], initiator_port))) {
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/src/spdk/lib/scsi/lun.c b/src/spdk/lib/scsi/lun.c
new file mode 100644
index 000000000..262137d80
--- /dev/null
+++ b/src/spdk/lib/scsi/lun.c
@@ -0,0 +1,623 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "scsi_internal.h"
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/util.h"
+#include "spdk/likely.h"
+
+static void scsi_lun_execute_tasks(struct spdk_scsi_lun *lun);
+static void _scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun);
+
+void
+scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task)
+{
+ if (lun) {
+ TAILQ_REMOVE(&lun->tasks, task, scsi_link);
+ spdk_trace_record(TRACE_SCSI_TASK_DONE, lun->dev->id, 0, (uintptr_t)task, 0);
+ }
+ task->cpl_fn(task);
+}
+
+static void
+scsi_lun_complete_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task)
+{
+ TAILQ_REMOVE(&lun->mgmt_tasks, task, scsi_link);
+
+ task->cpl_fn(task);
+
+ /* Try to execute the first pending mgmt task if it exists. */
+ _scsi_lun_execute_mgmt_task(lun);
+}
+
+static bool
+_scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun)
+{
+ return !TAILQ_EMPTY(&lun->pending_mgmt_tasks);
+}
+
+static bool
+scsi_lun_has_outstanding_mgmt_tasks(const struct spdk_scsi_lun *lun)
+{
+ return !TAILQ_EMPTY(&lun->mgmt_tasks);
+}
+
+static bool
+_scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun)
+{
+ return !TAILQ_EMPTY(&lun->pending_tasks);
+}
+
+static bool
+scsi_lun_has_outstanding_tasks(const struct spdk_scsi_lun *lun)
+{
+ return !TAILQ_EMPTY(&lun->tasks);
+}
+
+/* A reset task has to wait until all prior outstanding tasks complete. */
+static int
+scsi_lun_reset_check_outstanding_tasks(void *arg)
+{
+ struct spdk_scsi_task *task = (struct spdk_scsi_task *)arg;
+ struct spdk_scsi_lun *lun = task->lun;
+
+ if (scsi_lun_has_outstanding_tasks(lun)) {
+ return SPDK_POLLER_BUSY;
+ }
+ spdk_poller_unregister(&lun->reset_poller);
+
+ scsi_lun_complete_mgmt_task(lun, task);
+ return SPDK_POLLER_BUSY;
+}
+
+void
+scsi_lun_complete_reset_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task)
+{
+ if (task->status == SPDK_SCSI_STATUS_GOOD) {
+ if (scsi_lun_has_outstanding_tasks(lun)) {
+ lun->reset_poller =
+ SPDK_POLLER_REGISTER(scsi_lun_reset_check_outstanding_tasks,
+ task, 10);
+ return;
+ }
+ }
+
+ scsi_lun_complete_mgmt_task(lun, task);
+}
+
+static void
+scsi_lun_append_mgmt_task(struct spdk_scsi_lun *lun,
+ struct spdk_scsi_task *task)
+{
+ TAILQ_INSERT_TAIL(&lun->pending_mgmt_tasks, task, scsi_link);
+}
+
+static void
+_scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun)
+{
+ struct spdk_scsi_task *task;
+
+ if (!TAILQ_EMPTY(&lun->mgmt_tasks)) {
+ return;
+ }
+
+ task = TAILQ_FIRST(&lun->pending_mgmt_tasks);
+ if (spdk_likely(task == NULL)) {
+ /* Try to execute all pending tasks */
+ scsi_lun_execute_tasks(lun);
+ return;
+ }
+ TAILQ_REMOVE(&lun->pending_mgmt_tasks, task, scsi_link);
+
+ TAILQ_INSERT_TAIL(&lun->mgmt_tasks, task, scsi_link);
+
+ if (lun->removed) {
+ task->response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN;
+ scsi_lun_complete_mgmt_task(lun, task);
+ return;
+ }
+
+ switch (task->function) {
+ case SPDK_SCSI_TASK_FUNC_ABORT_TASK:
+ task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED;
+ SPDK_ERRLOG("ABORT_TASK failed\n");
+ break;
+
+ case SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET:
+ task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED;
+ SPDK_ERRLOG("ABORT_TASK_SET failed\n");
+ break;
+
+ case SPDK_SCSI_TASK_FUNC_LUN_RESET:
+ bdev_scsi_reset(task);
+ return;
+
+ default:
+ SPDK_ERRLOG("Unknown Task Management Function!\n");
+ /*
+ * Task management functions other than those above should never
+ * reach this point having been filtered by the frontend. Reject
+ * the task as being unsupported.
+ */
+ task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED;
+ break;
+ }
+
+ scsi_lun_complete_mgmt_task(lun, task);
+}
+
+void
+scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun,
+ struct spdk_scsi_task *task)
+{
+ scsi_lun_append_mgmt_task(lun, task);
+ _scsi_lun_execute_mgmt_task(lun);
+}
+
+static void
+_scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task)
+{
+ int rc;
+
+ task->status = SPDK_SCSI_STATUS_GOOD;
+ spdk_trace_record(TRACE_SCSI_TASK_START, lun->dev->id, task->length, (uintptr_t)task, 0);
+ TAILQ_INSERT_TAIL(&lun->tasks, task, scsi_link);
+ if (!lun->removed) {
+ /* Check whether the command is allowed when a reservation exists. */
+ if (spdk_unlikely(lun->reservation.flags & SCSI_SPC2_RESERVE)) {
+ rc = scsi2_reserve_check(task);
+ } else {
+ rc = scsi_pr_check(task);
+ }
+ if (spdk_unlikely(rc < 0)) {
+ /* Reservation Conflict */
+ rc = SPDK_SCSI_TASK_COMPLETE;
+ } else {
+ rc = bdev_scsi_execute(task);
+ }
+ } else {
+ spdk_scsi_task_process_abort(task);
+ rc = SPDK_SCSI_TASK_COMPLETE;
+ }
+
+ switch (rc) {
+ case SPDK_SCSI_TASK_PENDING:
+ break;
+
+ case SPDK_SCSI_TASK_COMPLETE:
+ scsi_lun_complete_task(lun, task);
+ break;
+
+ default:
+ abort();
+ }
+}
+
+static void
+scsi_lun_append_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task)
+{
+ TAILQ_INSERT_TAIL(&lun->pending_tasks, task, scsi_link);
+}
+
+static void
+scsi_lun_execute_tasks(struct spdk_scsi_lun *lun)
+{
+ struct spdk_scsi_task *task, *task_tmp;
+
+ TAILQ_FOREACH_SAFE(task, &lun->pending_tasks, scsi_link, task_tmp) {
+ TAILQ_REMOVE(&lun->pending_tasks, task, scsi_link);
+ _scsi_lun_execute_task(lun, task);
+ }
+}
+
+void
+scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task)
+{
+ if (spdk_unlikely(_scsi_lun_has_pending_mgmt_tasks(lun))) {
+ /* Add the IO task to the pending list and wait for completion of
+ * the existing mgmt tasks.
+ */
+ scsi_lun_append_task(lun, task);
+ } else if (spdk_unlikely(_scsi_lun_has_pending_tasks(lun))) {
+ /* If there is any pending IO task, append the IO task to the
+ * tail of the pending list, and then execute all pending IO tasks
+ * from the head to submit IO tasks in order.
+ */
+ scsi_lun_append_task(lun, task);
+ scsi_lun_execute_tasks(lun);
+ } else {
+ /* Execute the IO task directly. */
+ _scsi_lun_execute_task(lun, task);
+ }
+}
+
+static void
+_scsi_lun_remove(void *arg)
+{
+ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg;
+
+ spdk_bdev_close(lun->bdev_desc);
+ spdk_scsi_dev_delete_lun(lun->dev, lun);
+ free(lun);
+}
+
+static void
+scsi_lun_remove(struct spdk_scsi_lun *lun)
+{
+ struct spdk_scsi_pr_registrant *reg, *tmp;
+ struct spdk_thread *thread;
+
+ TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) {
+ TAILQ_REMOVE(&lun->reg_head, reg, link);
+ free(reg);
+ }
+
+ thread = spdk_get_thread();
+ if (thread != lun->thread) {
+ spdk_thread_send_msg(lun->thread, _scsi_lun_remove, lun);
+ } else {
+ _scsi_lun_remove(lun);
+ }
+}
+
+static int
+scsi_lun_check_io_channel(void *arg)
+{
+ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg;
+
+ if (lun->io_channel) {
+ return SPDK_POLLER_BUSY;
+ }
+ spdk_poller_unregister(&lun->hotremove_poller);
+
+ scsi_lun_remove(lun);
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+scsi_lun_notify_hot_remove(struct spdk_scsi_lun *lun)
+{
+ struct spdk_scsi_lun_desc *desc, *tmp;
+
+ if (lun->hotremove_cb) {
+ lun->hotremove_cb(lun, lun->hotremove_ctx);
+ }
+
+ TAILQ_FOREACH_SAFE(desc, &lun->open_descs, link, tmp) {
+ if (desc->hotremove_cb) {
+ desc->hotremove_cb(lun, desc->hotremove_ctx);
+ } else {
+ spdk_scsi_lun_close(desc);
+ }
+ }
+
+ if (lun->io_channel) {
+ lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_io_channel,
+ lun, 10);
+ } else {
+ scsi_lun_remove(lun);
+ }
+}
+
+static int
+scsi_lun_check_outstanding_tasks(void *arg)
+{
+ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg;
+
+ if (scsi_lun_has_outstanding_tasks(lun) ||
+ scsi_lun_has_outstanding_mgmt_tasks(lun)) {
+ return SPDK_POLLER_BUSY;
+ }
+ spdk_poller_unregister(&lun->hotremove_poller);
+
+ scsi_lun_notify_hot_remove(lun);
+ return SPDK_POLLER_BUSY;
+}
+
+static void
+_scsi_lun_hot_remove(void *arg1)
+{
+ struct spdk_scsi_lun *lun = arg1;
+
+ /* If lun->removed is set, no new task can be submitted to the LUN.
+ * Execute previously queued tasks, which will be immediately aborted.
+ */
+ scsi_lun_execute_tasks(lun);
+
+ /* Then we only need to wait for all outstanding tasks to be completed
+ * before notifying the upper layer about the removal.
+ */
+ if (scsi_lun_has_outstanding_tasks(lun) ||
+ scsi_lun_has_outstanding_mgmt_tasks(lun)) {
+ lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_outstanding_tasks,
+ lun, 10);
+ } else {
+ scsi_lun_notify_hot_remove(lun);
+ }
+}
+
+static void
+scsi_lun_hot_remove(void *remove_ctx)
+{
+ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)remove_ctx;
+ struct spdk_thread *thread;
+
+ if (lun->removed) {
+ return;
+ }
+
+ lun->removed = true;
+ if (lun->io_channel == NULL) {
+ _scsi_lun_hot_remove(lun);
+ return;
+ }
+
+ thread = spdk_io_channel_get_thread(lun->io_channel);
+ if (thread != spdk_get_thread()) {
+ spdk_thread_send_msg(thread, _scsi_lun_hot_remove, lun);
+ } else {
+ _scsi_lun_hot_remove(lun);
+ }
+}
+
+/**
+ * \brief Constructs a new spdk_scsi_lun object based on the provided parameters.
+ *
+ * \param bdev bdev associated with this LUN
+ * \param hotremove_cb callback to invoke when the underlying bdev is hot removed
+ * \param hotremove_ctx argument passed to hotremove_cb
+ *
+ * \return NULL if bdev == NULL
+ * \return pointer to the new spdk_scsi_lun object otherwise
+ */
+struct spdk_scsi_lun *scsi_lun_construct(struct spdk_bdev *bdev,
+ void (*hotremove_cb)(const struct spdk_scsi_lun *, void *),
+ void *hotremove_ctx)
+{
+ struct spdk_scsi_lun *lun;
+ int rc;
+
+ if (bdev == NULL) {
+ SPDK_ERRLOG("bdev must be non-NULL\n");
+ return NULL;
+ }
+
+ lun = calloc(1, sizeof(*lun));
+ if (lun == NULL) {
+ SPDK_ERRLOG("could not allocate lun\n");
+ return NULL;
+ }
+
+ rc = spdk_bdev_open(bdev, true, scsi_lun_hot_remove, lun, &lun->bdev_desc);
+
+ if (rc != 0) {
+ SPDK_ERRLOG("bdev %s cannot be opened, error=%d\n", spdk_bdev_get_name(bdev), rc);
+ free(lun);
+ return NULL;
+ }
+
+ lun->thread = spdk_get_thread();
+
+ TAILQ_INIT(&lun->tasks);
+ TAILQ_INIT(&lun->pending_tasks);
+ TAILQ_INIT(&lun->mgmt_tasks);
+ TAILQ_INIT(&lun->pending_mgmt_tasks);
+
+ lun->bdev = bdev;
+ lun->io_channel = NULL;
+ lun->hotremove_cb = hotremove_cb;
+ lun->hotremove_ctx = hotremove_ctx;
+ TAILQ_INIT(&lun->open_descs);
+ TAILQ_INIT(&lun->reg_head);
+
+ return lun;
+}
+
+void
+scsi_lun_destruct(struct spdk_scsi_lun *lun)
+{
+ scsi_lun_hot_remove(lun);
+}
+
+int
+spdk_scsi_lun_open(struct spdk_scsi_lun *lun, spdk_scsi_lun_remove_cb_t hotremove_cb,
+ void *hotremove_ctx, struct spdk_scsi_lun_desc **_desc)
+{
+ struct spdk_scsi_lun_desc *desc;
+
+ desc = calloc(1, sizeof(*desc));
+ if (desc == NULL) {
+ SPDK_ERRLOG("calloc() failed for LUN descriptor.\n");
+ return -ENOMEM;
+ }
+
+ TAILQ_INSERT_TAIL(&lun->open_descs, desc, link);
+
+ desc->lun = lun;
+ desc->hotremove_cb = hotremove_cb;
+ desc->hotremove_ctx = hotremove_ctx;
+ *_desc = desc;
+
+ return 0;
+}
+
+void
+spdk_scsi_lun_close(struct spdk_scsi_lun_desc *desc)
+{
+ struct spdk_scsi_lun *lun = desc->lun;
+
+ TAILQ_REMOVE(&lun->open_descs, desc, link);
+ free(desc);
+
+ assert(!TAILQ_EMPTY(&lun->open_descs) || lun->io_channel == NULL);
+}
+
+int
+scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun)
+{
+ if (lun->io_channel != NULL) {
+ if (spdk_get_thread() == spdk_io_channel_get_thread(lun->io_channel)) {
+ lun->ref++;
+ return 0;
+ }
+ SPDK_ERRLOG("io_channel already allocated for lun %s\n",
+ spdk_bdev_get_name(lun->bdev));
+ return -1;
+ }
+
+ lun->io_channel = spdk_bdev_get_io_channel(lun->bdev_desc);
+ if (lun->io_channel == NULL) {
+ return -1;
+ }
+ lun->ref = 1;
+ return 0;
+}
+
+void
+scsi_lun_free_io_channel(struct spdk_scsi_lun *lun)
+{
+ if (lun->io_channel == NULL) {
+ return;
+ }
+
+ if (spdk_get_thread() != spdk_io_channel_get_thread(lun->io_channel)) {
+ SPDK_ERRLOG("io_channel was freed by different thread\n");
+ return;
+ }
+
+ lun->ref--;
+ if (lun->ref == 0) {
+ spdk_put_io_channel(lun->io_channel);
+ lun->io_channel = NULL;
+ }
+}
+
+int
+spdk_scsi_lun_allocate_io_channel(struct spdk_scsi_lun_desc *desc)
+{
+ struct spdk_scsi_lun *lun = desc->lun;
+
+ return scsi_lun_allocate_io_channel(lun);
+}
+
+void
+spdk_scsi_lun_free_io_channel(struct spdk_scsi_lun_desc *desc)
+{
+ struct spdk_scsi_lun *lun = desc->lun;
+
+ scsi_lun_free_io_channel(lun);
+}
+
+int
+spdk_scsi_lun_get_id(const struct spdk_scsi_lun *lun)
+{
+ return lun->id;
+}
+
+const char *
+spdk_scsi_lun_get_bdev_name(const struct spdk_scsi_lun *lun)
+{
+ return spdk_bdev_get_name(lun->bdev);
+}
+
+const struct spdk_scsi_dev *
+spdk_scsi_lun_get_dev(const struct spdk_scsi_lun *lun)
+{
+ return lun->dev;
+}
+
+bool
+scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun,
+ const struct spdk_scsi_port *initiator_port)
+{
+ struct spdk_scsi_task *task;
+
+ if (initiator_port == NULL) {
+ return _scsi_lun_has_pending_mgmt_tasks(lun) ||
+ scsi_lun_has_outstanding_mgmt_tasks(lun);
+ }
+
+ TAILQ_FOREACH(task, &lun->pending_mgmt_tasks, scsi_link) {
+ if (task->initiator_port == initiator_port) {
+ return true;
+ }
+ }
+
+ TAILQ_FOREACH(task, &lun->mgmt_tasks, scsi_link) {
+ if (task->initiator_port == initiator_port) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* This check includes both pending and submitted (outstanding) tasks. */
+bool
+scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun,
+ const struct spdk_scsi_port *initiator_port)
+{
+ struct spdk_scsi_task *task;
+
+ if (initiator_port == NULL) {
+ return _scsi_lun_has_pending_tasks(lun) ||
+ scsi_lun_has_outstanding_tasks(lun);
+ }
+
+ TAILQ_FOREACH(task, &lun->pending_tasks, scsi_link) {
+ if (task->initiator_port == initiator_port) {
+ return true;
+ }
+ }
+
+ TAILQ_FOREACH(task, &lun->tasks, scsi_link) {
+ if (task->initiator_port == initiator_port) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool
+spdk_scsi_lun_is_removing(const struct spdk_scsi_lun *lun)
+{
+ return lun->removed;
+}
+
+bool
+spdk_scsi_lun_get_dif_ctx(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task,
+ struct spdk_dif_ctx *dif_ctx)
+{
+ return bdev_scsi_get_dif_ctx(lun->bdev, task, dif_ctx);
+}
diff --git a/src/spdk/lib/scsi/port.c b/src/spdk/lib/scsi/port.c
new file mode 100644
index 000000000..09311bac2
--- /dev/null
+++ b/src/spdk/lib/scsi/port.c
@@ -0,0 +1,134 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "scsi_internal.h"
+
+#include "spdk/endian.h"
+
+struct spdk_scsi_port *
+spdk_scsi_port_create(uint64_t id, uint16_t index, const char *name)
+{
+ struct spdk_scsi_port *port;
+
+ port = calloc(1, sizeof(struct spdk_scsi_port));
+
+ if (!port) {
+ return NULL;
+ }
+
+ if (scsi_port_construct(port, id, index, name) != 0) {
+ spdk_scsi_port_free(&port);
+ return NULL;
+ }
+
+ return port;
+}
+
+void
+spdk_scsi_port_free(struct spdk_scsi_port **pport)
+{
+ struct spdk_scsi_port *port;
+
+ if (!pport) {
+ return;
+ }
+
+ port = *pport;
+ *pport = NULL;
+ free(port);
+}
+
+int
+scsi_port_construct(struct spdk_scsi_port *port, uint64_t id, uint16_t index,
+ const char *name)
+{
+ if (strlen(name) >= sizeof(port->name)) {
+ SPDK_ERRLOG("port name too long\n");
+ return -1;
+ }
+
+ port->is_used = 1;
+ port->id = id;
+ port->index = index;
+ snprintf(port->name, sizeof(port->name), "%s", name);
+ return 0;
+}
+
+void
+scsi_port_destruct(struct spdk_scsi_port *port)
+{
+ memset(port, 0, sizeof(struct spdk_scsi_port));
+}
+
+const char *
+spdk_scsi_port_get_name(const struct spdk_scsi_port *port)
+{
+ return port->name;
+}
+
+/*
+ * spc3r23 7.5.4.6 iSCSI initiator port TransportID,
+ * using code format 0x01.
+ */
+void
+spdk_scsi_port_set_iscsi_transport_id(struct spdk_scsi_port *port, char *iscsi_name,
+ uint64_t isid)
+{
+ struct spdk_scsi_iscsi_transport_id *data;
+ uint32_t len;
+ char *name;
+
+ memset(port->transport_id, 0, sizeof(port->transport_id));
+ port->transport_id_len = 0;
+
+ data = (struct spdk_scsi_iscsi_transport_id *)port->transport_id;
+
+ data->protocol_id = (uint8_t)SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI;
+ data->format = 0x1;
+
+ name = data->name;
+ len = snprintf(name, SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH - sizeof(*data),
+ "%s,i,0x%12.12" PRIx64, iscsi_name, isid);
+ do {
+ name[len++] = '\0';
+ } while (len & 3);
+
+ if (len < 20) {
+ SPDK_ERRLOG("The length of Transport ID should >= 20 bytes\n");
+ return;
+ }
+
+ to_be16(&data->additional_len, len);
+ port->transport_id_len = len + sizeof(*data);
+}
diff --git a/src/spdk/lib/scsi/scsi.c b/src/spdk/lib/scsi/scsi.c
new file mode 100644
index 000000000..c18192e37
--- /dev/null
+++ b/src/spdk/lib/scsi/scsi.c
@@ -0,0 +1,110 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "scsi_internal.h"
+
+struct spdk_scsi_globals g_scsi;
+
+int
+spdk_scsi_init(void)
+{
+ int rc;
+
+ rc = pthread_mutex_init(&g_scsi.mutex, NULL);
+ if (rc != 0) {
+ SPDK_ERRLOG("mutex_init() failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+void
+spdk_scsi_fini(void)
+{
+ pthread_mutex_destroy(&g_scsi.mutex);
+}
+
+SPDK_TRACE_REGISTER_FN(scsi_trace, "scsi", TRACE_GROUP_SCSI)
+{
+ spdk_trace_register_owner(OWNER_SCSI_DEV, 'd');
+ spdk_trace_register_object(OBJECT_SCSI_TASK, 't');
+ spdk_trace_register_description("SCSI_TASK_DONE", TRACE_SCSI_TASK_DONE,
+ OWNER_SCSI_DEV, OBJECT_SCSI_TASK, 0, 0, "");
+ spdk_trace_register_description("SCSI_TASK_START", TRACE_SCSI_TASK_START,
+ OWNER_SCSI_DEV, OBJECT_SCSI_TASK, 0, 0, "");
+}
+
+uint64_t
+spdk_scsi_lun_id_int_to_fmt(int lun_id)
+{
+ uint64_t fmt_lun, method;
+
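+ /* Encode the integer LUN ID into the single-level SAM-2 LUN format:
+ * peripheral device addressing (method 0) for IDs below 256,
+ * flat space addressing (method 1) for IDs up to 16383.
+ */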
+ if (SPDK_SCSI_DEV_MAX_LUN <= 0x0100) {
+ /* below 256 */
+ method = 0x00U;
+ fmt_lun = (method & 0x03U) << 62;
+ fmt_lun |= ((uint64_t)lun_id & 0x00ffU) << 48;
+ } else if (SPDK_SCSI_DEV_MAX_LUN <= 0x4000) {
+ /* below 16384 */
+ method = 0x01U;
+ fmt_lun = (method & 0x03U) << 62;
+ fmt_lun |= ((uint64_t)lun_id & 0x3fffU) << 48;
+ } else {
+ /* XXX */
+ fmt_lun = 0;
+ }
+
+ return fmt_lun;
+}
+
+int
+spdk_scsi_lun_id_fmt_to_int(uint64_t fmt_lun)
+{
+ uint64_t method;
+ int lun_i;
+
+ method = (fmt_lun >> 62) & 0x03U;
+ fmt_lun = fmt_lun >> 48;
+ if (method == 0x00U) {
+ lun_i = (int)(fmt_lun & 0x00ffU);
+ } else if (method == 0x01U) {
+ lun_i = (int)(fmt_lun & 0x3fffU);
+ } else {
+ lun_i = 0xffffU;
+ }
+ return lun_i;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("scsi", SPDK_LOG_SCSI)
diff --git a/src/spdk/lib/scsi/scsi_bdev.c b/src/spdk/lib/scsi/scsi_bdev.c
new file mode 100644
index 000000000..bf0fb5af7
--- /dev/null
+++ b/src/spdk/lib/scsi/scsi_bdev.c
@@ -0,0 +1,2067 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "scsi_internal.h"
+
+/*
+ * TODO: move bdev SCSI error code translation tests to bdev unit test
+ * and remove this include.
+ */
+#include "spdk/bdev_module.h"
+
+#include "spdk/env.h"
+#include "spdk/bdev.h"
+#include "spdk/endian.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+
+#define SPDK_WORK_BLOCK_SIZE (4ULL * 1024ULL * 1024ULL)
+#define SPDK_WORK_ATS_BLOCK_SIZE (1ULL * 1024ULL * 1024ULL)
+#define MAX_SERIAL_STRING 32
+
+#define DEFAULT_DISK_VENDOR "INTEL"
+#define DEFAULT_DISK_REVISION "0001"
+#define DEFAULT_DISK_ROTATION_RATE 1 /* Non-rotating medium */
+#define DEFAULT_DISK_FORM_FACTOR 0x02 /* 3.5 inch */
+#define DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT 256
+
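+/*
+ * Offset just past 'field' in the standard INQUIRY data, i.e. the minimum
+ * allocation length needed for the response to include that field.
+ */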
+#define INQUIRY_OFFSET(field) offsetof(struct spdk_scsi_cdb_inquiry_data, field) + \
+ sizeof(((struct spdk_scsi_cdb_inquiry_data *)0x0)->field)
+
+static void bdev_scsi_process_block_resubmit(void *arg);
+
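+/*
+ * Convert one hex digit to its numeric value; any other character falls
+ * through and is returned as its (lowercased) character code.
+ */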
+static int
+hex2bin(char ch)
+{
+ if ((ch >= '0') && (ch <= '9')) {
+ return ch - '0';
+ }
+ ch = tolower(ch);
+ if ((ch >= 'a') && (ch <= 'f')) {
+ return ch - 'a' + 10;
+ }
+ return (int)ch;
+}
+
+static void
+bdev_scsi_set_naa_ieee_extended(const char *name, uint8_t *buf)
+{
+ int i, value, count = 0;
+ uint64_t local_value;
+
+ for (i = 0; (i < 16) && (name[i] != '\0'); i++) {
+ value = hex2bin(name[i]);
+ if (i % 2) {
+ buf[count++] |= value << 4;
+ } else {
+ buf[count] = value;
+ }
+ }
+
+ local_value = *(uint64_t *)buf;
+ /*
+ * see spc3r23 7.6.3.6.2,
+ * NAA IEEE Extended identifier format
+ */
+ local_value &= 0x0fff000000ffffffull;
+ /* NAA 02, and 00 03 47 for IEEE Intel */
+ local_value |= 0x2000000347000000ull;
+
+ to_be64((void *)buf, local_value);
+}
+
+static int
+bdev_scsi_report_luns(struct spdk_scsi_lun *lun,
+ int sel, uint8_t *data, int alloc_len)
+{
+ struct spdk_scsi_dev *dev;
+ uint64_t fmt_lun;
+ int hlen, len = 0;
+ int i;
+
+ if (alloc_len < 8) {
+ return -1;
+ }
+
+ if (sel == 0x00) {
+ /* logical unit with addressing method */
+ } else if (sel == 0x01) {
+ /* well known logical unit */
+ } else if (sel == 0x02) {
+ /* logical unit */
+ } else {
+ return -1;
+ }
+
+ /* LUN LIST LENGTH */
+ memset(data, 0, 4);
+
+ /* Reserved */
+ memset(&data[4], 0, 4);
+ hlen = 8;
+
+ dev = lun->dev;
+
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) {
+ if (dev->lun[i] == NULL) {
+ continue;
+ }
+
+ if (alloc_len - (hlen + len) < 8) {
+ return -1;
+ }
+
+ fmt_lun = spdk_scsi_lun_id_int_to_fmt(i);
+
+ /* LUN */
+ to_be64(&data[hlen + len], fmt_lun);
+ len += 8;
+ }
+
+ /* LUN LIST LENGTH */
+ to_be32(data, len);
+
+ return hlen + len;
+}
+
+static int
+bdev_scsi_pad_scsi_name(char *dst, const char *name)
+{
+ size_t len;
+
+ len = strlen(name);
+ memcpy(dst, name, len);
+ do {
+ dst[len++] = '\0';
+ } while (len & 3);
+
+ return len;
+}
+
+static int
+bdev_scsi_inquiry(struct spdk_bdev *bdev, struct spdk_scsi_task *task,
+ uint8_t *cdb, uint8_t *data, uint16_t alloc_len)
+{
+ struct spdk_scsi_lun *lun;
+ struct spdk_scsi_dev *dev;
+ struct spdk_scsi_port *port;
+ uint32_t blocks, optimal_blocks;
+ int hlen = 0, plen, plen2;
+ uint16_t len = 0;
+ int pc;
+ int pd;
+ int evpd;
+ int i;
+ struct spdk_scsi_cdb_inquiry *inq = (struct spdk_scsi_cdb_inquiry *)cdb;
+
+ /* A standard INQUIRY command requires an allocation length of at least 36 bytes. */
+ if (alloc_len < 0x24) {
+ goto inq_error;
+ }
+
+ lun = task->lun;
+ dev = lun->dev;
+ port = task->target_port;
+
+ pd = SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK;
+ pc = inq->page_code;
+ evpd = inq->evpd & 0x1;
+
+ if (!evpd && pc) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+ }
+
+ if (evpd) {
+ struct spdk_scsi_vpd_page *vpage = (struct spdk_scsi_vpd_page *)data;
+
+ /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */
+ vpage->peripheral_device_type = pd;
+ vpage->peripheral_qualifier = SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED;
+ /* PAGE CODE */
+ vpage->page_code = pc;
+
+ /* Vital product data */
+ switch (pc) {
+ case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES:
+ hlen = 4;
+
+ vpage->params[0] = SPDK_SPC_VPD_SUPPORTED_VPD_PAGES;
+ vpage->params[1] = SPDK_SPC_VPD_UNIT_SERIAL_NUMBER;
+ vpage->params[2] = SPDK_SPC_VPD_DEVICE_IDENTIFICATION;
+ vpage->params[3] = SPDK_SPC_VPD_MANAGEMENT_NETWORK_ADDRESSES;
+ vpage->params[4] = SPDK_SPC_VPD_EXTENDED_INQUIRY_DATA;
+ vpage->params[5] = SPDK_SPC_VPD_MODE_PAGE_POLICY;
+ vpage->params[6] = SPDK_SPC_VPD_SCSI_PORTS;
+ vpage->params[7] = SPDK_SPC_VPD_BLOCK_LIMITS;
+ vpage->params[8] = SPDK_SPC_VPD_BLOCK_DEV_CHARS;
+ len = 9;
+ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+ vpage->params[9] = SPDK_SPC_VPD_BLOCK_THIN_PROVISION;
+ len++;
+ }
+
+ /* PAGE LENGTH */
+ to_be16(vpage->alloc_len, len);
+ break;
+
+ case SPDK_SPC_VPD_UNIT_SERIAL_NUMBER: {
+ const char *name = spdk_bdev_get_name(bdev);
+
+ hlen = 4;
+
+ /* PRODUCT SERIAL NUMBER */
+ len = strlen(name) + 1;
+ if (len > MAX_SERIAL_STRING) {
+ len = MAX_SERIAL_STRING;
+ }
+
+ memcpy(vpage->params, name, len - 1);
+ vpage->params[len - 1] = 0;
+
+ /* PAGE LENGTH */
+ to_be16(vpage->alloc_len, len);
+ break;
+ }
+
+ case SPDK_SPC_VPD_DEVICE_IDENTIFICATION: {
+ const char *name = spdk_bdev_get_name(bdev);
+ const char *product_name = spdk_bdev_get_product_name(bdev);
+ uint8_t protocol_id = dev->protocol_id;
+ uint8_t *buf = vpage->params;
+ struct spdk_scsi_desig_desc *desig;
+
+ hlen = 4;
+
+ /* Check the total length by calculating how much space all the entries take. */
+ len = sizeof(struct spdk_scsi_desig_desc) + 8;
+ len += sizeof(struct spdk_scsi_desig_desc) + 8 + 16 + MAX_SERIAL_STRING;
+ len += sizeof(struct spdk_scsi_desig_desc) + SPDK_SCSI_DEV_MAX_NAME + 1;
+ len += sizeof(struct spdk_scsi_desig_desc) + SPDK_SCSI_PORT_MAX_NAME_LENGTH;
+ len += sizeof(struct spdk_scsi_desig_desc) + 4;
+ len += sizeof(struct spdk_scsi_desig_desc) + 4;
+ len += sizeof(struct spdk_scsi_desig_desc) + 4;
+ if (sizeof(struct spdk_scsi_vpd_page) + len > alloc_len) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+ }
+
+ /* Now fill out the designator array */
+
+ /* NAA designator */
+ desig = (struct spdk_scsi_desig_desc *)buf;
+ desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY;
+ desig->protocol_id = protocol_id;
+ desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_NAA;
+ desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT;
+ desig->reserved0 = 0;
+ desig->piv = 1;
+ desig->reserved1 = 0;
+ desig->len = 8;
+ bdev_scsi_set_naa_ieee_extended(name, desig->desig);
+ len = sizeof(struct spdk_scsi_desig_desc) + 8;
+
+ buf += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ /* T10 Vendor ID designator */
+ desig = (struct spdk_scsi_desig_desc *)buf;
+ desig->code_set = SPDK_SPC_VPD_CODE_SET_ASCII;
+ desig->protocol_id = protocol_id;
+ desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_T10_VENDOR_ID;
+ desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT;
+ desig->reserved0 = 0;
+ desig->piv = 1;
+ desig->reserved1 = 0;
+ desig->len = 8 + 16 + MAX_SERIAL_STRING;
+ spdk_strcpy_pad(desig->desig, DEFAULT_DISK_VENDOR, 8, ' ');
+ spdk_strcpy_pad(&desig->desig[8], product_name, 16, ' ');
+ spdk_strcpy_pad(&desig->desig[24], name, MAX_SERIAL_STRING, ' ');
+ len += sizeof(struct spdk_scsi_desig_desc) + 8 + 16 + MAX_SERIAL_STRING;
+
+ buf += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ /* SCSI Device Name designator */
+ desig = (struct spdk_scsi_desig_desc *)buf;
+ desig->code_set = SPDK_SPC_VPD_CODE_SET_UTF8;
+ desig->protocol_id = protocol_id;
+ desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME;
+ desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_DEVICE;
+ desig->reserved0 = 0;
+ desig->piv = 1;
+ desig->reserved1 = 0;
+ desig->len = bdev_scsi_pad_scsi_name(desig->desig, dev->name);
+ len += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ buf += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ /* SCSI Port Name designator */
+ desig = (struct spdk_scsi_desig_desc *)buf;
+ desig->code_set = SPDK_SPC_VPD_CODE_SET_UTF8;
+ desig->protocol_id = protocol_id;
+ desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME;
+ desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT;
+ desig->reserved0 = 0;
+ desig->piv = 1;
+ desig->reserved1 = 0;
+ desig->len = snprintf(desig->desig, SPDK_SCSI_PORT_MAX_NAME_LENGTH, "%s", port->name);
+ len += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ buf += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ /* Relative Target Port designator */
+ desig = (struct spdk_scsi_desig_desc *)buf;
+ desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY;
+ desig->protocol_id = protocol_id;
+ desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_RELATIVE_TARGET_PORT;
+ desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT;
+ desig->reserved0 = 0;
+ desig->piv = 1;
+ desig->reserved1 = 0;
+ desig->len = 4;
+ memset(desig->desig, 0, 2); /* Reserved */
+ to_be16(&desig->desig[2], port->index);
+ len += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ buf += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ /* Target port group designator */
+ desig = (struct spdk_scsi_desig_desc *)buf;
+ desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY;
+ desig->protocol_id = protocol_id;
+ desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_TARGET_PORT_GROUP;
+ desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT;
+ desig->reserved0 = 0;
+ desig->piv = 1;
+ desig->reserved1 = 0;
+ desig->len = 4;
+ memset(desig->desig, 0, 4);
+ len += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ buf += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ /* Logical unit group designator */
+ desig = (struct spdk_scsi_desig_desc *)buf;
+ desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY;
+ desig->protocol_id = protocol_id;
+ desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_LOGICAL_UNIT_GROUP;
+ desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT;
+ desig->reserved0 = 0;
+ desig->piv = 1;
+ desig->reserved1 = 0;
+ desig->len = 4;
+ memset(desig->desig, 0, 2); /* Reserved */
+ to_be16(&desig->desig[2], dev->id);
+ len += sizeof(struct spdk_scsi_desig_desc) + desig->len;
+
+ to_be16(vpage->alloc_len, len);
+
+ break;
+ }
+
+ case SPDK_SPC_VPD_EXTENDED_INQUIRY_DATA: {
+ struct spdk_scsi_vpd_ext_inquiry *vext = (struct spdk_scsi_vpd_ext_inquiry *)vpage;
+
+ hlen = 4;
+ memset((uint8_t *)vext + hlen, 0, sizeof(*vext) - hlen);
+
+ /* RTO(3) GRD_CHK(2) APP_CHK(1) REF_CHK(0) */
+
+ /* GROUP_SUP(4) PRIOR_SUP(3) HEADSUP(2) ORDSUP(1) SIMPSUP(0) */
+ vext->sup = SPDK_SCSI_VEXT_HEADSUP | SPDK_SCSI_VEXT_SIMPSUP;
+
+ /* NV_SUP(1) V_SUP(0) */
+
+ /* Reserved[7-63] */
+
+ len = 64 - hlen;
+
+ /* PAGE LENGTH */
+ to_be16(vpage->alloc_len, len);
+ break;
+ }
+
+ case SPDK_SPC_VPD_MANAGEMENT_NETWORK_ADDRESSES:
+ /* PAGE LENGTH */
+ hlen = 4;
+
+ to_be16(vpage->alloc_len, len);
+ break;
+
+ case SPDK_SPC_VPD_MODE_PAGE_POLICY: {
+ struct spdk_scsi_mpage_policy_desc *pdesc =
+ (struct spdk_scsi_mpage_policy_desc *)vpage->params;
+
+ hlen = 4;
+
+ /* Mode page policy descriptor 1 */
+
+ /* POLICY PAGE CODE(5-0) */
+ /* all page code */
+ pdesc->page_code = 0x3f;
+
+ /* POLICY SUBPAGE CODE */
+ /* all sub page */
+ pdesc->sub_page_code = 0xff;
+
+ /* MLUS(7) MODE PAGE POLICY(1-0) */
+ /* MLUS own copy */
+ /* Shared MODE PAGE policy */
+ pdesc->policy = 0;
+ /* Reserved */
+ pdesc->reserved = 0;
+
+ len += 4;
+
+ to_be16(vpage->alloc_len, len);
+ break;
+ }
+
+ case SPDK_SPC_VPD_SCSI_PORTS: {
+ /* PAGE LENGTH */
+ hlen = 4;
+
+ /* Identification descriptor list */
+ for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) {
+ struct spdk_scsi_port_desc *sdesc;
+ struct spdk_scsi_tgt_port_desc *pdesc;
+
+ if (!dev->port[i].is_used) {
+ continue;
+ }
+
+ /* Identification descriptor N */
+ sdesc = (struct spdk_scsi_port_desc *)&vpage->params[len];
+
+ /* Reserved */
+ sdesc->reserved = 0;
+
+ /* RELATIVE PORT IDENTIFIER */
+ to_be16(&sdesc->rel_port_id, dev->port[i].index);
+
+ /* Reserved */
+ sdesc->reserved2 = 0;
+
+ /* INITIATOR PORT TRANSPORTID LENGTH */
+ sdesc->init_port_len = 0;
+
+ /* Reserved */
+ sdesc->init_port_id = 0;
+
+ /* TARGET PORT DESCRIPTORS LENGTH */
+ sdesc->tgt_desc_len = 0;
+
+ len += 12;
+
+ plen2 = 0;
+ /* Target port descriptor 1 */
+ pdesc = (struct spdk_scsi_tgt_port_desc *)sdesc->tgt_desc;
+
+ /* PROTOCOL IDENTIFIER(7-4) CODE SET(3-0) */
+ pdesc->code_set =
+ SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI << 4 |
+ SPDK_SPC_VPD_CODE_SET_UTF8;
+
+ /* PIV(7) ASSOCIATION(5-4) IDENTIFIER TYPE(3-0) */
+ pdesc->desig_type = SPDK_SPC_VPD_DESIG_PIV |
+ SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT << 4 |
+ SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME;
+
+ /* Reserved */
+ pdesc->reserved = 0;
+
+ /* IDENTIFIER */
+ plen = snprintf((char *)pdesc->designator,
+ SPDK_SCSI_PORT_MAX_NAME_LENGTH, "%s",
+ dev->port[i].name);
+ pdesc->len = plen;
+
+ plen2 += 4 + plen;
+
+ /* TARGET PORT DESCRIPTORS LENGTH */
+ to_be16(&sdesc->tgt_desc_len, plen2);
+
+ len += plen2;
+ }
+
+ to_be16(vpage->alloc_len, len);
+ break;
+ }
+
+ case SPDK_SPC_VPD_BLOCK_LIMITS: {
+ uint32_t block_size = spdk_bdev_get_data_block_size(bdev);
+
+ /* PAGE LENGTH */
+ memset(&data[4], 0, 60);
+
+ hlen = 4;
+
+ /* WSNZ(0) */
+ /* support zero length in WRITE SAME */
+
+ /* MAXIMUM COMPARE AND WRITE LENGTH */
+ blocks = SPDK_WORK_ATS_BLOCK_SIZE / block_size;
+
+ if (blocks > 0xff) {
+ blocks = 0xff;
+ }
+
+ data[5] = (uint8_t)blocks;
+
+ /* force align to 4KB */
+ if (block_size < 4096) {
+ optimal_blocks = 4096 / block_size;
+ } else {
+ optimal_blocks = 1;
+ }
+
+ /* OPTIMAL TRANSFER LENGTH GRANULARITY */
+ to_be16(&data[6], optimal_blocks);
+
+ blocks = SPDK_WORK_BLOCK_SIZE / block_size;
+
+ /* MAXIMUM TRANSFER LENGTH */
+ to_be32(&data[8], blocks);
+ /* OPTIMAL TRANSFER LENGTH */
+ to_be32(&data[12], blocks);
+
+ /* MAXIMUM PREFETCH XDREAD XDWRITE TRANSFER LENGTH */
+
+ len = 20 - hlen;
+
+ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+ /*
+ * MAXIMUM UNMAP LBA COUNT: indicates the
+ * maximum number of LBAs that may be
+ * unmapped by an UNMAP command.
+ */
+ /* For now, choose 4MB as the maximum. */
+ to_be32(&data[20], 4194304);
+
+ /*
+ * MAXIMUM UNMAP BLOCK DESCRIPTOR COUNT:
+ * indicates the maximum number of UNMAP
+ * block descriptors that shall be contained
+ * in the parameter data transferred to the
+ * device server for an UNMAP command.
+ * The bdev layer automatically splits unmap
+ * requests, so pick an arbitrary high number here.
+ */
+ to_be32(&data[24], DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT);
+
+ /*
+ * The UGAVALID bit is left as 0 which means neither the
+ * OPTIMAL UNMAP GRANULARITY nor the UNMAP GRANULARITY
+ * ALIGNMENT fields are valid.
+ */
+
+ /*
+ * MAXIMUM WRITE SAME LENGTH: indicates the
+ * maximum number of contiguous logical blocks
+ * that the device server allows to be unmapped
+ * or written in a single WRITE SAME command.
+ */
+ to_be64(&data[36], 512);
+
+ /* Reserved */
+ /* not specified */
+ len = 64 - hlen;
+ }
+
+ to_be16(vpage->alloc_len, len);
+ break;
+ }
+
+ case SPDK_SPC_VPD_BLOCK_DEV_CHARS: {
+ /* PAGE LENGTH */
+ hlen = 4;
+ len = 64 - hlen;
+
+ to_be16(&data[4], DEFAULT_DISK_ROTATION_RATE);
+
+ /* Reserved */
+ data[6] = 0;
+ /* NOMINAL FORM FACTOR(3-0) */
+ data[7] = DEFAULT_DISK_FORM_FACTOR << 4;
+ /* Reserved */
+ memset(&data[8], 0, 64 - 8);
+
+ to_be16(vpage->alloc_len, len);
+ break;
+ }
+
+ case SPDK_SPC_VPD_BLOCK_THIN_PROVISION: {
+ if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+ goto inq_error;
+ }
+
+ hlen = 4;
+ len = 7;
+
+ /*
+ * PAGE LENGTH: if the DP bit is set to one, then the
+ * page length shall be set to 0004h.
+ */
+ to_be16(&data[2], 0x0004);
+
+ /*
+ * THRESHOLD EXPONENT: indicates the threshold set
+ * size in LBAs as a power of 2 (i.e., the threshold
+ * set size = 2 ^ (threshold exponent)).
+ */
+ data[4] = 0;
+
+ /*
+ * Set the LBPU bit to indicate the support for UNMAP
+ * command.
+ */
+ data[5] |= SPDK_SCSI_UNMAP_LBPU;
+
+ /*
+ * Set the provisioning type to thin provision.
+ */
+ data[6] = SPDK_SCSI_UNMAP_THIN_PROVISIONING;
+
+ to_be16(vpage->alloc_len, len);
+ break;
+ }
+
+ default:
+ if (pc >= 0xc0 && pc <= 0xff) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "Vendor specific INQUIRY VPD page 0x%x\n", pc);
+ } else {
+ SPDK_ERRLOG("unsupported INQUIRY VPD page 0x%x\n", pc);
+ }
+ goto inq_error;
+ }
+ } else {
+ struct spdk_scsi_cdb_inquiry_data *inqdata =
+ (struct spdk_scsi_cdb_inquiry_data *)data;
+
+ /* Standard INQUIRY data */
+ /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */
+ inqdata->peripheral_device_type = pd;
+ inqdata->peripheral_qualifier = SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED;
+
+ /* RMB(7) */
+ inqdata->rmb = 0;
+
+ /* VERSION */
+ /* See SPC3/SBC2/MMC4/SAM2 for more details */
+ inqdata->version = SPDK_SPC_VERSION_SPC3;
+
+ /* NORMACA(5) HISUP(4) RESPONSE DATA FORMAT(3-0) */
+ /* format 2 */ /* hierarchical support */
+ inqdata->response = 2 | 1 << 4;
+
+ hlen = 5;
+
+ /* SCCS(7) ACC(6) TPGS(5-4) 3PC(3) PROTECT(0) */
+ /* Not support TPGS */
+ inqdata->flags = 0;
+
+ /* MULTIP */
+ inqdata->flags2 = 0x10;
+
+ /* WBUS16(5) SYNC(4) LINKED(3) CMDQUE(1) VS(0) */
+ /* CMDQUE */
+ inqdata->flags3 = 0x2;
+
+ /* T10 VENDOR IDENTIFICATION */
+ spdk_strcpy_pad(inqdata->t10_vendor_id, DEFAULT_DISK_VENDOR, 8, ' ');
+
+ /* PRODUCT IDENTIFICATION */
+ spdk_strcpy_pad(inqdata->product_id, spdk_bdev_get_product_name(bdev), 16, ' ');
+
+ /* PRODUCT REVISION LEVEL */
+ spdk_strcpy_pad(inqdata->product_rev, DEFAULT_DISK_REVISION, 4, ' ');
+
+ /*
+ * Standard inquiry data ends here. Only populate remaining fields if alloc_len
+ * indicates enough space to hold it.
+ */
+ len = INQUIRY_OFFSET(product_rev) - 5;
+
+ if (alloc_len >= INQUIRY_OFFSET(vendor)) {
+ /* Vendor specific */
+ memset(inqdata->vendor, 0x20, 20);
+ len += sizeof(inqdata->vendor);
+ }
+
+ if (alloc_len >= INQUIRY_OFFSET(ius)) {
+ /* CLOCKING(3-2) QAS(1) IUS(0) */
+ inqdata->ius = 0;
+ len += sizeof(inqdata->ius);
+ }
+
+ if (alloc_len >= INQUIRY_OFFSET(reserved)) {
+ /* Reserved */
+ inqdata->reserved = 0;
+ len += sizeof(inqdata->reserved);
+ }
+
+ /* VERSION DESCRIPTOR 1-8 */
+ if (alloc_len >= INQUIRY_OFFSET(reserved) + 2) {
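+ /* iSCSI (no version claimed) */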
+ to_be16(&inqdata->desc[0], 0x0960);
+ len += 2;
+ }
+
+ if (alloc_len >= INQUIRY_OFFSET(reserved) + 4) {
+ to_be16(&inqdata->desc[2], 0x0300); /* SPC-3 (no version claimed) */
+ len += 2;
+ }
+
+ if (alloc_len >= INQUIRY_OFFSET(reserved) + 6) {
+ to_be16(&inqdata->desc[4], 0x320); /* SBC-2 (no version claimed) */
+ len += 2;
+ }
+
+ if (alloc_len >= INQUIRY_OFFSET(reserved) + 8) {
+ to_be16(&inqdata->desc[6], 0x0040); /* SAM-2 (no version claimed) */
+ len += 2;
+ }
+
+ /*
+ * We only fill out 4 descriptors, but if the allocation length goes past
+ * that, zero the remaining bytes. This fixes some SCSI compliance tests
+ * which expect a full 96 bytes to be returned, including the unpopulated
+ * version descriptors 5-8 (4 * 2 = 8 bytes) plus the 22 bytes of reserved
+ * space (bytes 74-95) - for a total of 30 bytes.
+ */
+ if (alloc_len > INQUIRY_OFFSET(reserved) + 8) {
+ i = alloc_len - (INQUIRY_OFFSET(reserved) + 8);
+ if (i > 30) {
+ i = 30;
+ }
+ memset(&inqdata->desc[8], 0, i);
+ len += i;
+ }
+
+ /* ADDITIONAL LENGTH */
+ inqdata->add_len = len;
+ }
+
+ return hlen + len;
+
+inq_error:
+ task->data_transferred = 0;
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+}
+
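+/*
+ * Zero a mode page buffer and fill in its header: the page code (with the SPF
+ * bit set for the subpage format) and the page length field.
+ */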
+static void
+mode_sense_page_init(uint8_t *buf, int len, int page, int subpage)
+{
+ if (!buf) {
+ return;
+ }
+
+ memset(buf, 0, len);
+ if (subpage != 0) {
+ buf[0] = page | 0x40; /* PAGE + SPF=1 */
+ buf[1] = subpage;
+ to_be16(&buf[2], len - 4);
+ } else {
+ buf[0] = page;
+ buf[1] = len - 2;
+ }
+}
+
+static int
+bdev_scsi_mode_sense_page(struct spdk_bdev *bdev,
+ uint8_t *cdb, int pc, int page, int subpage,
+ uint8_t *data, struct spdk_scsi_task *task)
+{
+ uint8_t *cp = data;
+ int len = 0;
+ int plen;
+ int i;
+
+ if (pc == 0x00) {
+ /* Current values */
+ } else if (pc == 0x01) {
+ /* Changeable values */
+ /* As we currently do not support changeable values,
+ all parameters are reported as zero. */
+ } else if (pc == 0x02) {
+ /* Default values */
+ } else {
+ /* Saved values not supported */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_SAVING_PARAMETERS_NOT_SUPPORTED,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+ }
+
+ switch (page) {
+ case 0x00:
+ /* Vendor specific */
+ break;
+ case 0x01:
+ /* Read-Write Error Recovery */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI,
+ "MODE_SENSE Read-Write Error Recovery\n");
+ if (subpage != 0x00) {
+ break;
+ }
+ plen = 0x0a + 2;
+ mode_sense_page_init(cp, plen, page, subpage);
+ len += plen;
+ break;
+ case 0x02:
+ /* Disconnect-Reconnect */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI,
+ "MODE_SENSE Disconnect-Reconnect\n");
+ if (subpage != 0x00) {
+ break;
+ }
+ plen = 0x0e + 2;
+ mode_sense_page_init(cp, plen, page, subpage);
+ len += plen;
+ break;
+ case 0x03:
+ /* Obsolete (Format Device) */
+ break;
+ case 0x04:
+ /* Obsolete (Rigid Disk Geometry) */
+ break;
+ case 0x05:
+ /* Obsolete (Flexible Disk) */
+ break;
+ case 0x06:
+ /* Reserved */
+ break;
+ case 0x07:
+ /* Verify Error Recovery */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI,
+ "MODE_SENSE Verify Error Recovery\n");
+
+ if (subpage != 0x00) {
+ break;
+ }
+
+ plen = 0x0a + 2;
+ mode_sense_page_init(cp, plen, page, subpage);
+ len += plen;
+ break;
+ case 0x08: {
+ /* Caching */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SENSE Caching\n");
+ if (subpage != 0x00) {
+ break;
+ }
+
+ plen = 0x12 + 2;
+ mode_sense_page_init(cp, plen, page, subpage);
+
+ if (cp && spdk_bdev_has_write_cache(bdev) && pc != 0x01) {
+ cp[2] |= 0x4; /* WCE */
+ }
+
+ /* Read Cache Disable (RCD) = 1 */
+ if (cp && pc != 0x01) {
+ cp[2] |= 0x1;
+ }
+
+ len += plen;
+ break;
+ }
+ case 0x09:
+ /* Obsolete */
+ break;
+ case 0x0a:
+ switch (subpage) {
+ case 0x00:
+ /* Control */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI,
+ "MODE_SENSE Control\n");
+ plen = 0x0a + 2;
+ mode_sense_page_init(cp, plen, page, subpage);
+ len += plen;
+ break;
+ case 0x01:
+ /* Control Extension */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI,
+ "MODE_SENSE Control Extension\n");
+ plen = 0x1c + 4;
+ mode_sense_page_init(cp, plen, page, subpage);
+ len += plen;
+ break;
+ case 0xff:
+ /* All subpages */
+ len += bdev_scsi_mode_sense_page(bdev,
+ cdb, pc, page,
+ 0x00,
+ cp ? &cp[len] : NULL, task);
+ len += bdev_scsi_mode_sense_page(bdev,
+ cdb, pc, page,
+ 0x01,
+ cp ? &cp[len] : NULL, task);
+ break;
+ default:
+ /* 0x02-0x3e: Reserved */
+ break;
+ }
+ break;
+ case 0x0b:
+ /* Obsolete (Medium Types Supported) */
+ break;
+ case 0x0c:
+ /* Obsolete (Notch And Partition) */
+ break;
+ case 0x0d:
+ /* Obsolete */
+ break;
+ case 0x0e:
+ case 0x0f:
+ /* Reserved */
+ break;
+ case 0x10:
+ /* XOR Control */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SENSE XOR Control\n");
+ if (subpage != 0x00) {
+ break;
+ }
+ plen = 0x16 + 2;
+ mode_sense_page_init(cp, plen, page, subpage);
+ len += plen;
+ break;
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ /* Reserved */
+ break;
+ case 0x14:
+ /* Enclosure Services Management */
+ break;
+ case 0x15:
+ case 0x16:
+ case 0x17:
+ /* Reserved */
+ break;
+ case 0x18:
+ /* Protocol-Specific LUN */
+ break;
+ case 0x19:
+ /* Protocol-Specific Port */
+ break;
+ case 0x1a:
+ /* Power Condition */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI,
+ "MODE_SENSE Power Condition\n");
+ if (subpage != 0x00) {
+ break;
+ }
+ plen = 0x0a + 2;
+ mode_sense_page_init(cp, plen, page, subpage);
+ len += plen;
+ break;
+ case 0x1b:
+ /* Reserved */
+ break;
+ case 0x1c:
+ /* Informational Exceptions Control */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI,
+ "MODE_SENSE Informational Exceptions Control\n");
+ if (subpage != 0x00) {
+ break;
+ }
+
+ plen = 0x0a + 2;
+ mode_sense_page_init(cp, plen, page, subpage);
+ len += plen;
+ break;
+ case 0x1d:
+ case 0x1e:
+ case 0x1f:
+ /* Reserved */
+ break;
+ case 0x20:
+ case 0x21:
+ case 0x22:
+ case 0x23:
+ case 0x24:
+ case 0x25:
+ case 0x26:
+ case 0x27:
+ case 0x28:
+ case 0x29:
+ case 0x2a:
+ case 0x2b:
+ case 0x2c:
+ case 0x2d:
+ case 0x2e:
+ case 0x2f:
+ case 0x30:
+ case 0x31:
+ case 0x32:
+ case 0x33:
+ case 0x34:
+ case 0x35:
+ case 0x36:
+ case 0x37:
+ case 0x38:
+ case 0x39:
+ case 0x3a:
+ case 0x3b:
+ case 0x3c:
+ case 0x3d:
+ case 0x3e:
+ /* Vendor-specific */
+ break;
+ case 0x3f:
+ switch (subpage) {
+ case 0x00:
+ /* All mode pages */
+ for (i = 0x00; i < 0x3e; i ++) {
+ len += bdev_scsi_mode_sense_page(
+ bdev, cdb, pc, i, 0x00,
+ cp ? &cp[len] : NULL, task);
+ }
+ break;
+ case 0xff:
+ /* All mode pages and subpages */
+ for (i = 0x00; i < 0x3e; i ++) {
+ len += bdev_scsi_mode_sense_page(
+ bdev, cdb, pc, i, 0x00,
+ cp ? &cp[len] : NULL, task);
+ }
+ for (i = 0x00; i < 0x3e; i ++) {
+ len += bdev_scsi_mode_sense_page(
+ bdev, cdb, pc, i, 0xff,
+ cp ? &cp[len] : NULL, task);
+ }
+ break;
+ default:
+ /* 0x01-0x3e: Reserved */
+ break;
+ }
+ }
+
+ return len;
+}
+
+static int
+bdev_scsi_mode_sense(struct spdk_bdev *bdev, int md,
+ uint8_t *cdb, int dbd, int llbaa, int pc,
+ int page, int subpage, uint8_t *data, struct spdk_scsi_task *task)
+{
+ uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev);
+ uint32_t block_size = spdk_bdev_get_data_block_size(bdev);
+ uint8_t *hdr, *bdesc, *pages;
+ int hlen;
+ int blen;
+ int plen, total;
+
+ assert(md == 6 || md == 10);
+
+ if (md == 6) {
+ hlen = 4;
+ blen = 8; /* For MODE SENSE 6 only short LBA */
+ } else {
+ hlen = 8;
+ blen = llbaa ? 16 : 8;
+ }
+
+ if (dbd) {
+ blen = 0;
+ }
+
+ pages = data ? &data[hlen + blen] : NULL;
+ plen = bdev_scsi_mode_sense_page(bdev, cdb, pc, page,
+ subpage,
+ pages, task);
+ if (plen < 0) {
+ return -1;
+ }
+
+ total = hlen + blen + plen;
+ if (data == NULL) {
+ return total;
+ }
+
+ hdr = &data[0];
+ if (hlen == 4) {
+ hdr[0] = total - 1; /* Mode Data Length */
+ hdr[1] = 0; /* Medium Type */
+ hdr[2] = 0; /* Device-Specific Parameter */
+ hdr[3] = blen; /* Block Descriptor Length */
+ } else {
+ to_be16(&hdr[0], total - 2); /* Mode Data Length */
+ hdr[2] = 0; /* Medium Type */
+ hdr[3] = 0; /* Device-Specific Parameter */
+ hdr[4] = llbaa ? 0x1 : 0; /* Long/short LBA */
+ hdr[5] = 0; /* Reserved */
+ to_be16(&hdr[6], blen); /* Block Descriptor Length */
+ }
+
+ bdesc = &data[hlen];
+ if (blen == 16) {
+ /* Number of Blocks */
+ to_be64(&bdesc[0], num_blocks);
+ /* Reserved */
+ memset(&bdesc[8], 0, 4);
+ /* Block Length */
+ to_be32(&bdesc[12], block_size);
+ } else if (blen == 8) {
+ /* Number of Blocks */
+ if (num_blocks > 0xffffffffULL) {
+ memset(&bdesc[0], 0xff, 4);
+ } else {
+ to_be32(&bdesc[0], num_blocks);
+ }
+
+ /* Block Length */
+ to_be32(&bdesc[4], block_size);
+ }
+
+ return total;
+}
+
+static void
+bdev_scsi_task_complete_cmd(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct spdk_scsi_task *task = cb_arg;
+ int sc, sk, asc, ascq;
+
+ spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq);
+
+ spdk_bdev_free_io(bdev_io);
+
+ spdk_scsi_task_set_status(task, sc, sk, asc, ascq);
+ scsi_lun_complete_task(task->lun, task);
+}
+
+static void
+bdev_scsi_read_task_complete_cmd(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct spdk_scsi_task *task = cb_arg;
+ int sc, sk, asc, ascq;
+
+ task->bdev_io = bdev_io;
+
+ spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq);
+
+ spdk_scsi_task_set_status(task, sc, sk, asc, ascq);
+ scsi_lun_complete_task(task->lun, task);
+}
+
+static void
+bdev_scsi_task_complete_reset(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct spdk_scsi_task *task = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (success) {
+ task->response = SPDK_SCSI_TASK_MGMT_RESP_SUCCESS;
+ }
+
+ scsi_lun_complete_reset_task(task->lun, task);
+}
+
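+/*
+ * Park the task on the bdev's io_wait queue so that cb_fn resubmits it once
+ * the bdev has free resources again (used when a submission returns -ENOMEM).
+ */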
+static void
+bdev_scsi_queue_io(struct spdk_scsi_task *task, spdk_bdev_io_wait_cb cb_fn, void *cb_arg)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_bdev *bdev = lun->bdev;
+ struct spdk_io_channel *ch = lun->io_channel;
+ int rc;
+
+ task->bdev_io_wait.bdev = bdev;
+ task->bdev_io_wait.cb_fn = cb_fn;
+ task->bdev_io_wait.cb_arg = cb_arg;
+
+ rc = spdk_bdev_queue_io_wait(bdev, ch, &task->bdev_io_wait);
+ if (rc != 0) {
+ assert(false);
+ }
+}
+
+static int
+bdev_scsi_sync(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc,
+ struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task,
+ uint64_t lba, uint32_t num_blocks)
+{
+ uint64_t bdev_num_blocks;
+ int rc;
+
+ if (num_blocks == 0) {
+ return SPDK_SCSI_TASK_COMPLETE;
+ }
+
+ bdev_num_blocks = spdk_bdev_get_num_blocks(bdev);
+
+ if (lba >= bdev_num_blocks || num_blocks > bdev_num_blocks ||
+ lba > (bdev_num_blocks - num_blocks)) {
+ SPDK_ERRLOG("end of media\n");
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return SPDK_SCSI_TASK_COMPLETE;
+ }
+
+ rc = spdk_bdev_flush_blocks(bdev_desc, bdev_ch, lba, num_blocks,
+ bdev_scsi_task_complete_cmd, task);
+
+ if (rc) {
+ if (rc == -ENOMEM) {
+ bdev_scsi_queue_io(task, bdev_scsi_process_block_resubmit, task);
+ return SPDK_SCSI_TASK_PENDING;
+ }
+ SPDK_ERRLOG("spdk_bdev_flush_blocks() failed\n");
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return SPDK_SCSI_TASK_COMPLETE;
+ }
+ task->data_transferred = 0;
+ return SPDK_SCSI_TASK_PENDING;
+}
+
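+/*
+ * Convert a byte offset and length into block units. The return value is
+ * nonzero if either value is not a multiple of the block size.
+ */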
+static uint64_t
+_bytes_to_blocks(uint32_t block_size, uint64_t offset_bytes, uint64_t *offset_blocks,
+ uint64_t num_bytes, uint64_t *num_blocks)
+{
+ uint8_t shift_cnt;
+
+ /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */
+ if (spdk_likely(spdk_u32_is_pow2(block_size))) {
+ shift_cnt = spdk_u32log2(block_size);
+ *offset_blocks = offset_bytes >> shift_cnt;
+ *num_blocks = num_bytes >> shift_cnt;
+ return (offset_bytes - (*offset_blocks << shift_cnt)) |
+ (num_bytes - (*num_blocks << shift_cnt));
+ } else {
+ *offset_blocks = offset_bytes / block_size;
+ *num_blocks = num_bytes / block_size;
+ return (offset_bytes % block_size) | (num_bytes % block_size);
+ }
+}
+
+static int
+bdev_scsi_readwrite(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc,
+ struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task,
+ uint64_t lba, uint32_t xfer_len, bool is_read)
+{
+ uint64_t bdev_num_blocks, offset_blocks, num_blocks;
+ uint32_t max_xfer_len, block_size;
+ int sk = SPDK_SCSI_SENSE_NO_SENSE, asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ int rc;
+
+ task->data_transferred = 0;
+
+ if (spdk_unlikely(task->dxfer_dir != SPDK_SCSI_DIR_NONE &&
+ task->dxfer_dir != (is_read ? SPDK_SCSI_DIR_FROM_DEV : SPDK_SCSI_DIR_TO_DEV))) {
+ SPDK_ERRLOG("Incorrect data direction\n");
+ goto check_condition;
+ }
+
+ bdev_num_blocks = spdk_bdev_get_num_blocks(bdev);
+ if (spdk_unlikely(bdev_num_blocks <= lba || bdev_num_blocks - lba < xfer_len)) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "end of media\n");
+ sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE;
+ goto check_condition;
+ }
+
+ if (spdk_unlikely(xfer_len == 0)) {
+ task->status = SPDK_SCSI_STATUS_GOOD;
+ return SPDK_SCSI_TASK_COMPLETE;
+ }
+
+ block_size = spdk_bdev_get_data_block_size(bdev);
+
+ /* Transfer Length is limited to the Block Limits VPD page Maximum Transfer Length */
+ max_xfer_len = SPDK_WORK_BLOCK_SIZE / block_size;
+ if (spdk_unlikely(xfer_len > max_xfer_len)) {
+ SPDK_ERRLOG("xfer_len %" PRIu32 " > maximum transfer length %" PRIu32 "\n",
+ xfer_len, max_xfer_len);
+ sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB;
+ goto check_condition;
+ }
+
+ if (!is_read) {
+ /* Additional check for Transfer Length */
+ if (xfer_len * block_size > task->transfer_len) {
+ SPDK_ERRLOG("xfer_len %" PRIu32 " * block_size %" PRIu32 " > transfer_len %u\n",
+ xfer_len, block_size, task->transfer_len);
+ goto check_condition;
+ }
+ }
+
+ if (_bytes_to_blocks(block_size, task->offset, &offset_blocks, task->length, &num_blocks) != 0) {
+ SPDK_ERRLOG("task's offset %" PRIu64 " or length %" PRIu32 " is not block multiple\n",
+ task->offset, task->length);
+ goto check_condition;
+ }
+
+ offset_blocks += lba;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI,
+ "%s: lba=%"PRIu64", len=%"PRIu64"\n",
+ is_read ? "Read" : "Write", offset_blocks, num_blocks);
+
+ if (is_read) {
+ rc = spdk_bdev_readv_blocks(bdev_desc, bdev_ch, task->iovs, task->iovcnt,
+ offset_blocks, num_blocks,
+ bdev_scsi_read_task_complete_cmd, task);
+ } else {
+ rc = spdk_bdev_writev_blocks(bdev_desc, bdev_ch, task->iovs, task->iovcnt,
+ offset_blocks, num_blocks,
+ bdev_scsi_task_complete_cmd, task);
+ }
+
+ if (rc) {
+ if (rc == -ENOMEM) {
+ bdev_scsi_queue_io(task, bdev_scsi_process_block_resubmit, task);
+ return SPDK_SCSI_TASK_PENDING;
+ }
+ SPDK_ERRLOG("spdk_bdev_%s_blocks() failed\n", is_read ? "readv" : "writev");
+ goto check_condition;
+ }
+
+ task->data_transferred = task->length;
+ return SPDK_SCSI_TASK_PENDING;
+
+check_condition:
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, sk, asc,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return SPDK_SCSI_TASK_COMPLETE;
+}
+
+struct spdk_bdev_scsi_unmap_ctx {
+ struct spdk_scsi_task *task;
+ struct spdk_scsi_unmap_bdesc desc[DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT];
+ uint32_t count;
+};
+
+static int bdev_scsi_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc,
+ struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task,
+ struct spdk_bdev_scsi_unmap_ctx *ctx);
+
+static void
+bdev_scsi_task_complete_unmap_cmd(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct spdk_bdev_scsi_unmap_ctx *ctx = cb_arg;
+ struct spdk_scsi_task *task = ctx->task;
+ int sc, sk, asc, ascq;
+
+ ctx->count--;
+
+ task->bdev_io = bdev_io;
+
+ if (task->status == SPDK_SCSI_STATUS_GOOD) {
+ spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq);
+ spdk_scsi_task_set_status(task, sc, sk, asc, ascq);
+ }
+
+ if (ctx->count == 0) {
+ scsi_lun_complete_task(task->lun, task);
+ free(ctx);
+ }
+}
+
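+/*
+ * Copy the UNMAP block descriptors from the parameter list into ctx->desc and
+ * return the descriptor count, or -EINVAL if the parameter list is malformed.
+ */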
+static int
+__copy_desc(struct spdk_bdev_scsi_unmap_ctx *ctx, uint8_t *data, size_t data_len)
+{
+ uint16_t desc_data_len;
+ uint16_t desc_count;
+
+ if (!data) {
+ return -EINVAL;
+ }
+
+ if (data_len < 8) {
+ /* We can't even get the reported length, so fail. */
+ return -EINVAL;
+ }
+
+ desc_data_len = from_be16(&data[2]);
+ desc_count = desc_data_len / 16;
+
+ if (desc_data_len > (data_len - 8)) {
+ SPDK_ERRLOG("Error - desc_data_len (%u) > data_len (%lu) - 8\n",
+ desc_data_len, data_len);
+ return -EINVAL;
+ }
+
+ if (desc_count > DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT) {
+ SPDK_ERRLOG("desc_count (%u) greater than max allowed (%u)\n",
+ desc_count, DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT);
+ return -EINVAL;
+ }
+
+ memcpy(ctx->desc, &data[8], desc_data_len);
+ return desc_count;
+}
+
+static void
+bdev_scsi_unmap_resubmit(void *arg)
+{
+ struct spdk_bdev_scsi_unmap_ctx *ctx = arg;
+ struct spdk_scsi_task *task = ctx->task;
+ struct spdk_scsi_lun *lun = task->lun;
+
+ bdev_scsi_unmap(lun->bdev, lun->bdev_desc, lun->io_channel, task, ctx);
+}
+
+static int
+bdev_scsi_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc,
+ struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task,
+ struct spdk_bdev_scsi_unmap_ctx *ctx)
+{
+ uint8_t *data;
+ int i, desc_count = -1;
+ int data_len;
+ int rc;
+
+ assert(task->status == SPDK_SCSI_STATUS_GOOD);
+
+ if (ctx == NULL) {
+ ctx = calloc(1, sizeof(*ctx));
+ if (!ctx) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return SPDK_SCSI_TASK_COMPLETE;
+ }
+
+ ctx->task = task;
+ ctx->count = 0;
+ }
+
+ if (task->iovcnt == 1) {
+ data = (uint8_t *)task->iovs[0].iov_base;
+ data_len = task->iovs[0].iov_len;
+ desc_count = __copy_desc(ctx, data, data_len);
+ } else {
+ data = spdk_scsi_task_gather_data(task, &data_len);
+ if (data) {
+ desc_count = __copy_desc(ctx, data, data_len);
+ free(data);
+ }
+ }
+
+ if (desc_count < 0) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ free(ctx);
+ return SPDK_SCSI_TASK_COMPLETE;
+ }
+
+ for (i = ctx->count; i < desc_count; i++) {
+ struct spdk_scsi_unmap_bdesc *desc;
+ uint64_t offset_blocks;
+ uint64_t num_blocks;
+
+ desc = &ctx->desc[i];
+
+ offset_blocks = from_be64(&desc->lba);
+ num_blocks = from_be32(&desc->block_count);
+
+ if (num_blocks == 0) {
+ continue;
+ }
+
+ ctx->count++;
+ rc = spdk_bdev_unmap_blocks(bdev_desc, bdev_ch, offset_blocks, num_blocks,
+ bdev_scsi_task_complete_unmap_cmd, ctx);
+
+ if (rc) {
+ if (rc == -ENOMEM) {
+ bdev_scsi_queue_io(task, bdev_scsi_unmap_resubmit, ctx);
+ /* Unmap was not yet submitted to bdev */
+ ctx->count--;
+ return SPDK_SCSI_TASK_PENDING;
+ }
+ SPDK_ERRLOG("SCSI Unmapping failed\n");
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ ctx->count--;
+ /* We can't complete here - we may have to wait for previously
+ * submitted unmaps to complete */
+ break;
+ }
+ }
+
+ if (ctx->count == 0) {
+ free(ctx);
+ return SPDK_SCSI_TASK_COMPLETE;
+ }
+
+ return SPDK_SCSI_TASK_PENDING;
+}
+
+static int
+bdev_scsi_process_block(struct spdk_scsi_task *task)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_bdev *bdev = lun->bdev;
+ uint64_t lba;
+ uint32_t xfer_len;
+ uint32_t len = 0;
+ uint8_t *cdb = task->cdb;
+
+ /* XXX: We need to support FUA bit for writes! */
+ switch (cdb[0]) {
+ case SPDK_SBC_READ_6:
+ case SPDK_SBC_WRITE_6:
+ lba = (uint64_t)cdb[1] << 16;
+ lba |= (uint64_t)cdb[2] << 8;
+ lba |= (uint64_t)cdb[3];
+ xfer_len = cdb[4];
+ if (xfer_len == 0) {
+ xfer_len = 256;
+ }
+ return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel,
+ task, lba, xfer_len,
+ cdb[0] == SPDK_SBC_READ_6);
+
+ case SPDK_SBC_READ_10:
+ case SPDK_SBC_WRITE_10:
+ lba = from_be32(&cdb[2]);
+ xfer_len = from_be16(&cdb[7]);
+ return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel,
+ task, lba, xfer_len,
+ cdb[0] == SPDK_SBC_READ_10);
+
+ case SPDK_SBC_READ_12:
+ case SPDK_SBC_WRITE_12:
+ lba = from_be32(&cdb[2]);
+ xfer_len = from_be32(&cdb[6]);
+ return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel,
+ task, lba, xfer_len,
+ cdb[0] == SPDK_SBC_READ_12);
+ case SPDK_SBC_READ_16:
+ case SPDK_SBC_WRITE_16:
+ lba = from_be64(&cdb[2]);
+ xfer_len = from_be32(&cdb[10]);
+ return bdev_scsi_readwrite(bdev, lun->bdev_desc, lun->io_channel,
+ task, lba, xfer_len,
+ cdb[0] == SPDK_SBC_READ_16);
+
+ case SPDK_SBC_READ_CAPACITY_10: {
+ uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev);
+ uint8_t buffer[8];
+
+ if (num_blocks - 1 > 0xffffffffULL) {
+ memset(buffer, 0xff, 4);
+ } else {
+ to_be32(buffer, num_blocks - 1);
+ }
+ to_be32(&buffer[4], spdk_bdev_get_data_block_size(bdev));
+
+ len = spdk_min(task->length, sizeof(buffer));
+ if (spdk_scsi_task_scatter_data(task, buffer, len) < 0) {
+ break;
+ }
+
+ task->data_transferred = len;
+ task->status = SPDK_SCSI_STATUS_GOOD;
+ break;
+ }
+
+ case SPDK_SPC_SERVICE_ACTION_IN_16:
+ switch (cdb[1] & 0x1f) { /* SERVICE ACTION */
+ case SPDK_SBC_SAI_READ_CAPACITY_16: {
+ uint8_t buffer[32] = {0};
+
+ to_be64(&buffer[0], spdk_bdev_get_num_blocks(bdev) - 1);
+ to_be32(&buffer[8], spdk_bdev_get_data_block_size(bdev));
+ /*
+ * Set the TPE bit to 1 to indicate thin provisioning.
+ * The position of TPE bit is the 7th bit in 14th byte
+ * in READ CAPACITY (16) parameter data.
+ */
+ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+ buffer[14] |= 1 << 7;
+ }
+
+ len = spdk_min(from_be32(&cdb[10]), sizeof(buffer));
+ if (spdk_scsi_task_scatter_data(task, buffer, len) < 0) {
+ break;
+ }
+
+ task->data_transferred = len;
+ task->status = SPDK_SCSI_STATUS_GOOD;
+ break;
+ }
+
+ default:
+ return SPDK_SCSI_TASK_UNKNOWN;
+ }
+ break;
+
+ case SPDK_SBC_SYNCHRONIZE_CACHE_10:
+ case SPDK_SBC_SYNCHRONIZE_CACHE_16:
+ if (cdb[0] == SPDK_SBC_SYNCHRONIZE_CACHE_10) {
+ lba = from_be32(&cdb[2]);
+ len = from_be16(&cdb[7]);
+ } else {
+ lba = from_be64(&cdb[2]);
+ len = from_be32(&cdb[10]);
+ }
+
+ if (len == 0) {
+ len = spdk_bdev_get_num_blocks(bdev) - lba;
+ }
+
+ return bdev_scsi_sync(bdev, lun->bdev_desc, lun->io_channel, task, lba, len);
+ break;
+
+ case SPDK_SBC_UNMAP:
+ return bdev_scsi_unmap(bdev, lun->bdev_desc, lun->io_channel, task, NULL);
+
+ default:
+ return SPDK_SCSI_TASK_UNKNOWN;
+ }
+
+ return SPDK_SCSI_TASK_COMPLETE;
+}
+
+static void
+bdev_scsi_process_block_resubmit(void *arg)
+{
+ struct spdk_scsi_task *task = arg;
+
+ bdev_scsi_process_block(task);
+}
+
+static int
+bdev_scsi_check_len(struct spdk_scsi_task *task, int len, int min_len)
+{
+ if (len >= min_len) {
+ return 0;
+ }
+
+ /* INVALID FIELD IN CDB */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+}
+
+static int
+bdev_scsi_process_primary(struct spdk_scsi_task *task)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_bdev *bdev = lun->bdev;
+ int alloc_len = -1;
+ int data_len = -1;
+ uint8_t *cdb = task->cdb;
+ uint8_t *data = NULL;
+ int rc = 0;
+ int pllen, md = 0;
+ int llba;
+ int dbd, pc, page, subpage;
+ int cmd_parsed = 0;
+
+ switch (cdb[0]) {
+ case SPDK_SPC_INQUIRY:
+ alloc_len = from_be16(&cdb[3]);
+ data_len = spdk_max(4096, alloc_len);
+ data = calloc(1, data_len);
+ assert(data != NULL);
+ rc = bdev_scsi_inquiry(bdev, task, cdb, data, data_len);
+ data_len = spdk_min(rc, data_len);
+ if (rc < 0) {
+ break;
+ }
+
+ SPDK_LOGDUMP(SPDK_LOG_SCSI, "INQUIRY", data, data_len);
+ break;
+
+ case SPDK_SPC_REPORT_LUNS: {
+ int sel;
+
+ sel = cdb[2];
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "sel=%x\n", sel);
+
+ alloc_len = from_be32(&cdb[6]);
+ rc = bdev_scsi_check_len(task, alloc_len, 16);
+ if (rc < 0) {
+ break;
+ }
+
+ data_len = spdk_max(4096, alloc_len);
+ data = calloc(1, data_len);
+ assert(data != NULL);
+ rc = bdev_scsi_report_luns(task->lun, sel, data, data_len);
+ data_len = rc;
+ if (rc < 0) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+
+ SPDK_LOGDUMP(SPDK_LOG_SCSI, "REPORT LUNS", data, data_len);
+ break;
+ }
+
+ case SPDK_SPC_MODE_SELECT_6:
+ case SPDK_SPC_MODE_SELECT_10:
+ if (cdb[0] == SPDK_SPC_MODE_SELECT_6) {
+ /* MODE_SELECT(6) must have at least a 4 byte header. */
+ md = 4;
+ pllen = cdb[4];
+ } else {
+ /* MODE_SELECT(10) must have at least an 8 byte header. */
+ md = 8;
+ pllen = from_be16(&cdb[7]);
+ }
+
+ if (pllen == 0) {
+ break;
+ }
+
+ rc = bdev_scsi_check_len(task, pllen, md);
+ if (rc < 0) {
+ break;
+ }
+
+ data = spdk_scsi_task_gather_data(task, &rc);
+ if (rc < 0) {
+ break;
+ }
+ data_len = rc;
+
+ rc = bdev_scsi_check_len(task, data_len, spdk_max(pllen, md));
+ if (rc < 0) {
+ break;
+ }
+
+ rc = pllen;
+ data_len = 0;
+ break;
+
+ case SPDK_SPC_MODE_SENSE_6:
+ alloc_len = cdb[4];
+ md = 6;
+ /* FALLTHROUGH */
+ case SPDK_SPC_MODE_SENSE_10:
+ llba = 0;
+
+ if (md == 0) {
+ alloc_len = from_be16(&cdb[7]);
+ llba = !!(cdb[1] & 0x10);
+ md = 10;
+ }
+
+ dbd = !!(cdb[1] & 0x8);
+ pc = (cdb[2] & 0xc0) >> 6;
+ page = cdb[2] & 0x3f;
+ subpage = cdb[3];
+
+ /* First call with no buffer to discover needed buffer size */
+ rc = bdev_scsi_mode_sense(bdev, md,
+ cdb, dbd, llba, pc,
+ page, subpage,
+ NULL, task);
+ if (rc < 0) {
+ break;
+ }
+
+ data_len = rc;
+ data = calloc(1, data_len);
+ assert(data != NULL);
+
+ /* Second call with the allocated buffer to fetch the mode pages */
+ rc = bdev_scsi_mode_sense(bdev, md,
+ cdb, dbd, llba, pc,
+ page, subpage,
+ data, task);
+ if (rc < 0) {
+ /* INVALID FIELD IN CDB */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+ break;
+
+ case SPDK_SPC_REQUEST_SENSE: {
+ int desc;
+ int sk, asc, ascq;
+
+ desc = cdb[1] & 0x1;
+ if (desc != 0) {
+ /* INVALID FIELD IN CDB */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+
+ alloc_len = cdb[4];
+
+ /* NO ADDITIONAL SENSE INFORMATION */
+ sk = SPDK_SCSI_SENSE_NO_SENSE;
+ asc = 0x00;
+ ascq = 0x00;
+
+ spdk_scsi_task_build_sense_data(task, sk, asc, ascq);
+
+ data_len = task->sense_data_len;
+ data = calloc(1, data_len);
+ assert(data != NULL);
+ memcpy(data, task->sense_data, data_len);
+ break;
+ }
+
+ case SPDK_SPC_LOG_SELECT:
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "LOG_SELECT\n");
+ cmd_parsed = 1;
+ /* FALLTHROUGH */
+ case SPDK_SPC_LOG_SENSE:
+ if (!cmd_parsed) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "LOG_SENSE\n");
+ }
+
+ /* INVALID COMMAND OPERATION CODE */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ rc = -1;
+ break;
+
+ case SPDK_SPC_TEST_UNIT_READY:
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "TEST_UNIT_READY\n");
+ cmd_parsed = 1;
+ /* FALLTHROUGH */
+ case SPDK_SBC_START_STOP_UNIT:
+ if (!cmd_parsed) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "START_STOP_UNIT\n");
+ }
+
+ rc = 0;
+ break;
+
+ case SPDK_SPC_PERSISTENT_RESERVE_OUT:
+ pllen = from_be32(&cdb[5]);
+ rc = bdev_scsi_check_len(task, pllen, 24);
+ if (rc < 0) {
+ break;
+ }
+
+ data = spdk_scsi_task_gather_data(task, &rc);
+ if (rc < 0) {
+ break;
+ }
+ data_len = rc;
+ if (data_len < 24) {
+ rc = -1;
+ break;
+ }
+
+ rc = scsi_pr_out(task, cdb, data, data_len);
+ if (rc < 0) {
+ break;
+ }
+ rc = pllen;
+ data_len = 0;
+ break;
+
+ case SPDK_SPC_PERSISTENT_RESERVE_IN:
+ alloc_len = from_be16(&cdb[7]);
+ data_len = alloc_len;
+ data = calloc(1, data_len);
+ assert(data != NULL);
+ rc = scsi_pr_in(task, cdb, data, data_len);
+ break;
+
+ case SPDK_SPC2_RESERVE_6:
+ case SPDK_SPC2_RESERVE_10:
+ rc = scsi2_reserve(task, cdb);
+ if (rc == 0) {
+ if (cdb[0] == SPDK_SPC2_RESERVE_10) {
+ rc = from_be16(&cdb[7]);
+ }
+ data_len = 0;
+ }
+ break;
+
+ case SPDK_SPC2_RELEASE_6:
+ case SPDK_SPC2_RELEASE_10:
+ rc = scsi2_release(task);
+ break;
+
+ default:
+ return SPDK_SCSI_TASK_UNKNOWN;
+ }
+
+ if (rc >= 0 && data_len > 0) {
+ assert(alloc_len >= 0);
+ spdk_scsi_task_scatter_data(task, data, spdk_min(alloc_len, data_len));
+ rc = spdk_min(data_len, alloc_len);
+ }
+
+ if (rc >= 0) {
+ task->data_transferred = rc;
+ task->status = SPDK_SCSI_STATUS_GOOD;
+ }
+
+ if (data) {
+ free(data);
+ }
+
+ return SPDK_SCSI_TASK_COMPLETE;
+}
+
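+/*
+ * Entry point for SCSI command execution: try the block command set first,
+ * then fall back to the primary command set. Unknown opcodes are completed
+ * with ILLEGAL REQUEST / INVALID COMMAND OPERATION CODE.
+ */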
+int
+bdev_scsi_execute(struct spdk_scsi_task *task)
+{
+ int rc;
+
+ if ((rc = bdev_scsi_process_block(task)) == SPDK_SCSI_TASK_UNKNOWN) {
+ if ((rc = bdev_scsi_process_primary(task)) == SPDK_SCSI_TASK_UNKNOWN) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "unsupported SCSI OP=0x%x\n", task->cdb[0]);
+ /* INVALID COMMAND OPERATION CODE */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return SPDK_SCSI_TASK_COMPLETE;
+ }
+ }
+
+ return rc;
+}
+
+static void
+bdev_scsi_reset_resubmit(void *arg)
+{
+ struct spdk_scsi_task *task = arg;
+
+ bdev_scsi_reset(task);
+}
+
+void
+bdev_scsi_reset(struct spdk_scsi_task *task)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ int rc;
+
+ rc = spdk_bdev_reset(lun->bdev_desc, lun->io_channel, bdev_scsi_task_complete_reset,
+ task);
+ if (rc == -ENOMEM) {
+ bdev_scsi_queue_io(task, bdev_scsi_reset_resubmit, task);
+ }
+}
+
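+/*
+ * Build a DIF context for bdevs formatted with metadata. The Reference Tag
+ * is derived from the starting LBA in the CDB; returns false if the bdev
+ * has no metadata or the opcode is not a READ/WRITE command.
+ */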
+bool
+bdev_scsi_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_scsi_task *task,
+ struct spdk_dif_ctx *dif_ctx)
+{
+ uint32_t ref_tag = 0, dif_check_flags = 0, data_offset;
+ uint8_t *cdb;
+ int rc;
+
+ if (spdk_likely(spdk_bdev_get_md_size(bdev) == 0)) {
+ return false;
+ }
+
+ cdb = task->cdb;
+ data_offset = task->offset;
+
+ /* We use the lower 32 bits of the LBA as the Reference Tag. */
+ switch (cdb[0]) {
+ case SPDK_SBC_READ_6:
+ case SPDK_SBC_WRITE_6:
+ ref_tag = (uint32_t)cdb[1] << 16;
+ ref_tag |= (uint32_t)cdb[2] << 8;
+ ref_tag |= (uint32_t)cdb[3];
+ break;
+ case SPDK_SBC_READ_10:
+ case SPDK_SBC_WRITE_10:
+ case SPDK_SBC_READ_12:
+ case SPDK_SBC_WRITE_12:
+ ref_tag = from_be32(&cdb[2]);
+ break;
+ case SPDK_SBC_READ_16:
+ case SPDK_SBC_WRITE_16:
+ ref_tag = (uint32_t)from_be64(&cdb[2]);
+ break;
+ default:
+ return false;
+ }
+
+ if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_REFTAG)) {
+ dif_check_flags |= SPDK_DIF_FLAGS_REFTAG_CHECK;
+ }
+
+ if (spdk_bdev_is_dif_check_enabled(bdev, SPDK_DIF_CHECK_TYPE_GUARD)) {
+ dif_check_flags |= SPDK_DIF_FLAGS_GUARD_CHECK;
+ }
+
+ rc = spdk_dif_ctx_init(dif_ctx,
+ spdk_bdev_get_block_size(bdev),
+ spdk_bdev_get_md_size(bdev),
+ spdk_bdev_is_md_interleaved(bdev),
+ spdk_bdev_is_dif_head_of_md(bdev),
+ spdk_bdev_get_dif_type(bdev),
+ dif_check_flags,
+ ref_tag, 0, 0, data_offset, 0);
+
+ return rc == 0;
+}
diff --git a/src/spdk/lib/scsi/scsi_internal.h b/src/spdk/lib/scsi/scsi_internal.h
new file mode 100644
index 000000000..2da3a99a8
--- /dev/null
+++ b/src/spdk/lib/scsi/scsi_internal.h
@@ -0,0 +1,214 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_SCSI_INTERNAL_H
+#define SPDK_SCSI_INTERNAL_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bdev.h"
+#include "spdk/scsi.h"
+#include "spdk/scsi_spec.h"
+#include "spdk/trace.h"
+#include "spdk/dif.h"
+
+#include "spdk_internal/log.h"
+
+enum {
+ SPDK_SCSI_TASK_UNKNOWN = -1,
+ SPDK_SCSI_TASK_COMPLETE,
+ SPDK_SCSI_TASK_PENDING,
+};
+
+struct spdk_scsi_port {
+ uint8_t is_used;
+ uint64_t id;
+ uint16_t index;
+ uint16_t transport_id_len;
+ char transport_id[SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH];
+ char name[SPDK_SCSI_PORT_MAX_NAME_LENGTH];
+};
+
+/* Registrant associated with an I_T nexus */
+struct spdk_scsi_pr_registrant {
+ uint64_t rkey;
+ uint16_t relative_target_port_id;
+ uint16_t transport_id_len;
+ char transport_id[SPDK_SCSI_MAX_TRANSPORT_ID_LENGTH];
+ char initiator_port_name[SPDK_SCSI_PORT_MAX_NAME_LENGTH];
+ char target_port_name[SPDK_SCSI_PORT_MAX_NAME_LENGTH];
+ struct spdk_scsi_port *initiator_port;
+ struct spdk_scsi_port *target_port;
+ TAILQ_ENTRY(spdk_scsi_pr_registrant) link;
+};
+
+#define SCSI_SPC2_RESERVE 0x00000001U
+
+/* Reservation with LU_SCOPE */
+struct spdk_scsi_pr_reservation {
+ uint32_t flags;
+ struct spdk_scsi_pr_registrant *holder;
+ enum spdk_scsi_pr_type_code rtype;
+ uint64_t crkey;
+};
+
+struct spdk_scsi_dev {
+ int id;
+ int is_allocated;
+ bool removed;
+ spdk_scsi_dev_destruct_cb_t remove_cb;
+ void *remove_ctx;
+
+ char name[SPDK_SCSI_DEV_MAX_NAME + 1];
+
+ struct spdk_scsi_lun *lun[SPDK_SCSI_DEV_MAX_LUN];
+
+ int num_ports;
+ struct spdk_scsi_port port[SPDK_SCSI_DEV_MAX_PORTS];
+
+ uint8_t protocol_id;
+};
+
+struct spdk_scsi_lun_desc {
+ struct spdk_scsi_lun *lun;
+ spdk_scsi_lun_remove_cb_t hotremove_cb;
+ void *hotremove_ctx;
+ TAILQ_ENTRY(spdk_scsi_lun_desc) link;
+};
+
+struct spdk_scsi_lun {
+ /** LUN id for this logical unit. */
+ int id;
+
+ /** Pointer to the SCSI device containing this LUN. */
+ struct spdk_scsi_dev *dev;
+
+ /** The bdev associated with this LUN. */
+ struct spdk_bdev *bdev;
+
+ /** Descriptor for opened block device. */
+ struct spdk_bdev_desc *bdev_desc;
+
+ /** The thread which opens this LUN. */
+ struct spdk_thread *thread;
+
+ /** I/O channel for the bdev associated with this LUN. */
+ struct spdk_io_channel *io_channel;
+
+ /** Reference count for this LUN, used to free the io_channel at the right time. */
+ uint32_t ref;
+
+ /** Poller to release the resources of the LUN when it is hot removed */
+ struct spdk_poller *hotremove_poller;
+
+ /** The LUN is removed */
+ bool removed;
+
+ /** Callback to be fired when LUN removal is first triggered. */
+ void (*hotremove_cb)(const struct spdk_scsi_lun *lun, void *arg);
+
+ /** Argument for hotremove_cb */
+ void *hotremove_ctx;
+
+ /** Registrant head for I_T nexus */
+ TAILQ_HEAD(, spdk_scsi_pr_registrant) reg_head;
+ /** Persistent Reservation Generation */
+ uint32_t pr_generation;
+ /** Reservation for the LUN */
+ struct spdk_scsi_pr_reservation reservation;
+ /** Reservation holder for SPC2 RESERVE(6) and RESERVE(10) */
+ struct spdk_scsi_pr_registrant scsi2_holder;
+
+ /** List of open descriptors for this LUN. */
+ TAILQ_HEAD(, spdk_scsi_lun_desc) open_descs;
+
+ /** submitted tasks */
+ TAILQ_HEAD(tasks, spdk_scsi_task) tasks;
+
+ /** pending tasks */
+ TAILQ_HEAD(pending_tasks, spdk_scsi_task) pending_tasks;
+
+ /** submitted management tasks */
+ TAILQ_HEAD(mgmt_tasks, spdk_scsi_task) mgmt_tasks;
+
+ /** pending management tasks */
+ TAILQ_HEAD(pending_mgmt_tasks, spdk_scsi_task) pending_mgmt_tasks;
+
+ /** poller to check completion of tasks prior to reset */
+ struct spdk_poller *reset_poller;
+};
+
+struct spdk_scsi_lun *scsi_lun_construct(struct spdk_bdev *bdev,
+ void (*hotremove_cb)(const struct spdk_scsi_lun *, void *),
+ void *hotremove_ctx);
+void scsi_lun_destruct(struct spdk_scsi_lun *lun);
+
+void scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task);
+void scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task);
+bool scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun,
+ const struct spdk_scsi_port *initiator_port);
+void scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task);
+void scsi_lun_complete_reset_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task);
+bool scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun,
+ const struct spdk_scsi_port *initiator_port);
+int scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun);
+void scsi_lun_free_io_channel(struct spdk_scsi_lun *lun);
+
+struct spdk_scsi_dev *scsi_dev_get_list(void);
+
+int scsi_port_construct(struct spdk_scsi_port *port, uint64_t id,
+ uint16_t index, const char *name);
+void scsi_port_destruct(struct spdk_scsi_port *port);
+
+int bdev_scsi_execute(struct spdk_scsi_task *task);
+void bdev_scsi_reset(struct spdk_scsi_task *task);
+
+bool bdev_scsi_get_dif_ctx(struct spdk_bdev *bdev, struct spdk_scsi_task *task,
+ struct spdk_dif_ctx *dif_ctx);
+
+int scsi_pr_out(struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16_t data_len);
+int scsi_pr_in(struct spdk_scsi_task *task, uint8_t *cdb, uint8_t *data, uint16_t data_len);
+int scsi_pr_check(struct spdk_scsi_task *task);
+
+int scsi2_reserve(struct spdk_scsi_task *task, uint8_t *cdb);
+int scsi2_release(struct spdk_scsi_task *task);
+int scsi2_reserve_check(struct spdk_scsi_task *task);
+
+struct spdk_scsi_globals {
+ pthread_mutex_t mutex;
+};
+
+extern struct spdk_scsi_globals g_scsi;
+
+#endif /* SPDK_SCSI_INTERNAL_H */
diff --git a/src/spdk/lib/scsi/scsi_pr.c b/src/spdk/lib/scsi/scsi_pr.c
new file mode 100644
index 000000000..4e17cc2c6
--- /dev/null
+++ b/src/spdk/lib/scsi/scsi_pr.c
@@ -0,0 +1,1067 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "scsi_internal.h"
+
+#include "spdk/endian.h"
+
+/* Get registrant by I_T nexus */
+static struct spdk_scsi_pr_registrant *
+scsi_pr_get_registrant(struct spdk_scsi_lun *lun,
+ struct spdk_scsi_port *initiator_port,
+ struct spdk_scsi_port *target_port)
+{
+ struct spdk_scsi_pr_registrant *reg, *tmp;
+
+ TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) {
+ if (initiator_port == reg->initiator_port &&
+ target_port == reg->target_port) {
+ return reg;
+ }
+ }
+
+ return NULL;
+}
+
+static bool
+scsi2_it_nexus_is_holder(struct spdk_scsi_lun *lun,
+ struct spdk_scsi_port *initiator_port,
+ struct spdk_scsi_port *target_port)
+{
+ struct spdk_scsi_pr_registrant *reg = lun->reservation.holder;
+
+ assert(reg != NULL);
+
+ if ((reg->initiator_port == initiator_port) &&
+ (reg->target_port == target_port)) {
+ return true;
+ }
+
+ return false;
+}
+
+/* Check whether the reservation type is an all-registrants type */
+static inline bool
+scsi_pr_is_all_registrants_type(struct spdk_scsi_lun *lun)
+{
+ return (lun->reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+ lun->reservation.rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS);
+}
+
+/* Check whether the registrant is the reservation holder */
+static inline bool
+scsi_pr_registrant_is_holder(struct spdk_scsi_lun *lun,
+ struct spdk_scsi_pr_registrant *reg)
+{
+ if (scsi_pr_is_all_registrants_type(lun)) {
+ return true;
+ }
+
+ return (lun->reservation.holder == reg);
+}
+
+/* Check whether the LUN currently holds a reservation */
+static inline bool
+scsi_pr_has_reservation(struct spdk_scsi_lun *lun)
+{
+ return lun->reservation.holder != NULL;
+}
+
+static int
+scsi_pr_register_registrant(struct spdk_scsi_lun *lun,
+ struct spdk_scsi_port *initiator_port,
+ struct spdk_scsi_port *target_port,
+ uint64_t sa_rkey)
+{
+ struct spdk_scsi_pr_registrant *reg;
+
+ /* Register sa_rkey with the I_T nexus */
+ reg = calloc(1, sizeof(*reg));
+ if (!reg) {
+ return -ENOMEM;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: new registrant registered "
+ "with key 0x%"PRIx64"\n", sa_rkey);
+
+ /* New I_T nexus */
+ reg->initiator_port = initiator_port;
+ if (initiator_port) {
+ snprintf(reg->initiator_port_name, sizeof(reg->initiator_port_name), "%s",
+ initiator_port->name);
+ reg->transport_id_len = initiator_port->transport_id_len;
+ memcpy(reg->transport_id, initiator_port->transport_id, reg->transport_id_len);
+ }
+ reg->target_port = target_port;
+ if (target_port) {
+ snprintf(reg->target_port_name, sizeof(reg->target_port_name), "%s",
+ target_port->name);
+ reg->relative_target_port_id = target_port->index;
+ }
+ reg->rkey = sa_rkey;
+ TAILQ_INSERT_TAIL(&lun->reg_head, reg, link);
+ lun->pr_generation++;
+
+ return 0;
+}
+
+static void
+scsi_pr_release_reservation(struct spdk_scsi_lun *lun, struct spdk_scsi_pr_registrant *reg)
+{
+ bool all_regs = false;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: release reservation "
+ "with type %u\n", lun->reservation.rtype);
+
+ /* TODO: Unit Attention */
+ all_regs = scsi_pr_is_all_registrants_type(lun);
+ if (all_regs && !TAILQ_EMPTY(&lun->reg_head)) {
+ lun->reservation.holder = TAILQ_FIRST(&lun->reg_head);
+ return;
+ }
+
+ memset(&lun->reservation, 0, sizeof(struct spdk_scsi_pr_reservation));
+}
+
+static void
+scsi_pr_reserve_reservation(struct spdk_scsi_lun *lun,
+ enum spdk_scsi_pr_type_code type,
+ uint64_t rkey,
+ struct spdk_scsi_pr_registrant *holder)
+{
+ lun->reservation.rtype = type;
+ lun->reservation.crkey = rkey;
+ lun->reservation.holder = holder;
+}
+
+static void
+scsi_pr_unregister_registrant(struct spdk_scsi_lun *lun,
+ struct spdk_scsi_pr_registrant *reg)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: unregister registrant\n");
+
+ TAILQ_REMOVE(&lun->reg_head, reg, link);
+ if (scsi_pr_registrant_is_holder(lun, reg)) {
+ scsi_pr_release_reservation(lun, reg);
+ }
+
+ free(reg);
+ lun->pr_generation++;
+}
+
+static void
+scsi_pr_replace_registrant_key(struct spdk_scsi_lun *lun,
+ struct spdk_scsi_pr_registrant *reg,
+ uint64_t sa_rkey)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: replace with new "
+ "reservation key 0x%"PRIx64"\n", sa_rkey);
+ reg->rkey = sa_rkey;
+ lun->pr_generation++;
+}
+
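+/*
+ * PERSISTENT RESERVE OUT with the RESERVE service action: the I_T nexus
+ * must already be registered with a matching key, otherwise a
+ * RESERVATION CONFLICT is returned.
+ */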
+static int
+scsi_pr_out_reserve(struct spdk_scsi_task *task,
+ enum spdk_scsi_pr_type_code rtype, uint64_t rkey,
+ uint8_t spec_i_pt, uint8_t all_tg_pt, uint8_t aptpl)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_registrant *reg;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT RESERVE: rkey 0x%"PRIx64", requested "
+ "reservation type %u, type %u\n", rkey, rtype, lun->reservation.rtype);
+
+ /* TODO: not supported yet */
+ if (spec_i_pt || all_tg_pt || aptpl) {
+ SPDK_ERRLOG("Unsupported spec_i_pt/all_tg_pt fields "
+ "or invalid aptpl field\n");
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+ }
+
+ reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port);
+ /* No registration for the I_T nexus */
+ if (!reg) {
+ SPDK_ERRLOG("No registration\n");
+ goto conflict;
+ }
+
+ /* invalid reservation key */
+ if (reg->rkey != rkey) {
+ SPDK_ERRLOG("Reservation key 0x%"PRIx64" don't match 0x%"PRIx64"\n",
+ rkey, reg->rkey);
+ goto conflict;
+ }
+
+ /* reservation holder already exists */
+ if (scsi_pr_has_reservation(lun)) {
+ if (rtype != lun->reservation.rtype) {
+ SPDK_ERRLOG("Reservation type doesn't match\n");
+ goto conflict;
+ }
+
+ if (!scsi_pr_registrant_is_holder(lun, reg)) {
+ SPDK_ERRLOG("Only 1 holder is allowed for type %u\n", rtype);
+ goto conflict;
+ }
+ } else {
+ /* current I_T nexus is the first reservation holder */
+ scsi_pr_reserve_reservation(lun, rtype, rkey, reg);
+ }
+
+ return 0;
+
+conflict:
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+}
+
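+/*
+ * PERSISTENT RESERVE OUT with the REGISTER or REGISTER AND IGNORE EXISTING
+ * KEY service action: add, replace or remove a registrant for the I_T nexus.
+ */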
+static int
+scsi_pr_out_register(struct spdk_scsi_task *task,
+ enum spdk_scsi_pr_out_service_action_code action,
+ uint64_t rkey, uint64_t sa_rkey,
+ uint8_t spec_i_pt, uint8_t all_tg_pt, uint8_t aptpl)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_registrant *reg;
+ int sc, sk, asc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT REGISTER: rkey 0x%"PRIx64", "
+ "sa_key 0x%"PRIx64", reservation type %u\n", rkey, sa_rkey, lun->reservation.rtype);
+
+ /* TODO: not supported yet */
+ if (spec_i_pt || all_tg_pt || aptpl) {
+ SPDK_ERRLOG("Unsupported spec_i_pt/all_tg_pt/aptpl field\n");
+ sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB;
+ goto error_exit;
+ }
+
+ reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port);
+ /* an unregistered I_T nexus session */
+ if (!reg) {
+ if (rkey && (action == SPDK_SCSI_PR_OUT_REGISTER)) {
+ SPDK_ERRLOG("Reservation key field is not empty\n");
+ sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT;
+ sk = SPDK_SCSI_SENSE_NO_SENSE;
+ asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ goto error_exit;
+ }
+
+ if (!sa_rkey) {
+ /* Do nothing except return GOOD status */
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "REGISTER: service action "
+ "reservation key is zero, do noting\n");
+ return 0;
+ }
+ /* Add a new registrant for the I_T nexus */
+ return scsi_pr_register_registrant(lun, task->initiator_port,
+ task->target_port, sa_rkey);
+ } else {
+ /* a registered I_T nexus */
+ if (rkey != reg->rkey && action == SPDK_SCSI_PR_OUT_REGISTER) {
+ SPDK_ERRLOG("Reservation key 0x%"PRIx64" don't match "
+ "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey);
+ sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT;
+ sk = SPDK_SCSI_SENSE_NO_SENSE;
+ asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ goto error_exit;
+ }
+
+ if (!sa_rkey) {
+ /* unregister */
+ scsi_pr_unregister_registrant(lun, reg);
+ } else {
+ /* replace */
+ scsi_pr_replace_registrant_key(lun, reg, sa_rkey);
+ }
+ }
+
+ return 0;
+
+error_exit:
+ spdk_scsi_task_set_status(task, sc, sk, asc, SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+}
+
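+/*
+ * PERSISTENT RESERVE OUT with the RELEASE service action: release the
+ * reservation if the I_T nexus is the holder and the type and key match.
+ */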
+static int
+scsi_pr_out_release(struct spdk_scsi_task *task,
+ enum spdk_scsi_pr_type_code rtype, uint64_t rkey)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_registrant *reg;
+ int sk, asc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT RELEASE: rkey 0x%"PRIx64", "
+ "reservation type %u\n", rkey, rtype);
+
+ reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port);
+ if (!reg) {
+ SPDK_ERRLOG("No registration\n");
+ sk = SPDK_SCSI_SENSE_NOT_READY;
+ asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ goto check_condition;
+ }
+
+ /* no reservation holder */
+ if (!scsi_pr_has_reservation(lun)) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "RELEASE: no reservation holder\n");
+ return 0;
+ }
+
+ if (lun->reservation.rtype != rtype || rkey != lun->reservation.crkey) {
+ sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST;
+ asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB;
+ goto check_condition;
+ }
+
+ /* I_T nexus is not a persistent reservation holder */
+ if (!scsi_pr_registrant_is_holder(lun, reg)) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "RELEASE: current I_T nexus is not holder\n");
+ return 0;
+ }
+
+ scsi_pr_release_reservation(lun, reg);
+
+ return 0;
+
+check_condition:
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, sk, asc,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+}
+
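+/*
+ * PERSISTENT RESERVE OUT with the CLEAR service action: remove all
+ * registrants and any reservation, provided the requester is registered
+ * with a matching key.
+ */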
+static int
+scsi_pr_out_clear(struct spdk_scsi_task *task, uint64_t rkey)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_registrant *reg, *tmp;
+ int sc, sk, asc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT CLEAR: rkey 0x%"PRIx64"\n", rkey);
+
+ reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port);
+ if (!reg) {
+ SPDK_ERRLOG("No registration\n");
+ sc = SPDK_SCSI_STATUS_CHECK_CONDITION;
+ sk = SPDK_SCSI_SENSE_NOT_READY;
+ asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ goto error_exit;
+ }
+
+ if (rkey != reg->rkey) {
+ SPDK_ERRLOG("Reservation key 0x%"PRIx64" doesn't match "
+ "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey);
+ sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT;
+ sk = SPDK_SCSI_SENSE_NO_SENSE;
+ asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE;
+ goto error_exit;
+ }
+
+ TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) {
+ scsi_pr_unregister_registrant(lun, reg);
+ }
+
+ return 0;
+
+error_exit:
+ spdk_scsi_task_set_status(task, sc, sk, asc, SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+}
+
+static void
+scsi_pr_remove_all_regs_by_key(struct spdk_scsi_lun *lun, uint64_t sa_rkey)
+{
+ struct spdk_scsi_pr_registrant *reg, *tmp;
+
+ TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) {
+ if (reg->rkey == sa_rkey) {
+ scsi_pr_unregister_registrant(lun, reg);
+ }
+ }
+}
+
+static void
+scsi_pr_remove_all_other_regs(struct spdk_scsi_lun *lun, struct spdk_scsi_pr_registrant *reg)
+{
+ struct spdk_scsi_pr_registrant *reg_tmp, *reg_tmp2;
+
+ TAILQ_FOREACH_SAFE(reg_tmp, &lun->reg_head, link, reg_tmp2) {
+ if (reg_tmp != reg) {
+ scsi_pr_unregister_registrant(lun, reg_tmp);
+ }
+ }
+}
+
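+/*
+ * PERSISTENT RESERVE OUT with the PREEMPT service action: remove the
+ * registrants identified by sa_rkey and, when applicable, take over the
+ * reservation for the requesting I_T nexus.
+ */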
+static int
+scsi_pr_out_preempt(struct spdk_scsi_task *task,
+ enum spdk_scsi_pr_out_service_action_code action,
+ enum spdk_scsi_pr_type_code rtype,
+ uint64_t rkey, uint64_t sa_rkey)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_registrant *reg;
+ bool all_regs = false;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR OUT PREEMPT: rkey 0x%"PRIx64", sa_rkey 0x%"PRIx64" "
+ "action %u, type %u, reservation type %u\n",
+ rkey, sa_rkey, action, rtype, lun->reservation.rtype);
+
+ /* I_T nexus is not registered */
+ reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port);
+ if (!reg) {
+ SPDK_ERRLOG("No registration\n");
+ goto conflict;
+ }
+ if (rkey != reg->rkey) {
+ SPDK_ERRLOG("Reservation key 0x%"PRIx64" doesn't match "
+ "registrant's key 0x%"PRIx64"\n", rkey, reg->rkey);
+ goto conflict;
+ }
+
+ /* no persistent reservation */
+ if (!scsi_pr_has_reservation(lun)) {
+ scsi_pr_remove_all_regs_by_key(lun, sa_rkey);
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: no persistent reservation\n");
+ goto exit;
+ }
+
+ all_regs = scsi_pr_is_all_registrants_type(lun);
+
+ if (all_regs) {
+ if (sa_rkey != 0) {
+ scsi_pr_remove_all_regs_by_key(lun, sa_rkey);
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: All registrants type with sa_rkey\n");
+ } else {
+ /* remove all other registrants and release persistent reservation if any */
+ scsi_pr_remove_all_other_regs(lun, reg);
+ /* create persistent reservation using new type and scope */
+ scsi_pr_reserve_reservation(lun, rtype, 0, reg);
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: All registrants type with sa_rkey zeroed\n");
+ }
+ goto exit;
+ }
+
+ assert(lun->reservation.crkey != 0);
+
+ if (sa_rkey != lun->reservation.crkey) {
+ if (!sa_rkey) {
+ SPDK_ERRLOG("Zeroed sa_rkey\n");
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+ }
+ scsi_pr_remove_all_regs_by_key(lun, sa_rkey);
+ goto exit;
+ }
+
+ if (scsi_pr_registrant_is_holder(lun, reg)) {
+ scsi_pr_reserve_reservation(lun, rtype, rkey, reg);
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PREEMPT: preempt itself with type %u\n", rtype);
+ goto exit;
+ }
+
+ /* unregister registrants if any */
+ scsi_pr_remove_all_regs_by_key(lun, sa_rkey);
+ reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port);
+ if (!reg) {
+ SPDK_ERRLOG("Current I_T nexus registrant was removed\n");
+ goto conflict;
+ }
+
+ /* preempt the holder */
+ scsi_pr_reserve_reservation(lun, rtype, rkey, reg);
+
+exit:
+ lun->pr_generation++;
+ return 0;
+
+conflict:
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+}
+
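+/* Parse and dispatch a PERSISTENT RESERVE OUT command */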
+int
+scsi_pr_out(struct spdk_scsi_task *task, uint8_t *cdb,
+ uint8_t *data, uint16_t data_len)
+{
+ int rc = -1;
+ uint64_t rkey, sa_rkey;
+ uint8_t spec_i_pt, all_tg_pt, aptpl;
+ enum spdk_scsi_pr_out_service_action_code action;
+ enum spdk_scsi_pr_scope_code scope;
+ enum spdk_scsi_pr_type_code rtype;
+ struct spdk_scsi_pr_out_param_list *param = (struct spdk_scsi_pr_out_param_list *)data;
+
+ action = cdb[1] & 0x0f;
+ scope = (cdb[2] >> 4) & 0x0f;
+ rtype = cdb[2] & 0x0f;
+
+ rkey = from_be64(&param->rkey);
+ sa_rkey = from_be64(&param->sa_rkey);
+ aptpl = param->aptpl;
+ spec_i_pt = param->spec_i_pt;
+ all_tg_pt = param->all_tg_pt;
+
+ switch (action) {
+ case SPDK_SCSI_PR_OUT_REGISTER:
+ case SPDK_SCSI_PR_OUT_REG_AND_IGNORE_KEY:
+ rc = scsi_pr_out_register(task, action, rkey, sa_rkey,
+ spec_i_pt, all_tg_pt, aptpl);
+ break;
+ case SPDK_SCSI_PR_OUT_RESERVE:
+ if (scope != SPDK_SCSI_PR_LU_SCOPE) {
+ goto invalid;
+ }
+ rc = scsi_pr_out_reserve(task, rtype, rkey,
+ spec_i_pt, all_tg_pt, aptpl);
+ break;
+ case SPDK_SCSI_PR_OUT_RELEASE:
+ if (scope != SPDK_SCSI_PR_LU_SCOPE) {
+ goto invalid;
+ }
+ rc = scsi_pr_out_release(task, rtype, rkey);
+ break;
+ case SPDK_SCSI_PR_OUT_CLEAR:
+ rc = scsi_pr_out_clear(task, rkey);
+ break;
+ case SPDK_SCSI_PR_OUT_PREEMPT:
+ if (scope != SPDK_SCSI_PR_LU_SCOPE) {
+ goto invalid;
+ }
+ rc = scsi_pr_out_preempt(task, action, rtype, rkey, sa_rkey);
+ break;
+ default:
+ SPDK_ERRLOG("Invalid service action code %u\n", action);
+ goto invalid;
+ }
+
+ return rc;
+
+invalid:
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+}
+
+static int
+scsi_pr_in_read_keys(struct spdk_scsi_task *task, uint8_t *data,
+ uint16_t data_len)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_in_read_keys_data *keys;
+ struct spdk_scsi_pr_registrant *reg, *tmp;
+ uint16_t count = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ KEYS\n");
+ keys = (struct spdk_scsi_pr_in_read_keys_data *)data;
+
+ to_be32(&keys->header.pr_generation, lun->pr_generation);
+ TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) {
+ if (((count + 1) * 8 + sizeof(keys->header)) > data_len) {
+ break;
+ }
+ to_be64(&keys->rkeys[count], reg->rkey);
+ count++;
+ }
+ to_be32(&keys->header.additional_len, count * 8);
+
+ return (sizeof(keys->header) + count * 8);
+}
+
+static int
+scsi_pr_in_read_reservations(struct spdk_scsi_task *task,
+ uint8_t *data, uint16_t data_len)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_in_read_reservations_data *param;
+ bool all_regs = false;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ RESERVATIONS\n");
+ param = (struct spdk_scsi_pr_in_read_reservations_data *)(data);
+
+ to_be32(&param->header.pr_generation, lun->pr_generation);
+ if (scsi_pr_has_reservation(lun)) {
+ all_regs = scsi_pr_is_all_registrants_type(lun);
+ if (all_regs) {
+ to_be64(&param->rkey, 0);
+ } else {
+ to_be64(&param->rkey, lun->reservation.crkey);
+ }
+ to_be32(&param->header.additional_len, 16);
+ param->scope = SPDK_SCSI_PR_LU_SCOPE;
+ param->type = lun->reservation.rtype;
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "READ RESERVATIONS with valid reservation\n");
+ return sizeof(*param);
+ }
+
+ /* no reservation */
+ to_be32(&param->header.additional_len, 0);
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "READ RESERVATIONS no reservation\n");
+ return sizeof(param->header);
+}
+
+static int
+scsi_pr_in_report_capabilities(struct spdk_scsi_task *task,
+ uint8_t *data, uint16_t data_len)
+{
+ struct spdk_scsi_pr_in_report_capabilities_data *param;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN REPORT CAPABILITIES\n");
+ param = (struct spdk_scsi_pr_in_report_capabilities_data *)data;
+
+ memset(param, 0, sizeof(*param));
+ to_be16(&param->length, sizeof(*param));
+ /* Compatible reservation handling to support RESERVE/RELEASE defined in SPC-2 */
+ param->crh = 1;
+ param->tmv = 1;
+ param->wr_ex = 1;
+ param->ex_ac = 1;
+ param->wr_ex_ro = 1;
+ param->ex_ac_ro = 1;
+ param->wr_ex_ar = 1;
+ param->ex_ac_ar = 1;
+
+ return sizeof(*param);
+}
+
+static int
+scsi_pr_in_read_full_status(struct spdk_scsi_task *task,
+ uint8_t *data, uint16_t data_len)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_in_full_status_data *param;
+ struct spdk_scsi_pr_in_full_status_desc *desc;
+ struct spdk_scsi_pr_registrant *reg, *tmp;
+ bool all_regs = false;
+ uint32_t add_len = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "PR IN READ FULL STATUS\n");
+
+ all_regs = scsi_pr_is_all_registrants_type(lun);
+ param = (struct spdk_scsi_pr_in_full_status_data *)data;
+ to_be32(&param->header.pr_generation, lun->pr_generation);
+
+ TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) {
+ desc = (struct spdk_scsi_pr_in_full_status_desc *)
+ ((uint8_t *)param->desc_list + add_len);
+ if (add_len + sizeof(*desc) + sizeof(param->header) > data_len) {
+ break;
+ }
+ add_len += sizeof(*desc);
+ desc->rkey = reg->rkey;
+ if (all_regs || lun->reservation.holder == reg) {
+ desc->r_holder = true;
+ desc->type = lun->reservation.rtype;
+ } else {
+ desc->r_holder = false;
+ desc->type = 0;
+ }
+ desc->all_tg_pt = 0;
+ desc->scope = SPDK_SCSI_PR_LU_SCOPE;
+ desc->relative_target_port_id = reg->relative_target_port_id;
+ if (add_len + reg->transport_id_len + sizeof(param->header) > data_len) {
+ break;
+ }
+ add_len += reg->transport_id_len;
+ memcpy(&desc->transport_id, reg->transport_id, reg->transport_id_len);
+ to_be32(&desc->desc_len, reg->transport_id_len);
+ }
+ to_be32(&param->header.additional_len, add_len);
+
+ return (sizeof(param->header) + add_len);
+}
+
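+/* Parse and dispatch a PERSISTENT RESERVE IN command */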
+int
+scsi_pr_in(struct spdk_scsi_task *task, uint8_t *cdb,
+ uint8_t *data, uint16_t data_len)
+{
+ enum spdk_scsi_pr_in_action_code action;
+ int rc = 0;
+
+ action = cdb[1] & 0x1f;
+ if (data_len < sizeof(struct spdk_scsi_pr_in_read_header)) {
+ goto invalid;
+ }
+
+ switch (action) {
+ case SPDK_SCSI_PR_IN_READ_KEYS:
+ rc = scsi_pr_in_read_keys(task, data, data_len);
+ break;
+ case SPDK_SCSI_PR_IN_READ_RESERVATION:
+ if (data_len < sizeof(struct spdk_scsi_pr_in_read_reservations_data)) {
+ goto invalid;
+ }
+ rc = scsi_pr_in_read_reservations(task, data, data_len);
+ break;
+ case SPDK_SCSI_PR_IN_REPORT_CAPABILITIES:
+ rc = scsi_pr_in_report_capabilities(task, data, data_len);
+ break;
+ case SPDK_SCSI_PR_IN_READ_FULL_STATUS:
+ rc = scsi_pr_in_read_full_status(task, data, data_len);
+ break;
+ default:
+ goto invalid;
+ }
+
+ return rc;
+
+invalid:
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -EINVAL;
+}
+
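+/*
+ * Check an incoming command against the current persistent reservation.
+ * Returns 0 if the command may proceed, or -1 after setting
+ * RESERVATION CONFLICT status on the task.
+ */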
+int
+scsi_pr_check(struct spdk_scsi_task *task)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ uint8_t *cdb = task->cdb;
+ enum spdk_scsi_pr_type_code rtype;
+ enum spdk_scsi_pr_out_service_action_code action;
+ struct spdk_scsi_pr_registrant *reg;
+ bool dma_to_device = false;
+
+ /* no reservation holders */
+ if (!scsi_pr_has_reservation(lun)) {
+ return 0;
+ }
+
+ rtype = lun->reservation.rtype;
+ assert(rtype != 0);
+
+ reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port);
+ /* check whether the current I_T nexus holds the reservation */
+ if (scsi_pr_registrant_is_holder(lun, reg)) {
+ return 0;
+ }
+
+ /* reservation is held by other I_T nexus */
+ switch (cdb[0]) {
+ case SPDK_SPC_INQUIRY:
+ case SPDK_SPC_REPORT_LUNS:
+ case SPDK_SPC_REQUEST_SENSE:
+ case SPDK_SPC_LOG_SENSE:
+ case SPDK_SPC_TEST_UNIT_READY:
+ case SPDK_SBC_START_STOP_UNIT:
+ case SPDK_SBC_READ_CAPACITY_10:
+ case SPDK_SPC_PERSISTENT_RESERVE_IN:
+ case SPDK_SPC_SERVICE_ACTION_IN_16:
+ /* CRH enabled, processed by scsi2_reserve() */
+ case SPDK_SPC2_RESERVE_6:
+ case SPDK_SPC2_RESERVE_10:
+ /* CRH enabled, processed by scsi2_release() */
+ case SPDK_SPC2_RELEASE_6:
+ case SPDK_SPC2_RELEASE_10:
+ return 0;
+ case SPDK_SPC_MODE_SELECT_6:
+ case SPDK_SPC_MODE_SELECT_10:
+ case SPDK_SPC_MODE_SENSE_6:
+ case SPDK_SPC_MODE_SENSE_10:
+ case SPDK_SPC_LOG_SELECT:
+ /* Allowed when the I_T nexus is a registrant, even if it is not the holder */
+ if (!reg) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "CHECK: current I_T nexus "
+ "is not registered, cdb 0x%x\n", cdb[0]);
+ goto conflict;
+ }
+ return 0;
+ case SPDK_SPC_PERSISTENT_RESERVE_OUT:
+ action = cdb[1] & 0x1f;
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "CHECK: PR OUT action %u\n", action);
+ switch (action) {
+ case SPDK_SCSI_PR_OUT_RELEASE:
+ case SPDK_SCSI_PR_OUT_CLEAR:
+ case SPDK_SCSI_PR_OUT_PREEMPT:
+ case SPDK_SCSI_PR_OUT_PREEMPT_AND_ABORT:
+ if (!reg) {
+ SPDK_ERRLOG("CHECK: PR OUT action %u\n", action);
+ goto conflict;
+ }
+ return 0;
+ case SPDK_SCSI_PR_OUT_REGISTER:
+ case SPDK_SCSI_PR_OUT_REG_AND_IGNORE_KEY:
+ return 0;
+ case SPDK_SCSI_PR_OUT_REG_AND_MOVE:
+ SPDK_ERRLOG("CHECK: PR OUT action %u\n", action);
+ goto conflict;
+ default:
+ SPDK_ERRLOG("CHECK: PR OUT invalid action %u\n", action);
+ goto conflict;
+ }
+
+ /* For most SBC R/W commands */
+ default:
+ break;
+ }
+
+ switch (cdb[0]) {
+ case SPDK_SBC_READ_6:
+ case SPDK_SBC_READ_10:
+ case SPDK_SBC_READ_12:
+ case SPDK_SBC_READ_16:
+ break;
+ case SPDK_SBC_WRITE_6:
+ case SPDK_SBC_WRITE_10:
+ case SPDK_SBC_WRITE_12:
+ case SPDK_SBC_WRITE_16:
+ case SPDK_SBC_UNMAP:
+ case SPDK_SBC_SYNCHRONIZE_CACHE_10:
+ case SPDK_SBC_SYNCHRONIZE_CACHE_16:
+ dma_to_device = true;
+ break;
+ default:
+ SPDK_ERRLOG("CHECK: unsupported SCSI command cdb 0x%x\n", cdb[0]);
+ goto conflict;
+ }
+
+ switch (rtype) {
+ case SPDK_SCSI_PR_WRITE_EXCLUSIVE:
+ if (dma_to_device) {
+ SPDK_ERRLOG("CHECK: Write Exclusive reservation type "
+ "rejects command 0x%x\n", cdb[0]);
+ goto conflict;
+ }
+ break;
+ case SPDK_SCSI_PR_EXCLUSIVE_ACCESS:
+ SPDK_ERRLOG("CHECK: Exclusive Access reservation type "
+ "rejects command 0x%x\n", cdb[0]);
+ goto conflict;
+ case SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY:
+ case SPDK_SCSI_PR_WRITE_EXCLUSIVE_ALL_REGS:
+ if (!reg && dma_to_device) {
+ SPDK_ERRLOG("CHECK: Registrants only reservation "
+ "type reject command 0x%x\n", cdb[0]);
+ goto conflict;
+ }
+ break;
+ case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY:
+ case SPDK_SCSI_PR_EXCLUSIVE_ACCESS_ALL_REGS:
+ if (!reg) {
+ SPDK_ERRLOG("CHECK: All Registrants reservation "
+ "type reject command 0x%x\n", cdb[0]);
+ goto conflict;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+
+conflict:
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+}
+
+static int
+scsi2_check_reservation_conflict(struct spdk_scsi_task *task)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_registrant *reg;
+ bool conflict = false;
+
+ reg = scsi_pr_get_registrant(lun, task->initiator_port, task->target_port);
+ if (reg) {
+ /*
+ * From spc4r31 5.9.3 Exceptions to SPC-2 RESERVE and RELEASE
+ * behavior
+ *
+ * A RESERVE(6) or RESERVE(10) command shall complete with GOOD
+ * status, but no reservation shall be established and the
+ * persistent reservation shall not be changed, if the command
+ * is received from a) or b) below.
+ *
+ * A RELEASE(6) or RELEASE(10) command shall complete with GOOD
+ * status, but the persistent reservation shall not be released,
+ * if the command is received from a) or b)
+ *
+ * a) An I_T nexus that is a persistent reservation holder; or
+ * b) An I_T nexus that is registered if a registrants only or
+ * all registrants type persistent reservation is present.
+ *
+ * In all other cases, a RESERVE(6) command, RESERVE(10) command,
+ * RELEASE(6) command, or RELEASE(10) command shall be processed
+ * as defined in SPC-2.
+ */
+ if (scsi_pr_registrant_is_holder(lun, reg)) {
+ return 1;
+ }
+
+ if (lun->reservation.rtype == SPDK_SCSI_PR_WRITE_EXCLUSIVE_REGS_ONLY ||
+ lun->reservation.rtype == SPDK_SCSI_PR_EXCLUSIVE_ACCESS_REGS_ONLY) {
+ return 1;
+ }
+
+ conflict = true;
+ } else {
+ /*
+ * From spc2r20 5.5.1 Reservations overview:
+ *
+ * If a logical unit has executed a PERSISTENT RESERVE OUT
+ * command with the REGISTER or the REGISTER AND IGNORE
+ * EXISTING KEY service action and is still registered by any
+ * initiator, all RESERVE commands and all RELEASE commands
+ * regardless of initiator shall conflict and shall terminate
+ * with a RESERVATION CONFLICT status.
+ */
+ conflict = !TAILQ_EMPTY(&lun->reg_head);
+ }
+
+ if (conflict) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+ }
+
+ return 0;
+}
+
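+/* Handle SPC-2 RESERVE(6)/RESERVE(10) for compatible reservation handling */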
+int
+scsi2_reserve(struct spdk_scsi_task *task, uint8_t *cdb)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_scsi_pr_registrant *reg = &lun->scsi2_holder;
+ int ret;
+
+ /* Fail with ILLEGAL REQUEST if the obsolete or LongID bits are set */
+ if (cdb[1] & 0x3) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+ }
+
+ ret = scsi2_check_reservation_conflict(task);
+ /* PERSISTENT RESERVE is enabled */
+ if (ret == 1) {
+ return 0;
+ } else if (ret < 0) {
+ return ret;
+ }
+
+ /* SPC2 RESERVE */
+ reg->initiator_port = task->initiator_port;
+ if (task->initiator_port) {
+ snprintf(reg->initiator_port_name, sizeof(reg->initiator_port_name), "%s",
+ task->initiator_port->name);
+ reg->transport_id_len = task->initiator_port->transport_id_len;
+ memcpy(reg->transport_id, task->initiator_port->transport_id,
+ reg->transport_id_len);
+ }
+ reg->target_port = task->target_port;
+ if (task->target_port) {
+ snprintf(reg->target_port_name, sizeof(reg->target_port_name), "%s",
+ task->target_port->name);
+ }
+
+ lun->reservation.flags = SCSI_SPC2_RESERVE;
+ lun->reservation.holder = &lun->scsi2_holder;
+
+ return 0;
+}
+
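+/* Handle SPC-2 RELEASE(6)/RELEASE(10) */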
+int
+scsi2_release(struct spdk_scsi_task *task)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ int ret;
+
+ ret = scsi2_check_reservation_conflict(task);
+ /* PERSISTENT RESERVE is enabled */
+ if (ret == 1) {
+ return 0;
+ } else if (ret < 0) {
+ return ret;
+ }
+
+ assert(lun->reservation.flags & SCSI_SPC2_RESERVE);
+
+ memset(&lun->reservation, 0, sizeof(struct spdk_scsi_pr_reservation));
+ memset(&lun->scsi2_holder, 0, sizeof(struct spdk_scsi_pr_registrant));
+
+ return 0;
+}
+
+int
+scsi2_reserve_check(struct spdk_scsi_task *task)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ uint8_t *cdb = task->cdb;
+
+ switch (cdb[0]) {
+ case SPDK_SPC_INQUIRY:
+ case SPDK_SPC2_RELEASE_6:
+ case SPDK_SPC2_RELEASE_10:
+ return 0;
+
+ default:
+ break;
+ }
+
+ /* no reservation holders */
+ if (!scsi_pr_has_reservation(lun)) {
+ return 0;
+ }
+
+ if (scsi2_it_nexus_is_holder(lun, task->initiator_port, task->target_port)) {
+ return 0;
+ }
+
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_RESERVATION_CONFLICT,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+}
diff --git a/src/spdk/lib/scsi/scsi_rpc.c b/src/spdk/lib/scsi/scsi_rpc.c
new file mode 100644
index 000000000..1938ddac7
--- /dev/null
+++ b/src/spdk/lib/scsi/scsi_rpc.c
@@ -0,0 +1,77 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "scsi_internal.h"
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+
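+/* RPC: list all allocated SCSI devices with their ids and names */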
+static void
+rpc_scsi_get_devices(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_json_write_ctx *w;
+ struct spdk_scsi_dev *devs = scsi_dev_get_list();
+ int i;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "scsi_get_devices requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ for (i = 0; i < SPDK_SCSI_MAX_DEVS; i++) {
+ struct spdk_scsi_dev *dev = &devs[i];
+
+ if (!dev->is_allocated) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_int32(w, "id", dev->id);
+
+ spdk_json_write_named_string(w, "device_name", dev->name);
+
+ spdk_json_write_object_end(w);
+ }
+ spdk_json_write_array_end(w);
+
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("scsi_get_devices", rpc_scsi_get_devices, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(scsi_get_devices, get_scsi_devices)
diff --git a/src/spdk/lib/scsi/spdk_scsi.map b/src/spdk/lib/scsi/spdk_scsi.map
new file mode 100644
index 000000000..643372699
--- /dev/null
+++ b/src/spdk/lib/scsi/spdk_scsi.map
@@ -0,0 +1,49 @@
+{
+ global:
+
+ # Public functions
+ spdk_scsi_init;
+ spdk_scsi_fini;
+ spdk_scsi_lun_get_id;
+ spdk_scsi_lun_get_bdev_name;
+ spdk_scsi_lun_get_dev;
+ spdk_scsi_lun_is_removing;
+ spdk_scsi_dev_get_name;
+ spdk_scsi_dev_get_id;
+ spdk_scsi_dev_get_lun;
+ spdk_scsi_dev_has_pending_tasks;
+ spdk_scsi_dev_destruct;
+ spdk_scsi_dev_queue_mgmt_task;
+ spdk_scsi_dev_queue_task;
+ spdk_scsi_dev_add_port;
+ spdk_scsi_dev_delete_port;
+ spdk_scsi_dev_find_port_by_id;
+ spdk_scsi_dev_allocate_io_channels;
+ spdk_scsi_dev_free_io_channels;
+ spdk_scsi_dev_construct;
+ spdk_scsi_dev_delete_lun;
+ spdk_scsi_dev_add_lun;
+ spdk_scsi_port_create;
+ spdk_scsi_port_free;
+ spdk_scsi_port_get_name;
+ spdk_scsi_task_construct;
+ spdk_scsi_task_put;
+ spdk_scsi_task_set_data;
+ spdk_scsi_task_scatter_data;
+ spdk_scsi_task_gather_data;
+ spdk_scsi_task_build_sense_data;
+ spdk_scsi_task_set_status;
+ spdk_scsi_task_copy_status;
+ spdk_scsi_task_process_null_lun;
+ spdk_scsi_task_process_abort;
+ spdk_scsi_lun_open;
+ spdk_scsi_lun_close;
+ spdk_scsi_lun_allocate_io_channel;
+ spdk_scsi_lun_free_io_channel;
+ spdk_scsi_lun_get_dif_ctx;
+ spdk_scsi_port_set_iscsi_transport_id;
+ spdk_scsi_lun_id_int_to_fmt;
+ spdk_scsi_lun_id_fmt_to_int;
+
+ local: *;
+};
diff --git a/src/spdk/lib/scsi/task.c b/src/spdk/lib/scsi/task.c
new file mode 100644
index 000000000..7fd8305ec
--- /dev/null
+++ b/src/spdk/lib/scsi/task.c
@@ -0,0 +1,300 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "scsi_internal.h"
+#include "spdk/endian.h"
+#include "spdk/env.h"
+#include "spdk/util.h"
+
+static void
+scsi_task_free_data(struct spdk_scsi_task *task)
+{
+ if (task->alloc_len != 0) {
+ spdk_dma_free(task->iov.iov_base);
+ task->alloc_len = 0;
+ }
+
+ task->iov.iov_base = NULL;
+ task->iov.iov_len = 0;
+}
+
+void
+spdk_scsi_task_put(struct spdk_scsi_task *task)
+{
+ if (!task) {
+ return;
+ }
+
+ assert(task->ref > 0);
+ task->ref--;
+
+ if (task->ref == 0) {
+ struct spdk_bdev_io *bdev_io = task->bdev_io;
+
+ if (bdev_io) {
+ spdk_bdev_free_io(bdev_io);
+ }
+
+ scsi_task_free_data(task);
+
+ task->free_fn(task);
+ }
+}
+
+void
+spdk_scsi_task_construct(struct spdk_scsi_task *task,
+ spdk_scsi_task_cpl cpl_fn,
+ spdk_scsi_task_free free_fn)
+{
+ assert(task != NULL);
+ assert(cpl_fn != NULL);
+ assert(free_fn != NULL);
+
+ task->cpl_fn = cpl_fn;
+ task->free_fn = free_fn;
+
+ task->ref++;
+
+ /*
+ * Pre-fill the iov_buffers to point to the embedded iov
+ */
+ assert(task->iov.iov_base == NULL);
+ task->iovs = &task->iov;
+ task->iovcnt = 1;
+}
+
+static void *
+scsi_task_alloc_data(struct spdk_scsi_task *task, uint32_t alloc_len)
+{
+ assert(task->alloc_len == 0);
+
+ task->iov.iov_base = spdk_dma_zmalloc(alloc_len, 0, NULL);
+ task->iov.iov_len = alloc_len;
+ task->alloc_len = alloc_len;
+
+ return task->iov.iov_base;
+}
+
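+/*
+ * Copy buf_len bytes from src into the task's iovecs, allocating a data
+ * buffer if none has been set. Returns the number of bytes copied, or -1
+ * if the iovecs are too small.
+ */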
+int
+spdk_scsi_task_scatter_data(struct spdk_scsi_task *task, const void *src, size_t buf_len)
+{
+ size_t len = 0;
+ size_t buf_left = buf_len;
+ int i;
+ struct iovec *iovs = task->iovs;
+ const uint8_t *pos;
+
+ if (buf_len == 0) {
+ return 0;
+ }
+
+ if (task->iovcnt == 1 && iovs[0].iov_base == NULL) {
+ scsi_task_alloc_data(task, buf_len);
+ iovs[0] = task->iov;
+ }
+
+ for (i = 0; i < task->iovcnt; i++) {
+ assert(iovs[i].iov_base != NULL);
+ len += iovs[i].iov_len;
+ }
+
+ if (len < buf_len) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+ }
+
+ pos = src;
+
+ for (i = 0; i < task->iovcnt; i++) {
+ len = spdk_min(iovs[i].iov_len, buf_left);
+ buf_left -= len;
+ memcpy(iovs[i].iov_base, pos, len);
+ pos += len;
+ }
+
+ return buf_len;
+}
+
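+/*
+ * Copy the task's iovecs into a single newly allocated buffer. The caller
+ * owns the returned buffer; *len is set to its size, or -1 on allocation
+ * failure.
+ */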
+void *
+spdk_scsi_task_gather_data(struct spdk_scsi_task *task, int *len)
+{
+ int i;
+ struct iovec *iovs = task->iovs;
+ size_t buf_len = 0;
+ uint8_t *buf, *pos;
+
+ for (i = 0; i < task->iovcnt; i++) {
+ assert(iovs[i].iov_base != NULL);
+ buf_len += iovs[i].iov_len;
+ }
+
+ if (buf_len == 0) {
+ *len = 0;
+ return NULL;
+ }
+
+ buf = calloc(1, buf_len);
+ if (buf == NULL) {
+ *len = -1;
+ return NULL;
+ }
+
+ pos = buf;
+ for (i = 0; i < task->iovcnt; i++) {
+ memcpy(pos, iovs[i].iov_base, iovs[i].iov_len);
+ pos += iovs[i].iov_len;
+ }
+
+ *len = buf_len;
+ return buf;
+}
+
+void
+spdk_scsi_task_set_data(struct spdk_scsi_task *task, void *data, uint32_t len)
+{
+ assert(task->iovcnt == 1);
+ assert(task->alloc_len == 0);
+
+ task->iovs[0].iov_base = data;
+ task->iovs[0].iov_len = len;
+}
+
+void
+spdk_scsi_task_build_sense_data(struct spdk_scsi_task *task, int sk, int asc, int ascq)
+{
+ uint8_t *cp;
+ int resp_code;
+
+ resp_code = 0x70; /* Current + Fixed format */
+
+ /* Sense Data */
+ cp = task->sense_data;
+
+ /* VALID(7) RESPONSE CODE(6-0) */
+ cp[0] = 0x80 | resp_code;
+ /* Obsolete */
+ cp[1] = 0;
+ /* FILEMARK(7) EOM(6) ILI(5) SENSE KEY(3-0) */
+ cp[2] = sk & 0xf;
+ /* INFORMATION */
+ memset(&cp[3], 0, 4);
+
+ /* ADDITIONAL SENSE LENGTH */
+ cp[7] = 10;
+
+ /* COMMAND-SPECIFIC INFORMATION */
+ memset(&cp[8], 0, 4);
+ /* ADDITIONAL SENSE CODE */
+ cp[12] = asc;
+ /* ADDITIONAL SENSE CODE QUALIFIER */
+ cp[13] = ascq;
+ /* FIELD REPLACEABLE UNIT CODE */
+ cp[14] = 0;
+
+ /* SKSV(7) SENSE KEY SPECIFIC(6-0,7-0,7-0) */
+ cp[15] = 0;
+ cp[16] = 0;
+ cp[17] = 0;
+
+ /* SenseLength */
+ task->sense_data_len = 18;
+}
+
+void
+spdk_scsi_task_set_status(struct spdk_scsi_task *task, int sc, int sk,
+ int asc, int ascq)
+{
+ if (sc == SPDK_SCSI_STATUS_CHECK_CONDITION) {
+ spdk_scsi_task_build_sense_data(task, sk, asc, ascq);
+ }
+ task->status = sc;
+}
+
+void
+spdk_scsi_task_copy_status(struct spdk_scsi_task *dst,
+ struct spdk_scsi_task *src)
+{
+ memcpy(dst->sense_data, src->sense_data, src->sense_data_len);
+ dst->sense_data_len = src->sense_data_len;
+ dst->status = src->status;
+}
+
+void
+spdk_scsi_task_process_null_lun(struct spdk_scsi_task *task)
+{
+ uint8_t buffer[36];
+ uint32_t allocation_len;
+ uint32_t data_len;
+
+ task->length = task->transfer_len;
+ if (task->cdb[0] == SPDK_SPC_INQUIRY) {
+ /*
+ * SPC-4 states that INQUIRY commands to an unsupported LUN
+ * must be served with PERIPHERAL QUALIFIER = 0x3 and
+ * PERIPHERAL DEVICE TYPE = 0x1F.
+ */
+ data_len = sizeof(buffer);
+
+ memset(buffer, 0, data_len);
+ /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */
+ buffer[0] = 0x03 << 5 | 0x1f;
+ /* ADDITIONAL LENGTH */
+ buffer[4] = data_len - 5;
+
+ allocation_len = from_be16(&task->cdb[3]);
+ if (spdk_scsi_task_scatter_data(task, buffer, spdk_min(allocation_len, data_len)) >= 0) {
+ task->data_transferred = data_len;
+ task->status = SPDK_SCSI_STATUS_GOOD;
+ }
+ } else {
+ /* LOGICAL UNIT NOT SUPPORTED */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_SUPPORTED,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ task->data_transferred = 0;
+ }
+}
+
+void
+spdk_scsi_task_process_abort(struct spdk_scsi_task *task)
+{
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ABORTED_COMMAND,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+}
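
A minimal usage sketch (not part of the patch itself) of how a LUN backend might complete a small data-in command with the helpers above, assuming the task was already constructed by the SCSI layer; complete_data_in() and its payload arguments are illustrative names.

    #include "spdk/scsi.h"

    static void
    complete_data_in(struct spdk_scsi_task *task, const uint8_t *payload, size_t payload_len)
    {
    	int rc;

    	/* Copy the payload into the task's iovecs; a buffer is allocated
    	 * on demand if the initiator did not supply one. */
    	rc = spdk_scsi_task_scatter_data(task, payload, payload_len);
    	if (rc < 0) {
    		/* scatter_data already set CHECK CONDITION and sense data. */
    		return;
    	}

    	task->data_transferred = rc;
    	spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_GOOD, 0, 0, 0);
    }
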
diff --git a/src/spdk/lib/sock/Makefile b/src/spdk/lib/sock/Makefile
new file mode 100644
index 000000000..82fe41e90
--- /dev/null
+++ b/src/spdk/lib/sock/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 1
+
+C_SRCS = sock.c net_framework.c sock_rpc.c
+
+LIBNAME = sock
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_sock.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/sock/net_framework.c b/src/spdk/lib/sock/net_framework.c
new file mode 100644
index 000000000..45d52d162
--- /dev/null
+++ b/src/spdk/lib/sock/net_framework.c
@@ -0,0 +1,107 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/log.h"
+#include "spdk/net.h"
+#include "spdk/queue.h"
+
+static STAILQ_HEAD(, spdk_net_framework) g_net_frameworks =
+ STAILQ_HEAD_INITIALIZER(g_net_frameworks);
+
+static spdk_net_init_cb g_init_cb_fn = NULL;
+static void *g_init_cb_arg = NULL;
+
+static spdk_net_fini_cb g_fini_cb_fn = NULL;
+static void *g_fini_cb_arg = NULL;
+
+struct spdk_net_framework *g_next_net_framework = NULL;
+
+static inline struct spdk_net_framework *
+get_next_net_framework(struct spdk_net_framework *net)
+{
+ return net ? STAILQ_NEXT(net, link) : STAILQ_FIRST(&g_net_frameworks);
+}
+
+void
+spdk_net_framework_init_next(int rc)
+{
+ if (rc) {
+ SPDK_ERRLOG("Net framework %s failed to initalize with error %d\n", g_next_net_framework->name, rc);
+ g_init_cb_fn(g_init_cb_arg, rc);
+ return;
+ }
+
+ g_next_net_framework = get_next_net_framework(g_next_net_framework);
+ if (g_next_net_framework == NULL) {
+ g_init_cb_fn(g_init_cb_arg, 0);
+ return;
+ }
+
+ g_next_net_framework->init();
+}
+
+void
+spdk_net_framework_start(spdk_net_init_cb cb_fn, void *cb_arg)
+{
+ g_init_cb_fn = cb_fn;
+ g_init_cb_arg = cb_arg;
+
+ spdk_net_framework_init_next(0);
+}
+
+void
+spdk_net_framework_fini_next(void)
+{
+ g_next_net_framework = get_next_net_framework(g_next_net_framework);
+ if (g_next_net_framework == NULL) {
+ g_fini_cb_fn(g_fini_cb_arg);
+ return;
+ }
+
+ g_next_net_framework->fini();
+}
+
+void
+spdk_net_framework_fini(spdk_net_fini_cb cb_fn, void *cb_arg)
+{
+ g_fini_cb_fn = cb_fn;
+ g_fini_cb_arg = cb_arg;
+
+ spdk_net_framework_fini_next();
+}
+
+void
+spdk_net_framework_register(struct spdk_net_framework *frame)
+{
+ STAILQ_INSERT_TAIL(&g_net_frameworks, frame, link);
+}
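
A sketch (not part of the patch) of how an external framework could hook into this init/fini chain, based only on the calls shown above; the my_framework names are placeholders, and registration must happen before spdk_net_framework_start() walks the list.

    #include "spdk/net.h"

    static void
    my_framework_init(void)
    {
    	int rc = 0;

    	/* ... bring up framework-specific resources here ... */

    	/* Hand control back to the chain; a non-zero rc aborts initialization. */
    	spdk_net_framework_init_next(rc);
    }

    static void
    my_framework_fini(void)
    {
    	/* ... tear down framework-specific resources here ... */
    	spdk_net_framework_fini_next();
    }

    static struct spdk_net_framework g_my_framework = {
    	.name = "my_framework",
    	.init = my_framework_init,
    	.fini = my_framework_fini,
    };

    static void
    register_my_framework(void)
    {
    	/* Must run before spdk_net_framework_start() is called. */
    	spdk_net_framework_register(&g_my_framework);
    }
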
diff --git a/src/spdk/lib/sock/sock.c b/src/spdk/lib/sock/sock.c
new file mode 100644
index 000000000..5ea90385c
--- /dev/null
+++ b/src/spdk/lib/sock/sock.c
@@ -0,0 +1,809 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation. All rights reserved.
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/log.h"
+#include "spdk/sock.h"
+#include "spdk_internal/sock.h"
+#include "spdk/queue.h"
+
+#define SPDK_SOCK_DEFAULT_PRIORITY 0
+#define SPDK_SOCK_OPTS_FIELD_OK(opts, field) (offsetof(struct spdk_sock_opts, field) + sizeof(opts->field) <= (opts->opts_size))
+
+static STAILQ_HEAD(, spdk_net_impl) g_net_impls = STAILQ_HEAD_INITIALIZER(g_net_impls);
+
+struct spdk_sock_placement_id_entry {
+ int placement_id;
+ uint32_t ref;
+ struct spdk_sock_group *group;
+ STAILQ_ENTRY(spdk_sock_placement_id_entry) link;
+};
+
+static STAILQ_HEAD(, spdk_sock_placement_id_entry) g_placement_id_map = STAILQ_HEAD_INITIALIZER(
+ g_placement_id_map);
+static pthread_mutex_t g_map_table_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Insert a group into the placement map.
+ * If the group is already in the map, take a reference.
+ */
+static int
+sock_map_insert(int placement_id, struct spdk_sock_group *group)
+{
+ struct spdk_sock_placement_id_entry *entry;
+
+ pthread_mutex_lock(&g_map_table_mutex);
+ STAILQ_FOREACH(entry, &g_placement_id_map, link) {
+ if (placement_id == entry->placement_id) {
+ /* The mapping already exists; different sockets may share
+ * the same placement_id.
+ */
+ entry->ref++;
+ pthread_mutex_unlock(&g_map_table_mutex);
+ return 0;
+ }
+ }
+
+ entry = calloc(1, sizeof(*entry));
+ if (!entry) {
+ SPDK_ERRLOG("Cannot allocate an entry for placement_id=%u\n", placement_id);
+ pthread_mutex_unlock(&g_map_table_mutex);
+ return -ENOMEM;
+ }
+
+ entry->placement_id = placement_id;
+ entry->group = group;
+ entry->ref++;
+
+ STAILQ_INSERT_TAIL(&g_placement_id_map, entry, link);
+ pthread_mutex_unlock(&g_map_table_mutex);
+
+ return 0;
+}
+
+/* Release a reference to the group for a given placement_id.
+ * If the reference count is 0, remove the group.
+ */
+static void
+sock_map_release(int placement_id)
+{
+ struct spdk_sock_placement_id_entry *entry;
+
+ pthread_mutex_lock(&g_map_table_mutex);
+ STAILQ_FOREACH(entry, &g_placement_id_map, link) {
+ if (placement_id == entry->placement_id) {
+ assert(entry->ref > 0);
+ entry->ref--;
+ break;
+ }
+ }
+
+ pthread_mutex_unlock(&g_map_table_mutex);
+}
+
+/* Look up the group for a placement_id. */
+static void
+sock_map_lookup(int placement_id, struct spdk_sock_group **group)
+{
+ struct spdk_sock_placement_id_entry *entry;
+
+ *group = NULL;
+ pthread_mutex_lock(&g_map_table_mutex);
+ STAILQ_FOREACH(entry, &g_placement_id_map, link) {
+ if (placement_id == entry->placement_id) {
+ assert(entry->group != NULL);
+ *group = entry->group;
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_map_table_mutex);
+}
+
+/* Remove the socket group from the map table */
+static void
+sock_remove_sock_group_from_map_table(struct spdk_sock_group *group)
+{
+ struct spdk_sock_placement_id_entry *entry, *tmp;
+
+ pthread_mutex_lock(&g_map_table_mutex);
+ STAILQ_FOREACH_SAFE(entry, &g_placement_id_map, link, tmp) {
+ if (entry->group == group) {
+ STAILQ_REMOVE(&g_placement_id_map, entry, spdk_sock_placement_id_entry, link);
+ free(entry);
+ }
+ }
+ pthread_mutex_unlock(&g_map_table_mutex);
+}
+
+int
+spdk_sock_get_optimal_sock_group(struct spdk_sock *sock, struct spdk_sock_group **group)
+{
+ int placement_id = 0, rc;
+
+ rc = sock->net_impl->get_placement_id(sock, &placement_id);
+ if (!rc && (placement_id != 0)) {
+ sock_map_lookup(placement_id, group);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int
+spdk_sock_getaddr(struct spdk_sock *sock, char *saddr, int slen, uint16_t *sport,
+ char *caddr, int clen, uint16_t *cport)
+{
+ return sock->net_impl->getaddr(sock, saddr, slen, sport, caddr, clen, cport);
+}
+
+void
+spdk_sock_get_default_opts(struct spdk_sock_opts *opts)
+{
+ assert(opts);
+
+ if (SPDK_SOCK_OPTS_FIELD_OK(opts, priority)) {
+ opts->priority = SPDK_SOCK_DEFAULT_PRIORITY;
+ }
+}
+
+/*
+ * opts      - the opts structure allocated inside this library.
+ * opts_user - the opts structure passed in by the caller.
+ */
+static void
+sock_init_opts(struct spdk_sock_opts *opts, struct spdk_sock_opts *opts_user)
+{
+ assert(opts);
+ assert(opts_user);
+
+ opts->opts_size = sizeof(*opts);
+ spdk_sock_get_default_opts(opts);
+
+ /* reset the size according to the user */
+ opts->opts_size = opts_user->opts_size;
+ if (SPDK_SOCK_OPTS_FIELD_OK(opts, priority)) {
+ opts->priority = opts_user->priority;
+ }
+}
+
+struct spdk_sock *
+spdk_sock_connect(const char *ip, int port, char *impl_name)
+{
+ struct spdk_sock_opts opts;
+
+ opts.opts_size = sizeof(opts);
+ spdk_sock_get_default_opts(&opts);
+ return spdk_sock_connect_ext(ip, port, impl_name, &opts);
+}
+
+struct spdk_sock *
+spdk_sock_connect_ext(const char *ip, int port, char *impl_name, struct spdk_sock_opts *opts)
+{
+ struct spdk_net_impl *impl = NULL;
+ struct spdk_sock *sock;
+ struct spdk_sock_opts opts_local;
+
+ if (opts == NULL) {
+ SPDK_ERRLOG("the opts should not be NULL pointer\n");
+ return NULL;
+ }
+
+ STAILQ_FOREACH_FROM(impl, &g_net_impls, link) {
+ if (impl_name && strncmp(impl_name, impl->name, strlen(impl->name) + 1)) {
+ continue;
+ }
+
+ sock_init_opts(&opts_local, opts);
+ sock = impl->connect(ip, port, &opts_local);
+ if (sock != NULL) {
+ /* Copy the contents; both structures share the same ABI version */
+ memcpy(&sock->opts, &opts_local, sizeof(sock->opts));
+ sock->net_impl = impl;
+ TAILQ_INIT(&sock->queued_reqs);
+ TAILQ_INIT(&sock->pending_reqs);
+ return sock;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_sock *
+spdk_sock_listen(const char *ip, int port, char *impl_name)
+{
+ struct spdk_sock_opts opts;
+
+ opts.opts_size = sizeof(opts);
+ spdk_sock_get_default_opts(&opts);
+ return spdk_sock_listen_ext(ip, port, impl_name, &opts);
+}
+
+struct spdk_sock *
+spdk_sock_listen_ext(const char *ip, int port, char *impl_name, struct spdk_sock_opts *opts)
+{
+ struct spdk_net_impl *impl = NULL;
+ struct spdk_sock *sock;
+ struct spdk_sock_opts opts_local;
+
+ if (opts == NULL) {
+ SPDK_ERRLOG("the opts should not be NULL pointer\n");
+ return NULL;
+ }
+
+ STAILQ_FOREACH_FROM(impl, &g_net_impls, link) {
+ if (impl_name && strncmp(impl_name, impl->name, strlen(impl->name) + 1)) {
+ continue;
+ }
+
+ sock_init_opts(&opts_local, opts);
+ sock = impl->listen(ip, port, &opts_local);
+ if (sock != NULL) {
+ /* Copy the contents; both structures share the same ABI version */
+ memcpy(&sock->opts, &opts_local, sizeof(sock->opts));
+ sock->net_impl = impl;
+ /* Don't need to initialize the request queues for listen
+ * sockets. */
+ return sock;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_sock *
+spdk_sock_accept(struct spdk_sock *sock)
+{
+ struct spdk_sock *new_sock;
+
+ new_sock = sock->net_impl->accept(sock);
+ if (new_sock != NULL) {
+ /* Inherit the opts from the "accept sock" */
+ new_sock->opts = sock->opts;
+ new_sock->net_impl = sock->net_impl;
+ TAILQ_INIT(&new_sock->queued_reqs);
+ TAILQ_INIT(&new_sock->pending_reqs);
+ }
+
+ return new_sock;
+}
+
+int
+spdk_sock_close(struct spdk_sock **_sock)
+{
+ struct spdk_sock *sock = *_sock;
+ int rc;
+
+ if (sock == NULL) {
+ errno = EBADF;
+ return -1;
+ }
+
+ if (sock->cb_fn != NULL) {
+ /* This sock is still part of a sock_group. */
+ errno = EBUSY;
+ return -1;
+ }
+
+ sock->flags.closed = true;
+
+ if (sock->cb_cnt > 0) {
+ /* Let the callback unwind before destroying the socket */
+ return 0;
+ }
+
+ spdk_sock_abort_requests(sock);
+
+ rc = sock->net_impl->close(sock);
+ if (rc == 0) {
+ *_sock = NULL;
+ }
+
+ return rc;
+}
+
+ssize_t
+spdk_sock_recv(struct spdk_sock *sock, void *buf, size_t len)
+{
+ if (sock == NULL) {
+ errno = EBADF;
+ return -1;
+ }
+
+ if (sock->flags.closed) {
+ errno = EBADF;
+ return -1;
+ }
+
+ return sock->net_impl->recv(sock, buf, len);
+}
+
+ssize_t
+spdk_sock_readv(struct spdk_sock *sock, struct iovec *iov, int iovcnt)
+{
+ if (sock == NULL) {
+ errno = EBADF;
+ return -1;
+ }
+
+ if (sock->flags.closed) {
+ errno = EBADF;
+ return -1;
+ }
+
+ return sock->net_impl->readv(sock, iov, iovcnt);
+}
+
+ssize_t
+spdk_sock_writev(struct spdk_sock *sock, struct iovec *iov, int iovcnt)
+{
+ if (sock == NULL) {
+ errno = EBADF;
+ return -1;
+ }
+
+ if (sock->flags.closed) {
+ errno = EBADF;
+ return -1;
+ }
+
+ return sock->net_impl->writev(sock, iov, iovcnt);
+}
+
+void
+spdk_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req)
+{
+ assert(req->cb_fn != NULL);
+
+ if (sock == NULL) {
+ req->cb_fn(req->cb_arg, -EBADF);
+ return;
+ }
+
+ if (sock->flags.closed) {
+ req->cb_fn(req->cb_arg, -EBADF);
+ return;
+ }
+
+ sock->net_impl->writev_async(sock, req);
+}
+
+int
+spdk_sock_flush(struct spdk_sock *sock)
+{
+ if (sock == NULL) {
+ return -EBADF;
+ }
+
+ if (sock->flags.closed) {
+ return -EBADF;
+ }
+
+ return sock->net_impl->flush(sock);
+}
+
+int
+spdk_sock_set_recvlowat(struct spdk_sock *sock, int nbytes)
+{
+ return sock->net_impl->set_recvlowat(sock, nbytes);
+}
+
+int
+spdk_sock_set_recvbuf(struct spdk_sock *sock, int sz)
+{
+ return sock->net_impl->set_recvbuf(sock, sz);
+}
+
+int
+spdk_sock_set_sendbuf(struct spdk_sock *sock, int sz)
+{
+ return sock->net_impl->set_sendbuf(sock, sz);
+}
+
+bool
+spdk_sock_is_ipv6(struct spdk_sock *sock)
+{
+ return sock->net_impl->is_ipv6(sock);
+}
+
+bool
+spdk_sock_is_ipv4(struct spdk_sock *sock)
+{
+ return sock->net_impl->is_ipv4(sock);
+}
+
+bool
+spdk_sock_is_connected(struct spdk_sock *sock)
+{
+ return sock->net_impl->is_connected(sock);
+}
+
+struct spdk_sock_group *
+spdk_sock_group_create(void *ctx)
+{
+ struct spdk_net_impl *impl = NULL;
+ struct spdk_sock_group *group;
+ struct spdk_sock_group_impl *group_impl;
+
+ group = calloc(1, sizeof(*group));
+ if (group == NULL) {
+ return NULL;
+ }
+
+ STAILQ_INIT(&group->group_impls);
+
+ STAILQ_FOREACH_FROM(impl, &g_net_impls, link) {
+ group_impl = impl->group_impl_create();
+ if (group_impl != NULL) {
+ STAILQ_INSERT_TAIL(&group->group_impls, group_impl, link);
+ TAILQ_INIT(&group_impl->socks);
+ group_impl->num_removed_socks = 0;
+ group_impl->net_impl = impl;
+ }
+ }
+
+ group->ctx = ctx;
+ return group;
+}
+
+void *
+spdk_sock_group_get_ctx(struct spdk_sock_group *group)
+{
+ if (group == NULL) {
+ return NULL;
+ }
+
+ return group->ctx;
+}
+
+int
+spdk_sock_group_add_sock(struct spdk_sock_group *group, struct spdk_sock *sock,
+ spdk_sock_cb cb_fn, void *cb_arg)
+{
+ struct spdk_sock_group_impl *group_impl = NULL;
+ int rc, placement_id = 0;
+
+ if (cb_fn == NULL) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (sock->group_impl != NULL) {
+ /*
+ * This sock is already part of a sock_group. Currently we don't
+ * support this.
+ */
+ errno = EBUSY;
+ return -1;
+ }
+
+ rc = sock->net_impl->get_placement_id(sock, &placement_id);
+ if (!rc && (placement_id != 0)) {
+ rc = sock_map_insert(placement_id, group);
+ if (rc < 0) {
+ return -1;
+ }
+ }
+
+ STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) {
+ if (sock->net_impl == group_impl->net_impl) {
+ break;
+ }
+ }
+
+ if (group_impl == NULL) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ rc = group_impl->net_impl->group_impl_add_sock(group_impl, sock);
+ if (rc == 0) {
+ TAILQ_INSERT_TAIL(&group_impl->socks, sock, link);
+ sock->group_impl = group_impl;
+ sock->cb_fn = cb_fn;
+ sock->cb_arg = cb_arg;
+ }
+
+ return rc;
+}
+
+int
+spdk_sock_group_remove_sock(struct spdk_sock_group *group, struct spdk_sock *sock)
+{
+ struct spdk_sock_group_impl *group_impl = NULL;
+ int rc, placement_id = 0;
+
+ STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) {
+ if (sock->net_impl == group_impl->net_impl) {
+ break;
+ }
+ }
+
+ if (group_impl == NULL) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ assert(group_impl == sock->group_impl);
+
+ rc = sock->net_impl->get_placement_id(sock, &placement_id);
+ if (!rc && (placement_id != 0)) {
+ sock_map_release(placement_id);
+ }
+
+ rc = group_impl->net_impl->group_impl_remove_sock(group_impl, sock);
+ if (rc == 0) {
+ TAILQ_REMOVE(&group_impl->socks, sock, link);
+ assert(group_impl->num_removed_socks < MAX_EVENTS_PER_POLL);
+ group_impl->removed_socks[group_impl->num_removed_socks] = (uintptr_t)sock;
+ group_impl->num_removed_socks++;
+ sock->group_impl = NULL;
+ sock->cb_fn = NULL;
+ sock->cb_arg = NULL;
+ }
+
+ return rc;
+}
+
+int
+spdk_sock_group_poll(struct spdk_sock_group *group)
+{
+ return spdk_sock_group_poll_count(group, MAX_EVENTS_PER_POLL);
+}
+
+static int
+sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl,
+ struct spdk_sock_group *group,
+ int max_events)
+{
+ struct spdk_sock *socks[MAX_EVENTS_PER_POLL];
+ int num_events, i;
+
+ if (TAILQ_EMPTY(&group_impl->socks)) {
+ return 0;
+ }
+
+ /* The number of removed sockets should be reset for each call to poll. */
+ group_impl->num_removed_socks = 0;
+
+ num_events = group_impl->net_impl->group_impl_poll(group_impl, max_events, socks);
+ if (num_events == -1) {
+ return -1;
+ }
+
+ for (i = 0; i < num_events; i++) {
+ struct spdk_sock *sock = socks[i];
+ int j;
+ bool valid = true;
+ for (j = 0; j < group_impl->num_removed_socks; j++) {
+ if ((uintptr_t)sock == group_impl->removed_socks[j]) {
+ valid = false;
+ break;
+ }
+ }
+
+ if (valid) {
+ assert(sock->cb_fn != NULL);
+ sock->cb_fn(sock->cb_arg, group, sock);
+ }
+ }
+
+ return num_events;
+}
+
+int
+spdk_sock_group_poll_count(struct spdk_sock_group *group, int max_events)
+{
+ struct spdk_sock_group_impl *group_impl = NULL;
+ int rc, num_events = 0;
+
+ if (max_events < 1) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ /*
+ * Only poll for up to MAX_EVENTS_PER_POLL events at a time - if more
+ * events are pending, the next call to this function will reap them.
+ */
+ if (max_events > MAX_EVENTS_PER_POLL) {
+ max_events = MAX_EVENTS_PER_POLL;
+ }
+
+ STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) {
+ rc = sock_group_impl_poll_count(group_impl, group, max_events);
+ if (rc < 0) {
+ num_events = -1;
+ SPDK_ERRLOG("group_impl_poll_count for net(%s) failed\n",
+ group_impl->net_impl->name);
+ } else if (num_events >= 0) {
+ num_events += rc;
+ }
+ }
+
+ return num_events;
+}
+
+int
+spdk_sock_group_close(struct spdk_sock_group **group)
+{
+ struct spdk_sock_group_impl *group_impl = NULL, *tmp;
+ int rc;
+
+ if (*group == NULL) {
+ errno = EBADF;
+ return -1;
+ }
+
+ STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) {
+ if (!TAILQ_EMPTY(&group_impl->socks)) {
+ errno = EBUSY;
+ return -1;
+ }
+ }
+
+ STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) {
+ rc = group_impl->net_impl->group_impl_close(group_impl);
+ if (rc != 0) {
+ SPDK_ERRLOG("group_impl_close for net(%s) failed\n",
+ group_impl->net_impl->name);
+ }
+ }
+
+ sock_remove_sock_group_from_map_table(*group);
+ free(*group);
+ *group = NULL;
+
+ return 0;
+}
+
+static inline struct spdk_net_impl *
+sock_get_impl_by_name(const char *impl_name)
+{
+ struct spdk_net_impl *impl;
+
+ assert(impl_name != NULL);
+ STAILQ_FOREACH(impl, &g_net_impls, link) {
+ if (0 == strcmp(impl_name, impl->name)) {
+ return impl;
+ }
+ }
+
+ return NULL;
+}
+
+int
+spdk_sock_impl_get_opts(const char *impl_name, struct spdk_sock_impl_opts *opts, size_t *len)
+{
+ struct spdk_net_impl *impl;
+
+ if (!impl_name || !opts || !len) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ impl = sock_get_impl_by_name(impl_name);
+ if (!impl) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (!impl->get_opts) {
+ errno = ENOTSUP;
+ return -1;
+ }
+
+ return impl->get_opts(opts, len);
+}
+
+int
+spdk_sock_impl_set_opts(const char *impl_name, const struct spdk_sock_impl_opts *opts, size_t len)
+{
+ struct spdk_net_impl *impl;
+
+ if (!impl_name || !opts) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ impl = sock_get_impl_by_name(impl_name);
+ if (!impl) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ if (!impl->set_opts) {
+ errno = ENOTSUP;
+ return -1;
+ }
+
+ return impl->set_opts(opts, len);
+}
+
+void
+spdk_sock_write_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_net_impl *impl;
+ struct spdk_sock_impl_opts opts;
+ size_t len;
+
+ assert(w != NULL);
+
+ spdk_json_write_array_begin(w);
+
+ STAILQ_FOREACH(impl, &g_net_impls, link) {
+ if (!impl->get_opts) {
+ continue;
+ }
+
+ len = sizeof(opts);
+ if (impl->get_opts(&opts, &len) == 0) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "sock_impl_set_options");
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "impl_name", impl->name);
+ spdk_json_write_named_uint32(w, "recv_buf_size", opts.recv_buf_size);
+ spdk_json_write_named_uint32(w, "send_buf_size", opts.send_buf_size);
+ spdk_json_write_named_bool(w, "enable_recv_pipe", opts.enable_recv_pipe);
+ spdk_json_write_named_bool(w, "enable_zerocopy_send", opts.enable_zerocopy_send);
+ spdk_json_write_object_end(w);
+ spdk_json_write_object_end(w);
+ } else {
+ SPDK_ERRLOG("Failed to get socket options for socket implementation %s\n", impl->name);
+ }
+ }
+
+ spdk_json_write_array_end(w);
+}
+
+void
+spdk_net_impl_register(struct spdk_net_impl *impl, int priority)
+{
+ struct spdk_net_impl *cur, *prev;
+
+ impl->priority = priority;
+ prev = NULL;
+ STAILQ_FOREACH(cur, &g_net_impls, link) {
+ if (impl->priority > cur->priority) {
+ break;
+ }
+ prev = cur;
+ }
+
+ if (prev) {
+ STAILQ_INSERT_AFTER(&g_net_impls, prev, impl, link);
+ } else {
+ STAILQ_INSERT_HEAD(&g_net_impls, impl, link);
+ }
+}
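
A sketch (not part of the patch) of the intended polled-mode flow for these APIs: connect a socket, attach it to a group with a data-ready callback, and drive the group from the application's poll loop. Passing NULL as impl_name lets the highest-priority registered implementation take the connection; the address, port and callback names are illustrative and error handling is kept minimal.

    #include "spdk/sock.h"

    static void
    on_sock_ready(void *arg, struct spdk_sock_group *group, struct spdk_sock *sock)
    {
    	char buf[512];
    	ssize_t n;

    	/* Invoked from spdk_sock_group_poll() when the socket is reported ready. */
    	n = spdk_sock_recv(sock, buf, sizeof(buf));
    	if (n > 0) {
    		/* ... consume n bytes ... */
    	}
    }

    static int
    run_client(void)
    {
    	struct spdk_sock_group *group;
    	struct spdk_sock *sock;

    	sock = spdk_sock_connect("127.0.0.1", 4420, NULL);
    	if (sock == NULL) {
    		return -1;
    	}

    	group = spdk_sock_group_create(NULL);
    	if (group == NULL) {
    		spdk_sock_close(&sock);
    		return -1;
    	}

    	if (spdk_sock_group_add_sock(group, sock, on_sock_ready, NULL) != 0) {
    		spdk_sock_close(&sock);
    		spdk_sock_group_close(&group);
    		return -1;
    	}

    	/* Normally driven by an SPDK poller: each call reaps up to
    	 * MAX_EVENTS_PER_POLL ready sockets and invokes their callbacks. */
    	spdk_sock_group_poll(group);

    	/* A socket must leave its group before it can be closed. */
    	spdk_sock_group_remove_sock(group, sock);
    	spdk_sock_close(&sock);
    	spdk_sock_group_close(&group);
    	return 0;
    }
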
diff --git a/src/spdk/lib/sock/sock_rpc.c b/src/spdk/lib/sock/sock_rpc.c
new file mode 100644
index 000000000..c8686a068
--- /dev/null
+++ b/src/spdk/lib/sock/sock_rpc.c
@@ -0,0 +1,161 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/sock.h"
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+
+#include "spdk_internal/log.h"
+
+
+static const struct spdk_json_object_decoder rpc_sock_impl_get_opts_decoders[] = {
+ { "impl_name", 0, spdk_json_decode_string, false },
+};
+
+static void
+rpc_sock_impl_get_options(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ char *impl_name = NULL;
+ struct spdk_sock_impl_opts sock_opts = {};
+ struct spdk_json_write_ctx *w;
+ size_t len;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_sock_impl_get_opts_decoders,
+ SPDK_COUNTOF(rpc_sock_impl_get_opts_decoders), &impl_name)) {
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ len = sizeof(sock_opts);
+ rc = spdk_sock_impl_get_opts(impl_name, &sock_opts, &len);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_uint32(w, "recv_buf_size", sock_opts.recv_buf_size);
+ spdk_json_write_named_uint32(w, "send_buf_size", sock_opts.send_buf_size);
+ spdk_json_write_named_bool(w, "enable_recv_pipe", sock_opts.enable_recv_pipe);
+ spdk_json_write_named_bool(w, "enable_zerocopy_send", sock_opts.enable_zerocopy_send);
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+ free(impl_name);
+}
+SPDK_RPC_REGISTER("sock_impl_get_options", rpc_sock_impl_get_options,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+
+struct spdk_rpc_sock_impl_set_opts {
+ char *impl_name;
+ struct spdk_sock_impl_opts sock_opts;
+};
+
+static const struct spdk_json_object_decoder rpc_sock_impl_set_opts_decoders[] = {
+ {
+ "impl_name", offsetof(struct spdk_rpc_sock_impl_set_opts, impl_name),
+ spdk_json_decode_string, false
+ },
+ {
+ "recv_buf_size", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.recv_buf_size),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "send_buf_size", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.send_buf_size),
+ spdk_json_decode_uint32, true
+ },
+ {
+ "enable_recv_pipe", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_recv_pipe),
+ spdk_json_decode_bool, true
+ },
+ {
+ "enable_zerocopy_send", offsetof(struct spdk_rpc_sock_impl_set_opts, sock_opts.enable_zerocopy_send),
+ spdk_json_decode_bool, true
+ },
+};
+
+static void
+rpc_sock_impl_set_options(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct spdk_rpc_sock_impl_set_opts opts = {};
+ struct spdk_json_write_ctx *w;
+ size_t len;
+ int rc;
+
+ /* First decode pass: extract impl_name */
+ if (spdk_json_decode_object(params, rpc_sock_impl_set_opts_decoders,
+ SPDK_COUNTOF(rpc_sock_impl_set_opts_decoders), &opts)) {
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ /* Retrieve default opts for requested socket implementation */
+ len = sizeof(opts.sock_opts);
+ rc = spdk_sock_impl_get_opts(opts.impl_name, &opts.sock_opts, &len);
+ if (rc) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ /* Decode again so any user-supplied fields override the defaults */
+ if (spdk_json_decode_object(params, rpc_sock_impl_set_opts_decoders,
+ SPDK_COUNTOF(rpc_sock_impl_set_opts_decoders), &opts)) {
+ SPDK_ERRLOG("spdk_json_decode_object() failed\n");
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ rc = spdk_sock_impl_set_opts(opts.impl_name, &opts.sock_opts, sizeof(opts.sock_opts));
+ if (rc != 0) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "Invalid parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ free(opts.impl_name);
+}
+SPDK_RPC_REGISTER("sock_impl_set_options", rpc_sock_impl_set_options, SPDK_RPC_STARTUP)
diff --git a/src/spdk/lib/sock/spdk_sock.map b/src/spdk/lib/sock/spdk_sock.map
new file mode 100644
index 000000000..e3fb44281
--- /dev/null
+++ b/src/spdk/lib/sock/spdk_sock.map
@@ -0,0 +1,47 @@
+{
+ global:
+
+ # public functions in spdk/sock.h
+ spdk_sock_get_default_opts;
+ spdk_sock_getaddr;
+ spdk_sock_connect;
+ spdk_sock_connect_ext;
+ spdk_sock_listen;
+ spdk_sock_listen_ext;
+ spdk_sock_accept;
+ spdk_sock_close;
+ spdk_sock_flush;
+ spdk_sock_recv;
+ spdk_sock_writev;
+ spdk_sock_writev_async;
+ spdk_sock_readv;
+ spdk_sock_set_recvlowat;
+ spdk_sock_set_recvbuf;
+ spdk_sock_set_sendbuf;
+ spdk_sock_is_ipv6;
+ spdk_sock_is_ipv4;
+ spdk_sock_is_connected;
+ spdk_sock_group_create;
+ spdk_sock_group_get_ctx;
+ spdk_sock_group_add_sock;
+ spdk_sock_group_remove_sock;
+ spdk_sock_group_poll;
+ spdk_sock_group_poll_count;
+ spdk_sock_group_close;
+ spdk_sock_get_optimal_sock_group;
+ spdk_sock_impl_get_opts;
+ spdk_sock_impl_set_opts;
+ spdk_sock_write_config_json;
+
+ # public functions in spdk/net.h
+ spdk_net_framework_register;
+ spdk_net_framework_start;
+ spdk_net_framework_fini;
+ spdk_net_framework_init_next;
+ spdk_net_framework_fini_next;
+
+ # internal function in spdk_internal/sock.h
+ spdk_net_impl_register;
+
+ local: *;
+};
diff --git a/src/spdk/lib/thread/Makefile b/src/spdk/lib/thread/Makefile
new file mode 100644
index 000000000..ceb7a394e
--- /dev/null
+++ b/src/spdk/lib/thread/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 3
+SO_MINOR := 0
+
+C_SRCS = thread.c
+LIBNAME = thread
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_thread.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/thread/spdk_thread.map b/src/spdk/lib/thread/spdk_thread.map
new file mode 100644
index 000000000..b71fa06eb
--- /dev/null
+++ b/src/spdk/lib/thread/spdk_thread.map
@@ -0,0 +1,55 @@
+{
+ global:
+
+ # public functions in spdk/thread.h
+ spdk_thread_lib_init;
+ spdk_thread_lib_init_ext;
+ spdk_thread_lib_fini;
+ spdk_thread_create;
+ spdk_set_thread;
+ spdk_thread_exit;
+ spdk_thread_is_exited;
+ spdk_thread_destroy;
+ spdk_thread_get_ctx;
+ spdk_thread_get_cpumask;
+ spdk_thread_set_cpumask;
+ spdk_thread_get_from_ctx;
+ spdk_thread_poll;
+ spdk_thread_next_poller_expiration;
+ spdk_thread_has_active_pollers;
+ spdk_thread_has_pollers;
+ spdk_thread_is_idle;
+ spdk_thread_get_count;
+ spdk_get_thread;
+ spdk_thread_get_name;
+ spdk_thread_get_id;
+ spdk_thread_get_by_id;
+ spdk_thread_get_stats;
+ spdk_thread_get_last_tsc;
+ spdk_thread_send_msg;
+ spdk_thread_send_critical_msg;
+ spdk_for_each_thread;
+ spdk_poller_register;
+ spdk_poller_register_named;
+ spdk_poller_unregister;
+ spdk_poller_pause;
+ spdk_poller_resume;
+ spdk_io_device_register;
+ spdk_io_device_unregister;
+ spdk_get_io_channel;
+ spdk_put_io_channel;
+ spdk_io_channel_get_ctx;
+ spdk_io_channel_from_ctx;
+ spdk_io_channel_get_thread;
+ spdk_for_each_channel;
+ spdk_io_channel_iter_get_io_device;
+ spdk_io_channel_iter_get_channel;
+ spdk_io_channel_iter_get_ctx;
+ spdk_for_each_channel_continue;
+
+ # internal functions in spdk_internal/thread.h
+ spdk_poller_state_str;
+ spdk_io_device_get_name;
+
+ local: *;
+};
diff --git a/src/spdk/lib/thread/thread.c b/src/spdk/lib/thread/thread.c
new file mode 100644
index 000000000..65d91ce35
--- /dev/null
+++ b/src/spdk/lib/thread/thread.c
@@ -0,0 +1,1636 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/queue.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/util.h"
+
+#include "spdk_internal/log.h"
+#include "spdk_internal/thread.h"
+
+#define SPDK_MSG_BATCH_SIZE 8
+#define SPDK_MAX_DEVICE_NAME_LEN 256
+#define SPDK_THREAD_EXIT_TIMEOUT_SEC 5
+
+static pthread_mutex_t g_devlist_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static spdk_new_thread_fn g_new_thread_fn = NULL;
+static spdk_thread_op_fn g_thread_op_fn = NULL;
+static spdk_thread_op_supported_fn g_thread_op_supported_fn;
+static size_t g_ctx_sz = 0;
+/* A monotonically increasing ID, starting at 1, is assigned to each created thread.
+ * Once the ID wraps around (exceeds UINT64_MAX), further thread creation is not
+ * allowed and the SPDK application must be restarted.
+ */
+static uint64_t g_thread_id = 1;
+
+struct io_device {
+ void *io_device;
+ char name[SPDK_MAX_DEVICE_NAME_LEN + 1];
+ spdk_io_channel_create_cb create_cb;
+ spdk_io_channel_destroy_cb destroy_cb;
+ spdk_io_device_unregister_cb unregister_cb;
+ struct spdk_thread *unregister_thread;
+ uint32_t ctx_size;
+ uint32_t for_each_count;
+ TAILQ_ENTRY(io_device) tailq;
+
+ uint32_t refcnt;
+
+ bool unregistered;
+};
+
+static TAILQ_HEAD(, io_device) g_io_devices = TAILQ_HEAD_INITIALIZER(g_io_devices);
+
+struct spdk_msg {
+ spdk_msg_fn fn;
+ void *arg;
+
+ SLIST_ENTRY(spdk_msg) link;
+};
+
+#define SPDK_MSG_MEMPOOL_CACHE_SIZE 1024
+static struct spdk_mempool *g_spdk_msg_mempool = NULL;
+
+static TAILQ_HEAD(, spdk_thread) g_threads = TAILQ_HEAD_INITIALIZER(g_threads);
+static uint32_t g_thread_count = 0;
+
+static __thread struct spdk_thread *tls_thread = NULL;
+
+static inline struct spdk_thread *
+_get_thread(void)
+{
+ return tls_thread;
+}
+
+static int
+_thread_lib_init(size_t ctx_sz)
+{
+ char mempool_name[SPDK_MAX_MEMZONE_NAME_LEN];
+
+ g_ctx_sz = ctx_sz;
+
+ snprintf(mempool_name, sizeof(mempool_name), "msgpool_%d", getpid());
+ g_spdk_msg_mempool = spdk_mempool_create(mempool_name,
+ 262144 - 1, /* Power of 2 minus 1 is optimal for memory consumption */
+ sizeof(struct spdk_msg),
+ 0, /* No cache. We do our own. */
+ SPDK_ENV_SOCKET_ID_ANY);
+
+ if (!g_spdk_msg_mempool) {
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+spdk_thread_lib_init(spdk_new_thread_fn new_thread_fn, size_t ctx_sz)
+{
+ assert(g_new_thread_fn == NULL);
+ assert(g_thread_op_fn == NULL);
+
+ if (new_thread_fn == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_THREAD, "new_thread_fn was not specified at spdk_thread_lib_init\n");
+ } else {
+ g_new_thread_fn = new_thread_fn;
+ }
+
+ return _thread_lib_init(ctx_sz);
+}
+
+int
+spdk_thread_lib_init_ext(spdk_thread_op_fn thread_op_fn,
+ spdk_thread_op_supported_fn thread_op_supported_fn,
+ size_t ctx_sz)
+{
+ assert(g_new_thread_fn == NULL);
+ assert(g_thread_op_fn == NULL);
+ assert(g_thread_op_supported_fn == NULL);
+
+ if ((thread_op_fn != NULL) != (thread_op_supported_fn != NULL)) {
+ SPDK_ERRLOG("Both must be defined or undefined together.\n");
+ return -EINVAL;
+ }
+
+ if (thread_op_fn == NULL && thread_op_supported_fn == NULL) {
+ SPDK_INFOLOG(SPDK_LOG_THREAD, "thread_op_fn and thread_op_supported_fn were not specified\n");
+ } else {
+ g_thread_op_fn = thread_op_fn;
+ g_thread_op_supported_fn = thread_op_supported_fn;
+ }
+
+ return _thread_lib_init(ctx_sz);
+}
+
+void
+spdk_thread_lib_fini(void)
+{
+ struct io_device *dev;
+
+ TAILQ_FOREACH(dev, &g_io_devices, tailq) {
+ SPDK_ERRLOG("io_device %s not unregistered\n", dev->name);
+ }
+
+ if (g_spdk_msg_mempool) {
+ spdk_mempool_free(g_spdk_msg_mempool);
+ g_spdk_msg_mempool = NULL;
+ }
+
+ g_new_thread_fn = NULL;
+ g_thread_op_fn = NULL;
+ g_thread_op_supported_fn = NULL;
+ g_ctx_sz = 0;
+}
+
+static void
+_free_thread(struct spdk_thread *thread)
+{
+ struct spdk_io_channel *ch;
+ struct spdk_msg *msg;
+ struct spdk_poller *poller, *ptmp;
+
+ TAILQ_FOREACH(ch, &thread->io_channels, tailq) {
+ SPDK_ERRLOG("thread %s still has channel for io_device %s\n",
+ thread->name, ch->dev->name);
+ }
+
+ TAILQ_FOREACH_SAFE(poller, &thread->active_pollers, tailq, ptmp) {
+ if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) {
+ SPDK_WARNLOG("poller %s still registered at thread exit\n",
+ poller->name);
+ }
+ TAILQ_REMOVE(&thread->active_pollers, poller, tailq);
+ free(poller);
+ }
+
+ TAILQ_FOREACH_SAFE(poller, &thread->timed_pollers, tailq, ptmp) {
+ if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) {
+ SPDK_WARNLOG("poller %s still registered at thread exit\n",
+ poller->name);
+ }
+ TAILQ_REMOVE(&thread->timed_pollers, poller, tailq);
+ free(poller);
+ }
+
+ TAILQ_FOREACH_SAFE(poller, &thread->paused_pollers, tailq, ptmp) {
+ SPDK_WARNLOG("poller %s still registered at thread exit\n", poller->name);
+ TAILQ_REMOVE(&thread->paused_pollers, poller, tailq);
+ free(poller);
+ }
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ assert(g_thread_count > 0);
+ g_thread_count--;
+ TAILQ_REMOVE(&g_threads, thread, tailq);
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ msg = SLIST_FIRST(&thread->msg_cache);
+ while (msg != NULL) {
+ SLIST_REMOVE_HEAD(&thread->msg_cache, link);
+
+ assert(thread->msg_cache_count > 0);
+ thread->msg_cache_count--;
+ spdk_mempool_put(g_spdk_msg_mempool, msg);
+
+ msg = SLIST_FIRST(&thread->msg_cache);
+ }
+
+ assert(thread->msg_cache_count == 0);
+
+ spdk_ring_free(thread->messages);
+ free(thread);
+}
+
+struct spdk_thread *
+spdk_thread_create(const char *name, struct spdk_cpuset *cpumask)
+{
+ struct spdk_thread *thread;
+ struct spdk_msg *msgs[SPDK_MSG_MEMPOOL_CACHE_SIZE];
+ int rc = 0, i;
+
+ thread = calloc(1, sizeof(*thread) + g_ctx_sz);
+ if (!thread) {
+ SPDK_ERRLOG("Unable to allocate memory for thread\n");
+ return NULL;
+ }
+
+ if (cpumask) {
+ spdk_cpuset_copy(&thread->cpumask, cpumask);
+ } else {
+ spdk_cpuset_negate(&thread->cpumask);
+ }
+
+ TAILQ_INIT(&thread->io_channels);
+ TAILQ_INIT(&thread->active_pollers);
+ TAILQ_INIT(&thread->timed_pollers);
+ TAILQ_INIT(&thread->paused_pollers);
+ SLIST_INIT(&thread->msg_cache);
+ thread->msg_cache_count = 0;
+
+ thread->tsc_last = spdk_get_ticks();
+
+ thread->messages = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY);
+ if (!thread->messages) {
+ SPDK_ERRLOG("Unable to allocate memory for message ring\n");
+ free(thread);
+ return NULL;
+ }
+
+ /* Fill the local message pool cache. */
+ rc = spdk_mempool_get_bulk(g_spdk_msg_mempool, (void **)msgs, SPDK_MSG_MEMPOOL_CACHE_SIZE);
+ if (rc == 0) {
+ /* The bulk get succeeded, so prime the local cache. If it had
+ * failed, that would also be fine - the cache fills up organically
+ * as messages are passed to the thread. */
+ for (i = 0; i < SPDK_MSG_MEMPOOL_CACHE_SIZE; i++) {
+ SLIST_INSERT_HEAD(&thread->msg_cache, msgs[i], link);
+ thread->msg_cache_count++;
+ }
+ }
+
+ if (name) {
+ snprintf(thread->name, sizeof(thread->name), "%s", name);
+ } else {
+ snprintf(thread->name, sizeof(thread->name), "%p", thread);
+ }
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ if (g_thread_id == 0) {
+ SPDK_ERRLOG("Thread ID rolled over. Further thread creation is not allowed.\n");
+ pthread_mutex_unlock(&g_devlist_mutex);
+ _free_thread(thread);
+ return NULL;
+ }
+ thread->id = g_thread_id++;
+ TAILQ_INSERT_TAIL(&g_threads, thread, tailq);
+ g_thread_count++;
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Allocating new thread (%" PRIu64 ", %s)\n",
+ thread->id, thread->name);
+
+ if (g_new_thread_fn) {
+ rc = g_new_thread_fn(thread);
+ } else if (g_thread_op_supported_fn && g_thread_op_supported_fn(SPDK_THREAD_OP_NEW)) {
+ rc = g_thread_op_fn(thread, SPDK_THREAD_OP_NEW);
+ }
+
+ if (rc != 0) {
+ _free_thread(thread);
+ return NULL;
+ }
+
+ thread->state = SPDK_THREAD_STATE_RUNNING;
+
+ return thread;
+}
+
+void
+spdk_set_thread(struct spdk_thread *thread)
+{
+ tls_thread = thread;
+}
+
+static void
+thread_exit(struct spdk_thread *thread, uint64_t now)
+{
+ struct spdk_poller *poller;
+ struct spdk_io_channel *ch;
+
+ if (now >= thread->exit_timeout_tsc) {
+ SPDK_ERRLOG("thread %s got timeout, and move it to the exited state forcefully\n",
+ thread->name);
+ goto exited;
+ }
+
+ TAILQ_FOREACH(poller, &thread->active_pollers, tailq) {
+ if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) {
+ SPDK_INFOLOG(SPDK_LOG_THREAD,
+ "thread %s still has active poller %s\n",
+ thread->name, poller->name);
+ return;
+ }
+ }
+
+ TAILQ_FOREACH(poller, &thread->timed_pollers, tailq) {
+ if (poller->state != SPDK_POLLER_STATE_UNREGISTERED) {
+ SPDK_INFOLOG(SPDK_LOG_THREAD,
+ "thread %s still has active timed poller %s\n",
+ thread->name, poller->name);
+ return;
+ }
+ }
+
+ TAILQ_FOREACH(poller, &thread->paused_pollers, tailq) {
+ SPDK_INFOLOG(SPDK_LOG_THREAD,
+ "thread %s still has paused poller %s\n",
+ thread->name, poller->name);
+ return;
+ }
+
+ TAILQ_FOREACH(ch, &thread->io_channels, tailq) {
+ SPDK_INFOLOG(SPDK_LOG_THREAD,
+ "thread %s still has channel for io_device %s\n",
+ thread->name, ch->dev->name);
+ return;
+ }
+
+exited:
+ thread->state = SPDK_THREAD_STATE_EXITED;
+}
+
+int
+spdk_thread_exit(struct spdk_thread *thread)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Exit thread %s\n", thread->name);
+
+ assert(tls_thread == thread);
+
+ if (thread->state >= SPDK_THREAD_STATE_EXITING) {
+ SPDK_INFOLOG(SPDK_LOG_THREAD,
+ "thread %s is already exiting\n",
+ thread->name);
+ return 0;
+ }
+
+ thread->exit_timeout_tsc = spdk_get_ticks() + (spdk_get_ticks_hz() *
+ SPDK_THREAD_EXIT_TIMEOUT_SEC);
+ thread->state = SPDK_THREAD_STATE_EXITING;
+ return 0;
+}
+
+bool
+spdk_thread_is_exited(struct spdk_thread *thread)
+{
+ return thread->state == SPDK_THREAD_STATE_EXITED;
+}
+
+void
+spdk_thread_destroy(struct spdk_thread *thread)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Destroy thread %s\n", thread->name);
+
+ assert(thread->state == SPDK_THREAD_STATE_EXITED);
+
+ if (tls_thread == thread) {
+ tls_thread = NULL;
+ }
+
+ _free_thread(thread);
+}
+
+void *
+spdk_thread_get_ctx(struct spdk_thread *thread)
+{
+ if (g_ctx_sz > 0) {
+ return thread->ctx;
+ }
+
+ return NULL;
+}
+
+struct spdk_cpuset *
+spdk_thread_get_cpumask(struct spdk_thread *thread)
+{
+ return &thread->cpumask;
+}
+
+int
+spdk_thread_set_cpumask(struct spdk_cpuset *cpumask)
+{
+ struct spdk_thread *thread;
+
+ if (!g_thread_op_supported_fn || !g_thread_op_supported_fn(SPDK_THREAD_OP_RESCHED)) {
+ SPDK_ERRLOG("Framework does not support reschedule operation.\n");
+ assert(false);
+ return -ENOTSUP;
+ }
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("Called from non-SPDK thread\n");
+ assert(false);
+ return -EINVAL;
+ }
+
+ spdk_cpuset_copy(&thread->cpumask, cpumask);
+
+ /* Invoke framework's reschedule operation. If this function is called multiple times
+ * in a single spdk_thread_poll() context, the last cpumask will be used in the
+ * reschedule operation.
+ */
+ g_thread_op_fn(thread, SPDK_THREAD_OP_RESCHED);
+
+ return 0;
+}
+
+struct spdk_thread *
+spdk_thread_get_from_ctx(void *ctx)
+{
+ if (ctx == NULL) {
+ assert(false);
+ return NULL;
+ }
+
+ assert(g_ctx_sz > 0);
+
+ return SPDK_CONTAINEROF(ctx, struct spdk_thread, ctx);
+}
+
+static inline uint32_t
+msg_queue_run_batch(struct spdk_thread *thread, uint32_t max_msgs)
+{
+ unsigned count, i;
+ void *messages[SPDK_MSG_BATCH_SIZE];
+
+#ifdef DEBUG
+ /*
+ * spdk_ring_dequeue() fills messages and returns how many entries it wrote,
+ * so we will never actually read uninitialized data from the messages array, but just to be sure
+ * (and to silence a static analyzer false positive), initialize the array to NULL pointers.
+ */
+ memset(messages, 0, sizeof(messages));
+#endif
+
+ if (max_msgs > 0) {
+ max_msgs = spdk_min(max_msgs, SPDK_MSG_BATCH_SIZE);
+ } else {
+ max_msgs = SPDK_MSG_BATCH_SIZE;
+ }
+
+ count = spdk_ring_dequeue(thread->messages, messages, max_msgs);
+ if (count == 0) {
+ return 0;
+ }
+
+ for (i = 0; i < count; i++) {
+ struct spdk_msg *msg = messages[i];
+
+ assert(msg != NULL);
+ msg->fn(msg->arg);
+
+ if (thread->msg_cache_count < SPDK_MSG_MEMPOOL_CACHE_SIZE) {
+ /* Insert the messages at the head. We want to re-use the hot
+ * ones. */
+ SLIST_INSERT_HEAD(&thread->msg_cache, msg, link);
+ thread->msg_cache_count++;
+ } else {
+ spdk_mempool_put(g_spdk_msg_mempool, msg);
+ }
+ }
+
+ return count;
+}
+
+static void
+poller_insert_timer(struct spdk_thread *thread, struct spdk_poller *poller, uint64_t now)
+{
+ struct spdk_poller *iter;
+
+ poller->next_run_tick = now + poller->period_ticks;
+
+ /*
+ * Insert poller in the thread's timed_pollers list in sorted order by next scheduled
+ * run time.
+ */
+ TAILQ_FOREACH_REVERSE(iter, &thread->timed_pollers, timed_pollers_head, tailq) {
+ if (iter->next_run_tick <= poller->next_run_tick) {
+ TAILQ_INSERT_AFTER(&thread->timed_pollers, iter, poller, tailq);
+ return;
+ }
+ }
+
+ /* No earlier pollers were found, so this poller must be the new head */
+ TAILQ_INSERT_HEAD(&thread->timed_pollers, poller, tailq);
+}
+
+static void
+thread_insert_poller(struct spdk_thread *thread, struct spdk_poller *poller)
+{
+ if (poller->period_ticks) {
+ poller_insert_timer(thread, poller, spdk_get_ticks());
+ } else {
+ TAILQ_INSERT_TAIL(&thread->active_pollers, poller, tailq);
+ }
+}
+
+static inline void
+thread_update_stats(struct spdk_thread *thread, uint64_t end,
+ uint64_t start, int rc)
+{
+ if (rc == 0) {
+ /* Poller status idle */
+ thread->stats.idle_tsc += end - start;
+ } else if (rc > 0) {
+ /* Poller status busy */
+ thread->stats.busy_tsc += end - start;
+ }
+ /* Store end time to use it as start time of the next spdk_thread_poll(). */
+ thread->tsc_last = end;
+}
+
+static int
+thread_poll(struct spdk_thread *thread, uint32_t max_msgs, uint64_t now)
+{
+ uint32_t msg_count;
+ struct spdk_poller *poller, *tmp;
+ spdk_msg_fn critical_msg;
+ int rc = 0;
+
+ critical_msg = thread->critical_msg;
+ if (spdk_unlikely(critical_msg != NULL)) {
+ critical_msg(NULL);
+ thread->critical_msg = NULL;
+ }
+
+ msg_count = msg_queue_run_batch(thread, max_msgs);
+ if (msg_count) {
+ rc = 1;
+ }
+
+ TAILQ_FOREACH_REVERSE_SAFE(poller, &thread->active_pollers,
+ active_pollers_head, tailq, tmp) {
+ int poller_rc;
+
+ if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) {
+ TAILQ_REMOVE(&thread->active_pollers, poller, tailq);
+ free(poller);
+ continue;
+ } else if (poller->state == SPDK_POLLER_STATE_PAUSING) {
+ TAILQ_REMOVE(&thread->active_pollers, poller, tailq);
+ TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq);
+ poller->state = SPDK_POLLER_STATE_PAUSED;
+ continue;
+ }
+
+ poller->state = SPDK_POLLER_STATE_RUNNING;
+ poller_rc = poller->fn(poller->arg);
+
+ poller->run_count++;
+ if (poller_rc > 0) {
+ poller->busy_count++;
+ }
+
+#ifdef DEBUG
+ if (poller_rc == -1) {
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Poller %s returned -1\n", poller->name);
+ }
+#endif
+
+ if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) {
+ TAILQ_REMOVE(&thread->active_pollers, poller, tailq);
+ free(poller);
+ } else if (poller->state != SPDK_POLLER_STATE_PAUSED) {
+ poller->state = SPDK_POLLER_STATE_WAITING;
+ }
+
+ if (poller_rc > rc) {
+ rc = poller_rc;
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(poller, &thread->timed_pollers, tailq, tmp) {
+ int timer_rc = 0;
+
+ if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) {
+ TAILQ_REMOVE(&thread->timed_pollers, poller, tailq);
+ free(poller);
+ continue;
+ } else if (poller->state == SPDK_POLLER_STATE_PAUSING) {
+ TAILQ_REMOVE(&thread->timed_pollers, poller, tailq);
+ TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq);
+ poller->state = SPDK_POLLER_STATE_PAUSED;
+ continue;
+ }
+
+ if (now < poller->next_run_tick) {
+ break;
+ }
+
+ poller->state = SPDK_POLLER_STATE_RUNNING;
+ timer_rc = poller->fn(poller->arg);
+
+ poller->run_count++;
+ if (timer_rc > 0) {
+ poller->busy_count++;
+ }
+
+#ifdef DEBUG
+ if (timer_rc == -1) {
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Timed poller %s returned -1\n", poller->name);
+ }
+#endif
+
+ if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) {
+ TAILQ_REMOVE(&thread->timed_pollers, poller, tailq);
+ free(poller);
+ } else if (poller->state != SPDK_POLLER_STATE_PAUSED) {
+ poller->state = SPDK_POLLER_STATE_WAITING;
+ TAILQ_REMOVE(&thread->timed_pollers, poller, tailq);
+ poller_insert_timer(thread, poller, now);
+ }
+
+ if (timer_rc > rc) {
+ rc = timer_rc;
+ }
+ }
+
+ return rc;
+}
+
+int
+spdk_thread_poll(struct spdk_thread *thread, uint32_t max_msgs, uint64_t now)
+{
+ struct spdk_thread *orig_thread;
+ int rc;
+
+ orig_thread = _get_thread();
+ tls_thread = thread;
+
+ if (now == 0) {
+ now = spdk_get_ticks();
+ }
+
+ rc = thread_poll(thread, max_msgs, now);
+
+ if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITING)) {
+ thread_exit(thread, now);
+ }
+
+ thread_update_stats(thread, spdk_get_ticks(), now, rc);
+
+ tls_thread = orig_thread;
+
+ return rc;
+}
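
A sketch (not part of the patch) of how a framework or a unit test can drive a lightweight thread with these entry points, assuming the env layer is already initialized so the message mempool can be created; "worker" and hello_msg() are illustrative names.

    #include "spdk/stdinc.h"
    #include "spdk/thread.h"

    static void
    hello_msg(void *ctx)
    {
    	printf("message running on thread %s\n", spdk_thread_get_name(spdk_get_thread()));
    }

    static int
    run_one_thread(void)
    {
    	struct spdk_thread *thread;

    	if (spdk_thread_lib_init(NULL, 0) != 0) {
    		return -1;
    	}

    	thread = spdk_thread_create("worker", NULL);
    	if (thread == NULL) {
    		spdk_thread_lib_fini();
    		return -1;
    	}

    	/* Queue a message; it runs on the next poll of the target thread. */
    	spdk_thread_send_msg(thread, hello_msg, NULL);

    	/* Each poll runs queued messages and registered pollers once. */
    	while (!spdk_thread_is_idle(thread)) {
    		spdk_thread_poll(thread, 0, 0);
    	}

    	/* spdk_thread_exit() must be called from the thread's own context. */
    	spdk_set_thread(thread);
    	spdk_thread_exit(thread);
    	spdk_set_thread(NULL);

    	while (!spdk_thread_is_exited(thread)) {
    		spdk_thread_poll(thread, 0, 0);
    	}

    	spdk_thread_destroy(thread);
    	spdk_thread_lib_fini();
    	return 0;
    }
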
+
+uint64_t
+spdk_thread_next_poller_expiration(struct spdk_thread *thread)
+{
+ struct spdk_poller *poller;
+
+ poller = TAILQ_FIRST(&thread->timed_pollers);
+ if (poller) {
+ return poller->next_run_tick;
+ }
+
+ return 0;
+}
+
+int
+spdk_thread_has_active_pollers(struct spdk_thread *thread)
+{
+ return !TAILQ_EMPTY(&thread->active_pollers);
+}
+
+static bool
+thread_has_unpaused_pollers(struct spdk_thread *thread)
+{
+ if (TAILQ_EMPTY(&thread->active_pollers) &&
+ TAILQ_EMPTY(&thread->timed_pollers)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool
+spdk_thread_has_pollers(struct spdk_thread *thread)
+{
+ if (!thread_has_unpaused_pollers(thread) &&
+ TAILQ_EMPTY(&thread->paused_pollers)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool
+spdk_thread_is_idle(struct spdk_thread *thread)
+{
+ if (spdk_ring_count(thread->messages) ||
+ thread_has_unpaused_pollers(thread) ||
+ thread->critical_msg != NULL) {
+ return false;
+ }
+
+ return true;
+}
+
+uint32_t
+spdk_thread_get_count(void)
+{
+ /*
+ * Return cached value of the current thread count. We could acquire the
+ * lock and iterate through the TAILQ of threads to count them, but that
+ * count could still be invalidated after we release the lock.
+ */
+ return g_thread_count;
+}
+
+struct spdk_thread *
+spdk_get_thread(void)
+{
+ return _get_thread();
+}
+
+const char *
+spdk_thread_get_name(const struct spdk_thread *thread)
+{
+ return thread->name;
+}
+
+uint64_t
+spdk_thread_get_id(const struct spdk_thread *thread)
+{
+ return thread->id;
+}
+
+struct spdk_thread *
+spdk_thread_get_by_id(uint64_t id)
+{
+ struct spdk_thread *thread;
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ TAILQ_FOREACH(thread, &g_threads, tailq) {
+ if (thread->id == id) {
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ return thread;
+ }
+ }
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ return NULL;
+}
+
+int
+spdk_thread_get_stats(struct spdk_thread_stats *stats)
+{
+ struct spdk_thread *thread;
+
+ thread = _get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("No thread allocated\n");
+ return -EINVAL;
+ }
+
+ if (stats == NULL) {
+ return -EINVAL;
+ }
+
+ *stats = thread->stats;
+
+ return 0;
+}
+
+uint64_t
+spdk_thread_get_last_tsc(struct spdk_thread *thread)
+{
+ return thread->tsc_last;
+}
+
+int
+spdk_thread_send_msg(const struct spdk_thread *thread, spdk_msg_fn fn, void *ctx)
+{
+ struct spdk_thread *local_thread;
+ struct spdk_msg *msg;
+ int rc;
+
+ assert(thread != NULL);
+
+ if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) {
+ SPDK_ERRLOG("Thread %s is marked as exited.\n", thread->name);
+ return -EIO;
+ }
+
+ local_thread = _get_thread();
+
+ msg = NULL;
+ if (local_thread != NULL) {
+ if (local_thread->msg_cache_count > 0) {
+ msg = SLIST_FIRST(&local_thread->msg_cache);
+ assert(msg != NULL);
+ SLIST_REMOVE_HEAD(&local_thread->msg_cache, link);
+ local_thread->msg_cache_count--;
+ }
+ }
+
+ if (msg == NULL) {
+ msg = spdk_mempool_get(g_spdk_msg_mempool);
+ if (!msg) {
+ SPDK_ERRLOG("msg could not be allocated\n");
+ return -ENOMEM;
+ }
+ }
+
+ msg->fn = fn;
+ msg->arg = ctx;
+
+ rc = spdk_ring_enqueue(thread->messages, (void **)&msg, 1, NULL);
+ if (rc != 1) {
+ SPDK_ERRLOG("msg could not be enqueued\n");
+ spdk_mempool_put(g_spdk_msg_mempool, msg);
+ return -EIO;
+ }
+
+ return 0;
+}
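+
+/*
+ * Note: delivery is asynchronous; fn(ctx) runs on the target thread the next
+ * time that thread is polled via spdk_thread_poll(). The spdk_msg object is
+ * taken from the sending thread's per-thread cache when one is available and
+ * falls back to the global message mempool otherwise.
+ */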
+
+int
+spdk_thread_send_critical_msg(struct spdk_thread *thread, spdk_msg_fn fn)
+{
+ spdk_msg_fn expected = NULL;
+
+ if (__atomic_compare_exchange_n(&thread->critical_msg, &expected, fn, false, __ATOMIC_SEQ_CST,
+ __ATOMIC_SEQ_CST)) {
+ return 0;
+ }
+
+ return -EIO;
+}
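+
+/*
+ * Note: at most one critical message can be pending per thread; the
+ * compare-and-swap above returns -EIO if a previously sent critical message
+ * has not yet been consumed by spdk_thread_poll().
+ */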
+
+static struct spdk_poller *
+poller_register(spdk_poller_fn fn,
+ void *arg,
+ uint64_t period_microseconds,
+ const char *name)
+{
+ struct spdk_thread *thread;
+ struct spdk_poller *poller;
+ uint64_t quotient, remainder, ticks;
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ assert(false);
+ return NULL;
+ }
+
+ if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) {
+ SPDK_ERRLOG("thread %s is marked as exited\n", thread->name);
+ return NULL;
+ }
+
+ poller = calloc(1, sizeof(*poller));
+ if (poller == NULL) {
+ SPDK_ERRLOG("Poller memory allocation failed\n");
+ return NULL;
+ }
+
+ if (name) {
+ snprintf(poller->name, sizeof(poller->name), "%s", name);
+ } else {
+ snprintf(poller->name, sizeof(poller->name), "%p", fn);
+ }
+
+ poller->state = SPDK_POLLER_STATE_WAITING;
+ poller->fn = fn;
+ poller->arg = arg;
+ poller->thread = thread;
+
+ if (period_microseconds) {
+ quotient = period_microseconds / SPDK_SEC_TO_USEC;
+ remainder = period_microseconds % SPDK_SEC_TO_USEC;
+ ticks = spdk_get_ticks_hz();
+
+ poller->period_ticks = ticks * quotient + (ticks * remainder) / SPDK_SEC_TO_USEC;
+ } else {
+ poller->period_ticks = 0;
+ }
+
+ thread_insert_poller(thread, poller);
+
+ return poller;
+}
+
+struct spdk_poller *
+spdk_poller_register(spdk_poller_fn fn,
+ void *arg,
+ uint64_t period_microseconds)
+{
+ return poller_register(fn, arg, period_microseconds, NULL);
+}
+
+struct spdk_poller *
+spdk_poller_register_named(spdk_poller_fn fn,
+ void *arg,
+ uint64_t period_microseconds,
+ const char *name)
+{
+ return poller_register(fn, arg, period_microseconds, name);
+}
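+
+/*
+ * Illustrative usage sketch (my_poll_fn and my_ctx are hypothetical, not part
+ * of this file): a module registers a poller from its own SPDK thread and
+ * tears it down with spdk_poller_unregister(), e.g.
+ *
+ *   struct spdk_poller *p = spdk_poller_register(my_poll_fn, my_ctx, 1000);
+ *   ...
+ *   spdk_poller_unregister(&p);
+ *
+ * A period of 0 creates an active poller that runs on every iteration of
+ * spdk_thread_poll(); a non-zero period is in microseconds and places the
+ * poller on the timed list.
+ */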
+
+void
+spdk_poller_unregister(struct spdk_poller **ppoller)
+{
+ struct spdk_thread *thread;
+ struct spdk_poller *poller;
+
+ poller = *ppoller;
+ if (poller == NULL) {
+ return;
+ }
+
+ *ppoller = NULL;
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ assert(false);
+ return;
+ }
+
+ if (poller->thread != thread) {
+ SPDK_ERRLOG("spdk_poller_unregister() called from a different thread than the one that registered the poller\n");
+ assert(false);
+ return;
+ }
+
+ /* If the poller was paused, put it on the active_pollers list so that
+ * its unregistration can be processed by spdk_thread_poll().
+ */
+ if (poller->state == SPDK_POLLER_STATE_PAUSED) {
+ TAILQ_REMOVE(&thread->paused_pollers, poller, tailq);
+ TAILQ_INSERT_TAIL(&thread->active_pollers, poller, tailq);
+ poller->period_ticks = 0;
+ }
+
+ /* Simply set the state to unregistered. The poller will get cleaned up
+ * in a subsequent call to spdk_thread_poll().
+ */
+ poller->state = SPDK_POLLER_STATE_UNREGISTERED;
+}
+
+void
+spdk_poller_pause(struct spdk_poller *poller)
+{
+ struct spdk_thread *thread;
+
+ if (poller->state == SPDK_POLLER_STATE_PAUSED ||
+ poller->state == SPDK_POLLER_STATE_PAUSING) {
+ return;
+ }
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ assert(false);
+ return;
+ }
+
+ /* If a poller is paused from within itself, we can immediately move it
+ * onto the paused_pollers list. Otherwise we just set its state to
+ * SPDK_POLLER_STATE_PAUSING and let spdk_thread_poll() move it. This
+ * allows a poller to be paused from another poller's context without
+ * breaking the TAILQ_FOREACH_REVERSE_SAFE iteration.
+ */
+ if (poller->state != SPDK_POLLER_STATE_RUNNING) {
+ poller->state = SPDK_POLLER_STATE_PAUSING;
+ } else {
+ if (poller->period_ticks > 0) {
+ TAILQ_REMOVE(&thread->timed_pollers, poller, tailq);
+ } else {
+ TAILQ_REMOVE(&thread->active_pollers, poller, tailq);
+ }
+
+ TAILQ_INSERT_TAIL(&thread->paused_pollers, poller, tailq);
+ poller->state = SPDK_POLLER_STATE_PAUSED;
+ }
+}
+
+void
+spdk_poller_resume(struct spdk_poller *poller)
+{
+ struct spdk_thread *thread;
+
+ if (poller->state != SPDK_POLLER_STATE_PAUSED &&
+ poller->state != SPDK_POLLER_STATE_PAUSING) {
+ return;
+ }
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ assert(false);
+ return;
+ }
+
+ /* If a poller is paused it has to be removed from the paused pollers
+ * list and put on the active / timer list depending on its
+ * period_ticks. If a poller is still in the process of being paused,
+ * we just need to flip its state back to waiting, as it's already on
+ * the appropriate list.
+ */
+ if (poller->state == SPDK_POLLER_STATE_PAUSED) {
+ TAILQ_REMOVE(&thread->paused_pollers, poller, tailq);
+ thread_insert_poller(thread, poller);
+ }
+
+ poller->state = SPDK_POLLER_STATE_WAITING;
+}
+
+const char *
+spdk_poller_state_str(enum spdk_poller_state state)
+{
+ switch (state) {
+ case SPDK_POLLER_STATE_WAITING:
+ return "waiting";
+ case SPDK_POLLER_STATE_RUNNING:
+ return "running";
+ case SPDK_POLLER_STATE_UNREGISTERED:
+ return "unregistered";
+ case SPDK_POLLER_STATE_PAUSING:
+ return "pausing";
+ case SPDK_POLLER_STATE_PAUSED:
+ return "paused";
+ default:
+ return NULL;
+ }
+}
+
+struct call_thread {
+ struct spdk_thread *cur_thread;
+ spdk_msg_fn fn;
+ void *ctx;
+
+ struct spdk_thread *orig_thread;
+ spdk_msg_fn cpl;
+};
+
+static void
+_on_thread(void *ctx)
+{
+ struct call_thread *ct = ctx;
+ int rc __attribute__((unused));
+
+ ct->fn(ct->ctx);
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ ct->cur_thread = TAILQ_NEXT(ct->cur_thread, tailq);
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ if (!ct->cur_thread) {
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Completed thread iteration\n");
+
+ rc = spdk_thread_send_msg(ct->orig_thread, ct->cpl, ct->ctx);
+ free(ctx);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Continuing thread iteration to %s\n",
+ ct->cur_thread->name);
+
+ rc = spdk_thread_send_msg(ct->cur_thread, _on_thread, ctx);
+ }
+ assert(rc == 0);
+}
+
+void
+spdk_for_each_thread(spdk_msg_fn fn, void *ctx, spdk_msg_fn cpl)
+{
+ struct call_thread *ct;
+ struct spdk_thread *thread;
+ int rc __attribute__((unused));
+
+ ct = calloc(1, sizeof(*ct));
+ if (!ct) {
+ SPDK_ERRLOG("Unable to perform thread iteration\n");
+ cpl(ctx);
+ return;
+ }
+
+ ct->fn = fn;
+ ct->ctx = ctx;
+ ct->cpl = cpl;
+
+ thread = _get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("No thread allocated\n");
+ free(ct);
+ cpl(ctx);
+ return;
+ }
+ ct->orig_thread = thread;
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ ct->cur_thread = TAILQ_FIRST(&g_threads);
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Starting thread iteration from %s\n",
+ ct->orig_thread->name);
+
+ rc = spdk_thread_send_msg(ct->cur_thread, _on_thread, ct);
+ assert(rc == 0);
+}
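+
+/*
+ * Note: fn(ctx) is executed once on every registered thread, passing from one
+ * thread to the next in g_threads order; after the last thread has run fn,
+ * cpl(ctx) is sent back to the thread that called spdk_for_each_thread().
+ */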
+
+void
+spdk_io_device_register(void *io_device, spdk_io_channel_create_cb create_cb,
+ spdk_io_channel_destroy_cb destroy_cb, uint32_t ctx_size,
+ const char *name)
+{
+ struct io_device *dev, *tmp;
+ struct spdk_thread *thread;
+
+ assert(io_device != NULL);
+ assert(create_cb != NULL);
+ assert(destroy_cb != NULL);
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("called from non-SPDK thread\n");
+ assert(false);
+ return;
+ }
+
+ dev = calloc(1, sizeof(struct io_device));
+ if (dev == NULL) {
+ SPDK_ERRLOG("could not allocate io_device\n");
+ return;
+ }
+
+ dev->io_device = io_device;
+ if (name) {
+ snprintf(dev->name, sizeof(dev->name), "%s", name);
+ } else {
+ snprintf(dev->name, sizeof(dev->name), "%p", dev);
+ }
+ dev->create_cb = create_cb;
+ dev->destroy_cb = destroy_cb;
+ dev->unregister_cb = NULL;
+ dev->ctx_size = ctx_size;
+ dev->for_each_count = 0;
+ dev->unregistered = false;
+ dev->refcnt = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Registering io_device %s (%p) on thread %s\n",
+ dev->name, dev->io_device, thread->name);
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ TAILQ_FOREACH(tmp, &g_io_devices, tailq) {
+ if (tmp->io_device == io_device) {
+ SPDK_ERRLOG("io_device %p already registered (old:%s new:%s)\n",
+ io_device, tmp->name, dev->name);
+ free(dev);
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return;
+ }
+ }
+ TAILQ_INSERT_TAIL(&g_io_devices, dev, tailq);
+ pthread_mutex_unlock(&g_devlist_mutex);
+}
+
+static void
+_finish_unregister(void *arg)
+{
+ struct io_device *dev = arg;
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Finishing unregistration of io_device %s (%p) on thread %s\n",
+ dev->name, dev->io_device, dev->unregister_thread->name);
+
+ dev->unregister_cb(dev->io_device);
+ free(dev);
+}
+
+static void
+io_device_free(struct io_device *dev)
+{
+ int rc __attribute__((unused));
+
+ if (dev->unregister_cb == NULL) {
+ free(dev);
+ } else {
+ assert(dev->unregister_thread != NULL);
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "io_device %s (%p) needs to unregister from thread %s\n",
+ dev->name, dev->io_device, dev->unregister_thread->name);
+ rc = spdk_thread_send_msg(dev->unregister_thread, _finish_unregister, dev);
+ assert(rc == 0);
+ }
+}
+
+void
+spdk_io_device_unregister(void *io_device, spdk_io_device_unregister_cb unregister_cb)
+{
+ struct io_device *dev;
+ uint32_t refcnt;
+ struct spdk_thread *thread;
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("called from non-SPDK thread\n");
+ assert(false);
+ return;
+ }
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ TAILQ_FOREACH(dev, &g_io_devices, tailq) {
+ if (dev->io_device == io_device) {
+ break;
+ }
+ }
+
+ if (!dev) {
+ SPDK_ERRLOG("io_device %p not found\n", io_device);
+ assert(false);
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return;
+ }
+
+ if (dev->for_each_count > 0) {
+ SPDK_ERRLOG("io_device %s (%p) has %u for_each calls outstanding\n",
+ dev->name, io_device, dev->for_each_count);
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return;
+ }
+
+ dev->unregister_cb = unregister_cb;
+ dev->unregistered = true;
+ TAILQ_REMOVE(&g_io_devices, dev, tailq);
+ refcnt = dev->refcnt;
+ dev->unregister_thread = thread;
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Unregistering io_device %s (%p) from thread %s\n",
+ dev->name, dev->io_device, thread->name);
+
+ if (refcnt > 0) {
+ /* defer deletion */
+ return;
+ }
+
+ io_device_free(dev);
+}
+
+const char *
+spdk_io_device_get_name(struct io_device *dev)
+{
+ return dev->name;
+}
+
+struct spdk_io_channel *
+spdk_get_io_channel(void *io_device)
+{
+ struct spdk_io_channel *ch;
+ struct spdk_thread *thread;
+ struct io_device *dev;
+ int rc;
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ TAILQ_FOREACH(dev, &g_io_devices, tailq) {
+ if (dev->io_device == io_device) {
+ break;
+ }
+ }
+ if (dev == NULL) {
+ SPDK_ERRLOG("could not find io_device %p\n", io_device);
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return NULL;
+ }
+
+ thread = _get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("No thread allocated\n");
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return NULL;
+ }
+
+ if (spdk_unlikely(thread->state == SPDK_THREAD_STATE_EXITED)) {
+ SPDK_ERRLOG("Thread %s is marked as exited\n", thread->name);
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return NULL;
+ }
+
+ TAILQ_FOREACH(ch, &thread->io_channels, tailq) {
+ if (ch->dev == dev) {
+ ch->ref++;
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Get io_channel %p for io_device %s (%p) on thread %s refcnt %u\n",
+ ch, dev->name, dev->io_device, thread->name, ch->ref);
+
+ /*
+ * An I/O channel already exists for this device on this
+ * thread, so return it.
+ */
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return ch;
+ }
+ }
+
+ ch = calloc(1, sizeof(*ch) + dev->ctx_size);
+ if (ch == NULL) {
+ SPDK_ERRLOG("could not calloc spdk_io_channel\n");
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return NULL;
+ }
+
+ ch->dev = dev;
+ ch->destroy_cb = dev->destroy_cb;
+ ch->thread = thread;
+ ch->ref = 1;
+ ch->destroy_ref = 0;
+ TAILQ_INSERT_TAIL(&thread->io_channels, ch, tailq);
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Get io_channel %p for io_device %s (%p) on thread %s refcnt %u\n",
+ ch, dev->name, dev->io_device, thread->name, ch->ref);
+
+ dev->refcnt++;
+
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch));
+ if (rc != 0) {
+ pthread_mutex_lock(&g_devlist_mutex);
+ TAILQ_REMOVE(&ch->thread->io_channels, ch, tailq);
+ dev->refcnt--;
+ free(ch);
+ pthread_mutex_unlock(&g_devlist_mutex);
+ return NULL;
+ }
+
+ return ch;
+}
+
+static void
+put_io_channel(void *arg)
+{
+ struct spdk_io_channel *ch = arg;
+ bool do_remove_dev = true;
+ struct spdk_thread *thread;
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("called from non-SPDK thread\n");
+ assert(false);
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD,
+ "Releasing io_channel %p for io_device %s (%p) on thread %s\n",
+ ch, ch->dev->name, ch->dev->io_device, thread->name);
+
+ assert(ch->thread == thread);
+
+ ch->destroy_ref--;
+
+ if (ch->ref > 0 || ch->destroy_ref > 0) {
+ /*
+ * Another reference to the associated io_device was requested
+ * after this message was sent but before it had a chance to
+ * execute.
+ */
+ return;
+ }
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ TAILQ_REMOVE(&ch->thread->io_channels, ch, tailq);
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ /* Don't hold the devlist mutex while the destroy_cb is called. */
+ ch->destroy_cb(ch->dev->io_device, spdk_io_channel_get_ctx(ch));
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ ch->dev->refcnt--;
+
+ if (!ch->dev->unregistered) {
+ do_remove_dev = false;
+ }
+
+ if (ch->dev->refcnt > 0) {
+ do_remove_dev = false;
+ }
+
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ if (do_remove_dev) {
+ io_device_free(ch->dev);
+ }
+ free(ch);
+}
+
+void
+spdk_put_io_channel(struct spdk_io_channel *ch)
+{
+ struct spdk_thread *thread;
+ int rc __attribute__((unused));
+
+ thread = spdk_get_thread();
+ if (!thread) {
+ SPDK_ERRLOG("called from non-SPDK thread\n");
+ assert(false);
+ return;
+ }
+
+ if (ch->thread != thread) {
+ SPDK_ERRLOG("spdk_put_io_channel() called from a different thread than the one that called spdk_get_io_channel()\n");
+ assert(false);
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_THREAD,
+ "Putting io_channel %p for io_device %s (%p) on thread %s refcnt %u\n",
+ ch, ch->dev->name, ch->dev->io_device, thread->name, ch->ref);
+
+ ch->ref--;
+
+ if (ch->ref == 0) {
+ ch->destroy_ref++;
+ rc = spdk_thread_send_msg(thread, put_io_channel, ch);
+ assert(rc == 0);
+ }
+}
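+
+/*
+ * Illustrative lifecycle sketch (g_dev, my_channel_ctx and the callbacks are
+ * hypothetical names): an io_device is registered once, each thread that
+ * needs it takes its own channel, and the per-channel context of ctx_size
+ * bytes is reached through spdk_io_channel_get_ctx():
+ *
+ *   spdk_io_device_register(&g_dev, create_cb, destroy_cb,
+ *                           sizeof(struct my_channel_ctx), "my_dev");
+ *   struct spdk_io_channel *ch = spdk_get_io_channel(&g_dev);
+ *   struct my_channel_ctx *ctx = spdk_io_channel_get_ctx(ch);
+ *   ...
+ *   spdk_put_io_channel(ch);
+ *   spdk_io_device_unregister(&g_dev, unregister_cb);
+ */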
+
+struct spdk_io_channel *
+spdk_io_channel_from_ctx(void *ctx)
+{
+ return (struct spdk_io_channel *)((uint8_t *)ctx - sizeof(struct spdk_io_channel));
+}
+
+struct spdk_thread *
+spdk_io_channel_get_thread(struct spdk_io_channel *ch)
+{
+ return ch->thread;
+}
+
+struct spdk_io_channel_iter {
+ void *io_device;
+ struct io_device *dev;
+ spdk_channel_msg fn;
+ int status;
+ void *ctx;
+ struct spdk_io_channel *ch;
+
+ struct spdk_thread *cur_thread;
+
+ struct spdk_thread *orig_thread;
+ spdk_channel_for_each_cpl cpl;
+};
+
+void *
+spdk_io_channel_iter_get_io_device(struct spdk_io_channel_iter *i)
+{
+ return i->io_device;
+}
+
+struct spdk_io_channel *
+spdk_io_channel_iter_get_channel(struct spdk_io_channel_iter *i)
+{
+ return i->ch;
+}
+
+void *
+spdk_io_channel_iter_get_ctx(struct spdk_io_channel_iter *i)
+{
+ return i->ctx;
+}
+
+static void
+_call_completion(void *ctx)
+{
+ struct spdk_io_channel_iter *i = ctx;
+
+ if (i->cpl != NULL) {
+ i->cpl(i, i->status);
+ }
+ free(i);
+}
+
+static void
+_call_channel(void *ctx)
+{
+ struct spdk_io_channel_iter *i = ctx;
+ struct spdk_io_channel *ch;
+
+ /*
+ * It is possible that the channel was deleted before this
+ * message had a chance to execute. If so, skip calling
+ * the fn() on this thread.
+ */
+ pthread_mutex_lock(&g_devlist_mutex);
+ TAILQ_FOREACH(ch, &i->cur_thread->io_channels, tailq) {
+ if (ch->dev->io_device == i->io_device) {
+ break;
+ }
+ }
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ if (ch) {
+ i->fn(i);
+ } else {
+ spdk_for_each_channel_continue(i, 0);
+ }
+}
+
+void
+spdk_for_each_channel(void *io_device, spdk_channel_msg fn, void *ctx,
+ spdk_channel_for_each_cpl cpl)
+{
+ struct spdk_thread *thread;
+ struct spdk_io_channel *ch;
+ struct spdk_io_channel_iter *i;
+ int rc __attribute__((unused));
+
+ i = calloc(1, sizeof(*i));
+ if (!i) {
+ SPDK_ERRLOG("Unable to allocate iterator\n");
+ return;
+ }
+
+ i->io_device = io_device;
+ i->fn = fn;
+ i->ctx = ctx;
+ i->cpl = cpl;
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ i->orig_thread = _get_thread();
+
+ TAILQ_FOREACH(thread, &g_threads, tailq) {
+ TAILQ_FOREACH(ch, &thread->io_channels, tailq) {
+ if (ch->dev->io_device == io_device) {
+ ch->dev->for_each_count++;
+ i->dev = ch->dev;
+ i->cur_thread = thread;
+ i->ch = ch;
+ pthread_mutex_unlock(&g_devlist_mutex);
+ rc = spdk_thread_send_msg(thread, _call_channel, i);
+ assert(rc == 0);
+ return;
+ }
+ }
+ }
+
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ rc = spdk_thread_send_msg(i->orig_thread, _call_completion, i);
+ assert(rc == 0);
+}
+
+void
+spdk_for_each_channel_continue(struct spdk_io_channel_iter *i, int status)
+{
+ struct spdk_thread *thread;
+ struct spdk_io_channel *ch;
+ int rc __attribute__((unused));
+
+ assert(i->cur_thread == spdk_get_thread());
+
+ i->status = status;
+
+ pthread_mutex_lock(&g_devlist_mutex);
+ if (status) {
+ goto end;
+ }
+ thread = TAILQ_NEXT(i->cur_thread, tailq);
+ while (thread) {
+ TAILQ_FOREACH(ch, &thread->io_channels, tailq) {
+ if (ch->dev->io_device == i->io_device) {
+ i->cur_thread = thread;
+ i->ch = ch;
+ pthread_mutex_unlock(&g_devlist_mutex);
+ rc = spdk_thread_send_msg(thread, _call_channel, i);
+ assert(rc == 0);
+ return;
+ }
+ }
+ thread = TAILQ_NEXT(thread, tailq);
+ }
+
+end:
+ i->dev->for_each_count--;
+ i->ch = NULL;
+ pthread_mutex_unlock(&g_devlist_mutex);
+
+ rc = spdk_thread_send_msg(i->orig_thread, _call_completion, i);
+ assert(rc == 0);
+}
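+
+/*
+ * Note on usage: the per-channel callback fn runs once on each thread that
+ * currently holds a channel for the io_device. It obtains its channel with
+ * spdk_io_channel_iter_get_channel() and must finish by calling
+ * spdk_for_each_channel_continue(); a non-zero status stops the iteration
+ * and is reported to cpl, which then runs on the thread that started it.
+ */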
+
+
+SPDK_LOG_REGISTER_COMPONENT("thread", SPDK_LOG_THREAD)
diff --git a/src/spdk/lib/trace/Makefile b/src/spdk/lib/trace/Makefile
new file mode 100644
index 000000000..9102c320a
--- /dev/null
+++ b/src/spdk/lib/trace/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = trace.c trace_flags.c trace_rpc.c
+LIBNAME = trace
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_trace.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/trace/spdk_trace.map b/src/spdk/lib/trace/spdk_trace.map
new file mode 100644
index 000000000..14a03b337
--- /dev/null
+++ b/src/spdk/lib/trace/spdk_trace.map
@@ -0,0 +1,29 @@
+{
+ global:
+
+ # public functions
+ _spdk_trace_record;
+ spdk_trace_get_tpoint_mask;
+ spdk_trace_set_tpoints;
+ spdk_trace_clear_tpoints;
+ spdk_trace_get_tpoint_group_mask;
+ spdk_trace_set_tpoint_group_mask;
+ spdk_trace_clear_tpoint_group_mask;
+ spdk_trace_init;
+ spdk_trace_cleanup;
+ spdk_trace_flags_init;
+ spdk_trace_register_owner;
+ spdk_trace_register_object;
+ spdk_trace_register_description;
+ spdk_trace_get_first_register_fn;
+ spdk_trace_get_next_register_fn;
+ spdk_trace_enable_tpoint_group;
+ spdk_trace_disable_tpoint_group;
+ spdk_trace_mask_usage;
+ spdk_trace_add_register_fn;
+
+ # public variables
+ g_trace_histories;
+
+ local: *;
+};
diff --git a/src/spdk/lib/trace/trace.c b/src/spdk/lib/trace/trace.c
new file mode 100644
index 000000000..621c52aae
--- /dev/null
+++ b/src/spdk/lib/trace/trace.c
@@ -0,0 +1,201 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/string.h"
+#include "spdk/trace.h"
+#include "spdk/util.h"
+#include "spdk/barrier.h"
+#include "spdk/log.h"
+
+static int g_trace_fd = -1;
+static char g_shm_name[64];
+
+struct spdk_trace_histories *g_trace_histories;
+
+void
+_spdk_trace_record(uint64_t tsc, uint16_t tpoint_id, uint16_t poller_id, uint32_t size,
+ uint64_t object_id, uint64_t arg1)
+{
+ struct spdk_trace_history *lcore_history;
+ struct spdk_trace_entry *next_entry;
+ unsigned lcore;
+ uint64_t next_circular_entry;
+
+ lcore = spdk_env_get_current_core();
+ if (lcore >= SPDK_TRACE_MAX_LCORE) {
+ return;
+ }
+
+ lcore_history = spdk_get_per_lcore_history(g_trace_histories, lcore);
+ if (tsc == 0) {
+ tsc = spdk_get_ticks();
+ }
+
+ lcore_history->tpoint_count[tpoint_id]++;
+
+ /* Get next entry index in the circular buffer */
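+ /* The masking assumes num_entries is a power of two */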
+ next_circular_entry = lcore_history->next_entry & (lcore_history->num_entries - 1);
+ next_entry = &lcore_history->entries[next_circular_entry];
+ next_entry->tsc = tsc;
+ next_entry->tpoint_id = tpoint_id;
+ next_entry->poller_id = poller_id;
+ next_entry->size = size;
+ next_entry->object_id = object_id;
+ next_entry->arg1 = arg1;
+
+ /* Ensure all elements of the trace entry are visible to outside trace tools */
+ spdk_smp_wmb();
+ lcore_history->next_entry++;
+}
+
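+/*
+ * The shared-memory file created here holds a struct spdk_trace_flags header
+ * followed by one spdk_trace_history region per lcore; lcore_offsets[] records
+ * where each per-lcore region begins within the file.
+ */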
+int
+spdk_trace_init(const char *shm_name, uint64_t num_entries)
+{
+ int i = 0;
+ int histories_size;
+ uint64_t lcore_offsets[SPDK_TRACE_MAX_LCORE + 1];
+
+ /* 0 entries requested - skip trace initialization */
+ if (num_entries == 0) {
+ return 0;
+ }
+
+ lcore_offsets[0] = sizeof(struct spdk_trace_flags);
+ for (i = 1; i < (int)SPDK_COUNTOF(lcore_offsets); i++) {
+ lcore_offsets[i] = spdk_get_trace_history_size(num_entries) + lcore_offsets[i - 1];
+ }
+ histories_size = lcore_offsets[SPDK_TRACE_MAX_LCORE];
+
+ snprintf(g_shm_name, sizeof(g_shm_name), "%s", shm_name);
+
+ g_trace_fd = shm_open(shm_name, O_RDWR | O_CREAT, 0600);
+ if (g_trace_fd == -1) {
+ SPDK_ERRLOG("could not shm_open spdk_trace\n");
+ SPDK_ERRLOG("errno=%d %s\n", errno, spdk_strerror(errno));
+ return 1;
+ }
+
+ if (ftruncate(g_trace_fd, histories_size) != 0) {
+ SPDK_ERRLOG("could not truncate shm\n");
+ goto trace_init_err;
+ }
+
+ g_trace_histories = mmap(NULL, histories_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, g_trace_fd, 0);
+ if (g_trace_histories == MAP_FAILED) {
+ SPDK_ERRLOG("could not mmap shm\n");
+ goto trace_init_err;
+ }
+
+ /* TODO: On FreeBSD, mlock on shm_open'd memory doesn't seem to work. Docs say that kern.ipc.shm_use_phys=1
+ * should allow it, but forcing that doesn't seem to work either. So for now just skip mlock on FreeBSD
+ * altogether.
+ */
+#if defined(__linux__)
+ if (mlock(g_trace_histories, histories_size) != 0) {
+ SPDK_ERRLOG("Could not mlock shm for tracing - %s.\n", spdk_strerror(errno));
+ if (errno == ENOMEM) {
+ SPDK_ERRLOG("Check /dev/shm for old tracing files that can be deleted.\n");
+ }
+ goto trace_init_err;
+ }
+#endif
+
+ memset(g_trace_histories, 0, histories_size);
+
+ g_trace_flags = &g_trace_histories->flags;
+
+ g_trace_flags->tsc_rate = spdk_get_ticks_hz();
+
+ for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
+ struct spdk_trace_history *lcore_history;
+
+ g_trace_flags->lcore_history_offsets[i] = lcore_offsets[i];
+ lcore_history = spdk_get_per_lcore_history(g_trace_histories, i);
+ lcore_history->lcore = i;
+ lcore_history->num_entries = num_entries;
+ }
+ g_trace_flags->lcore_history_offsets[SPDK_TRACE_MAX_LCORE] = lcore_offsets[SPDK_TRACE_MAX_LCORE];
+
+ spdk_trace_flags_init();
+
+ return 0;
+
+trace_init_err:
+ if (g_trace_histories != MAP_FAILED) {
+ munmap(g_trace_histories, histories_size);
+ }
+ close(g_trace_fd);
+ g_trace_fd = -1;
+ shm_unlink(shm_name);
+ g_trace_histories = NULL;
+
+ return 1;
+
+}
+
+void
+spdk_trace_cleanup(void)
+{
+ bool unlink;
+ int i;
+ struct spdk_trace_history *lcore_history;
+
+ if (g_trace_histories == NULL) {
+ return;
+ }
+
+ /*
+ * Only unlink the shm if no trace entries were recorded. This ensures the file
+ * can be used after this process exits/crashes for debugging.
+ * Note that we have to calculate this value before g_trace_histories gets unmapped.
+ */
+ for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) {
+ lcore_history = spdk_get_per_lcore_history(g_trace_histories, i);
+ unlink = lcore_history->entries[0].tsc == 0;
+ if (!unlink) {
+ break;
+ }
+ }
+
+ munmap(g_trace_histories, sizeof(struct spdk_trace_histories));
+ g_trace_histories = NULL;
+ close(g_trace_fd);
+
+ if (unlink) {
+ shm_unlink(g_shm_name);
+ }
+}
diff --git a/src/spdk/lib/trace/trace_flags.c b/src/spdk/lib/trace/trace_flags.c
new file mode 100644
index 000000000..615afe355
--- /dev/null
+++ b/src/spdk/lib/trace/trace_flags.c
@@ -0,0 +1,323 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/trace.h"
+#include "spdk/log.h"
+#include "spdk_internal/log.h"
+
+struct spdk_trace_flags *g_trace_flags = NULL;
+static struct spdk_trace_register_fn *g_reg_fn_head = NULL;
+
+SPDK_LOG_REGISTER_COMPONENT("trace", SPDK_LOG_TRACE)
+
+uint64_t
+spdk_trace_get_tpoint_mask(uint32_t group_id)
+{
+ if (group_id >= SPDK_TRACE_MAX_GROUP_ID) {
+ SPDK_ERRLOG("invalid group ID %d\n", group_id);
+ return 0ULL;
+ }
+
+ return g_trace_flags->tpoint_mask[group_id];
+}
+
+void
+spdk_trace_set_tpoints(uint32_t group_id, uint64_t tpoint_mask)
+{
+ if (group_id >= SPDK_TRACE_MAX_GROUP_ID) {
+ SPDK_ERRLOG("invalid group ID %d\n", group_id);
+ return;
+ }
+
+ g_trace_flags->tpoint_mask[group_id] |= tpoint_mask;
+}
+
+void
+spdk_trace_clear_tpoints(uint32_t group_id, uint64_t tpoint_mask)
+{
+ if (group_id >= SPDK_TRACE_MAX_GROUP_ID) {
+ SPDK_ERRLOG("invalid group ID %d\n", group_id);
+ return;
+ }
+
+ g_trace_flags->tpoint_mask[group_id] &= ~tpoint_mask;
+}
+
+uint64_t
+spdk_trace_get_tpoint_group_mask(void)
+{
+ uint64_t mask = 0x0;
+ int i;
+
+ for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) {
+ if (spdk_trace_get_tpoint_mask(i) != 0) {
+ mask |= (1ULL << i);
+ }
+ }
+
+ return mask;
+}
+
+void
+spdk_trace_set_tpoint_group_mask(uint64_t tpoint_group_mask)
+{
+ int i;
+
+ for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) {
+ if (tpoint_group_mask & (1ULL << i)) {
+ spdk_trace_set_tpoints(i, -1ULL);
+ }
+ }
+}
+
+void
+spdk_trace_clear_tpoint_group_mask(uint64_t tpoint_group_mask)
+{
+ int i;
+
+ for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) {
+ if (tpoint_group_mask & (1ULL << i)) {
+ spdk_trace_clear_tpoints(i, -1ULL);
+ }
+ }
+}
+
+struct spdk_trace_register_fn *
+spdk_trace_get_first_register_fn(void)
+{
+ return g_reg_fn_head;
+}
+
+struct spdk_trace_register_fn *
+spdk_trace_get_next_register_fn(struct spdk_trace_register_fn *register_fn)
+{
+ return register_fn->next;
+}
+
+static uint64_t
+trace_create_tpoint_group_mask(const char *group_name)
+{
+ uint64_t tpoint_group_mask = 0;
+ struct spdk_trace_register_fn *register_fn;
+
+ register_fn = spdk_trace_get_first_register_fn();
+ if (strcmp(group_name, "all") == 0) {
+ while (register_fn) {
+ tpoint_group_mask |= (1UL << register_fn->tgroup_id);
+
+ register_fn = spdk_trace_get_next_register_fn(register_fn);
+ }
+ } else {
+ while (register_fn) {
+ if (strcmp(group_name, register_fn->name) == 0) {
+ break;
+ }
+
+ register_fn = spdk_trace_get_next_register_fn(register_fn);
+ }
+
+ if (register_fn != NULL) {
+ tpoint_group_mask |= (1UL << register_fn->tgroup_id);
+ }
+ }
+
+ return tpoint_group_mask;
+}
+
+int
+spdk_trace_enable_tpoint_group(const char *group_name)
+{
+ uint64_t tpoint_group_mask = 0;
+
+ tpoint_group_mask = trace_create_tpoint_group_mask(group_name);
+ if (tpoint_group_mask == 0) {
+ return -1;
+ }
+
+ spdk_trace_set_tpoint_group_mask(tpoint_group_mask);
+ return 0;
+}
+
+int
+spdk_trace_disable_tpoint_group(const char *group_name)
+{
+ uint64_t tpoint_group_mask = 0;
+
+ tpoint_group_mask = trace_create_tpoint_group_mask(group_name);
+ if (tpoint_group_mask == 0) {
+ return -1;
+ }
+
+ spdk_trace_clear_tpoint_group_mask(tpoint_group_mask);
+ return 0;
+}
+
+void
+spdk_trace_mask_usage(FILE *f, const char *tmask_arg)
+{
+ struct spdk_trace_register_fn *register_fn;
+
+ fprintf(f, " %s, --tpoint-group-mask <mask>\n", tmask_arg);
+ fprintf(f, " tracepoint group mask for spdk trace buffers (default 0x0");
+
+ register_fn = g_reg_fn_head;
+ while (register_fn) {
+ fprintf(f, ", %s 0x%x", register_fn->name, 1 << register_fn->tgroup_id);
+ register_fn = register_fn->next;
+ }
+
+ fprintf(f, ", all 0xffff)\n");
+}
+
+void
+spdk_trace_register_owner(uint8_t type, char id_prefix)
+{
+ struct spdk_trace_owner *owner;
+
+ assert(type != OWNER_NONE);
+
+ /* 'owner' has 256 entries and since 'type' is a uint8_t, it
+ * can't overrun the array.
+ */
+ owner = &g_trace_flags->owner[type];
+ assert(owner->type == 0);
+
+ owner->type = type;
+ owner->id_prefix = id_prefix;
+}
+
+void
+spdk_trace_register_object(uint8_t type, char id_prefix)
+{
+ struct spdk_trace_object *object;
+
+ assert(type != OBJECT_NONE);
+
+ /* 'object' has 256 entries and since 'type' is a uint8_t, it
+ * can't overrun the array.
+ */
+ object = &g_trace_flags->object[type];
+ assert(object->type == 0);
+
+ object->type = type;
+ object->id_prefix = id_prefix;
+}
+
+void
+spdk_trace_register_description(const char *name, uint16_t tpoint_id, uint8_t owner_type,
+ uint8_t object_type, uint8_t new_object,
+ uint8_t arg1_type, const char *arg1_name)
+{
+ struct spdk_trace_tpoint *tpoint;
+
+ assert(tpoint_id != 0);
+ assert(tpoint_id < SPDK_TRACE_MAX_TPOINT_ID);
+
+ if (strnlen(name, sizeof(tpoint->name)) == sizeof(tpoint->name)) {
+ SPDK_ERRLOG("name (%s) too long\n", name);
+ }
+
+ tpoint = &g_trace_flags->tpoint[tpoint_id];
+ assert(tpoint->tpoint_id == 0);
+
+ snprintf(tpoint->name, sizeof(tpoint->name), "%s", name);
+ tpoint->tpoint_id = tpoint_id;
+ tpoint->object_type = object_type;
+ tpoint->owner_type = owner_type;
+ tpoint->new_object = new_object;
+ tpoint->arg1_type = arg1_type;
+ snprintf(tpoint->arg1_name, sizeof(tpoint->arg1_name), "%s", arg1_name);
+}
+
+void
+spdk_trace_add_register_fn(struct spdk_trace_register_fn *reg_fn)
+{
+ struct spdk_trace_register_fn *_reg_fn;
+
+ if (reg_fn->name == NULL) {
+ SPDK_ERRLOG("missing name for registering spdk trace tpoint group\n");
+ assert(false);
+ return;
+ }
+
+ if (strcmp(reg_fn->name, "all") == 0) {
+ SPDK_ERRLOG("illegal name (%s) for tpoint group\n", reg_fn->name);
+ assert(false);
+ return;
+ }
+
+ /* Ensure that no trace point group IDs and names are ever duplicated */
+ for (_reg_fn = g_reg_fn_head; _reg_fn; _reg_fn = _reg_fn->next) {
+ if (reg_fn->tgroup_id == _reg_fn->tgroup_id) {
+ SPDK_ERRLOG("duplicate tgroup_id (%d) with %s\n", _reg_fn->tgroup_id, _reg_fn->name);
+ assert(false);
+ return;
+ }
+
+ if (strcmp(reg_fn->name, _reg_fn->name) == 0) {
+ SPDK_ERRLOG("duplicate name with %s\n", _reg_fn->name);
+ assert(false);
+ return;
+ }
+ }
+
+ /* Keep trace registrations sorted by tgroup_id */
+ if (g_reg_fn_head == NULL || reg_fn->tgroup_id < g_reg_fn_head->tgroup_id) {
+ reg_fn->next = g_reg_fn_head;
+ g_reg_fn_head = reg_fn;
+ return;
+ }
+
+ for (_reg_fn = g_reg_fn_head; _reg_fn; _reg_fn = _reg_fn->next) {
+ if (_reg_fn->next == NULL || reg_fn->tgroup_id < _reg_fn->next->tgroup_id) {
+ reg_fn->next = _reg_fn->next;
+ _reg_fn->next = reg_fn;
+ return;
+ }
+ }
+}
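+
+/*
+ * Note on registration flow: a tracepoint group provider fills in a struct
+ * spdk_trace_register_fn (name, tgroup_id and a reg_fn callback) and hands it
+ * to spdk_trace_add_register_fn() above. The reg_fn callbacks are deferred
+ * until spdk_trace_flags_init() below runs, at which point each one registers
+ * its owners, objects and tracepoint descriptions against g_trace_flags.
+ */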
+
+void
+spdk_trace_flags_init(void)
+{
+ struct spdk_trace_register_fn *reg_fn;
+
+ reg_fn = g_reg_fn_head;
+ while (reg_fn) {
+ reg_fn->reg_fn();
+ reg_fn = reg_fn->next;
+ }
+}
diff --git a/src/spdk/lib/trace/trace_rpc.c b/src/spdk/lib/trace/trace_rpc.c
new file mode 100644
index 000000000..90dbfbc60
--- /dev/null
+++ b/src/spdk/lib/trace/trace_rpc.c
@@ -0,0 +1,170 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/trace.h"
+#include "spdk_internal/log.h"
+
+struct rpc_tpoint_group {
+ char *name;
+};
+
+static void
+free_rpc_tpoint_group(struct rpc_tpoint_group *p)
+{
+ free(p->name);
+}
+
+static const struct spdk_json_object_decoder rpc_tpoint_group_decoders[] = {
+ {"name", offsetof(struct rpc_tpoint_group, name), spdk_json_decode_string},
+};
+
+static void
+rpc_trace_enable_tpoint_group(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_tpoint_group req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_tpoint_group_decoders,
+ SPDK_COUNTOF(rpc_tpoint_group_decoders), &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_TRACE, "spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ if (req.name == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_TRACE, "tpoint group name was NULL\n");
+ goto invalid;
+ }
+
+ if (spdk_trace_enable_tpoint_group(req.name)) {
+ goto invalid;
+ }
+
+ free_rpc_tpoint_group(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_rpc_tpoint_group(&req);
+}
+SPDK_RPC_REGISTER("trace_enable_tpoint_group", rpc_trace_enable_tpoint_group,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_enable_tpoint_group, enable_tpoint_group)
+
+static void
+rpc_trace_disable_tpoint_group(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_tpoint_group req = {};
+ struct spdk_json_write_ctx *w;
+
+ if (spdk_json_decode_object(params, rpc_tpoint_group_decoders,
+ SPDK_COUNTOF(rpc_tpoint_group_decoders), &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_TRACE, "spdk_json_decode_object failed\n");
+ goto invalid;
+ }
+
+ if (req.name == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_TRACE, "tpoint group name was NULL\n");
+ goto invalid;
+ }
+
+ if (spdk_trace_disable_tpoint_group(req.name)) {
+ goto invalid;
+ }
+
+ free_rpc_tpoint_group(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+ free_rpc_tpoint_group(&req);
+}
+SPDK_RPC_REGISTER("trace_disable_tpoint_group", rpc_trace_disable_tpoint_group,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_disable_tpoint_group, disable_tpoint_group)
+
+static void
+rpc_trace_get_tpoint_group_mask(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ uint64_t tpoint_group_mask;
+ char mask_str[7];
+ bool enabled;
+ struct spdk_json_write_ctx *w;
+ struct spdk_trace_register_fn *register_fn;
+
+ if (params != NULL) {
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ "trace_get_tpoint_group_mask requires no parameters");
+ return;
+ }
+
+ w = spdk_jsonrpc_begin_result(request);
+ tpoint_group_mask = spdk_trace_get_tpoint_group_mask();
+
+ spdk_json_write_object_begin(w);
+
+ snprintf(mask_str, sizeof(mask_str), "0x%lx", tpoint_group_mask);
+ spdk_json_write_named_string(w, "tpoint_group_mask", mask_str);
+
+ register_fn = spdk_trace_get_first_register_fn();
+ while (register_fn) {
+ enabled = spdk_trace_get_tpoint_mask(register_fn->tgroup_id) != 0;
+
+ spdk_json_write_named_object_begin(w, register_fn->name);
+ spdk_json_write_named_bool(w, "enabled", enabled);
+
+ snprintf(mask_str, sizeof(mask_str), "0x%lx", (1UL << register_fn->tgroup_id));
+ spdk_json_write_named_string(w, "mask", mask_str);
+ spdk_json_write_object_end(w);
+
+ register_fn = spdk_trace_get_next_register_fn(register_fn);
+ }
+
+ spdk_json_write_object_end(w);
+ spdk_jsonrpc_end_result(request, w);
+}
+SPDK_RPC_REGISTER("trace_get_tpoint_group_mask", rpc_trace_get_tpoint_group_mask,
+ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(trace_get_tpoint_group_mask, get_tpoint_group_mask)
diff --git a/src/spdk/lib/ut_mock/Makefile b/src/spdk/lib/ut_mock/Makefile
new file mode 100644
index 000000000..f4087807f
--- /dev/null
+++ b/src/spdk/lib/ut_mock/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = mock.c
+LIBNAME = ut_mock
+
+SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/ut_mock/mock.c b/src/spdk/lib/ut_mock/mock.c
new file mode 100644
index 000000000..cfe51c1d5
--- /dev/null
+++ b/src/spdk/lib/ut_mock/mock.c
@@ -0,0 +1,71 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk_internal/mock.h"
+
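+/*
+ * These DEFINE_WRAPPER/__wrap_* definitions take effect when a test binary is
+ * linked with the GNU linker's symbol wrapping option (e.g. -Wl,--wrap=unlink),
+ * which redirects calls to the real function to the wrapper defined here.
+ */
+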
+DEFINE_WRAPPER(calloc, void *, (size_t nmemb, size_t size), (nmemb, size))
+
+DEFINE_WRAPPER(pthread_mutex_init, int,
+ (pthread_mutex_t *mtx, const pthread_mutexattr_t *attr),
+ (mtx, attr))
+
+DEFINE_WRAPPER(pthread_mutexattr_init, int,
+ (pthread_mutexattr_t *attr), (attr))
+
+DEFINE_WRAPPER(recvmsg, ssize_t, (int sockfd, struct msghdr *msg, int flags), (sockfd, msg, flags))
+
+DEFINE_WRAPPER(sendmsg, ssize_t, (int sockfd, const struct msghdr *msg, int flags), (sockfd, msg,
+ flags))
+
+DEFINE_WRAPPER(writev, ssize_t, (int fd, const struct iovec *iov, int iovcnt), (fd, iov, iovcnt))
+
+char *g_unlink_path;
+void (*g_unlink_callback)(void);
+
+int
+__attribute__((used))
+__wrap_unlink(const char *path)
+{
+ if (g_unlink_path == NULL) {
+ return ENOENT;
+ }
+
+ if (strcmp(g_unlink_path, path) != 0) {
+ return ENOENT;
+ }
+
+ if (g_unlink_callback) {
+ g_unlink_callback();
+ }
+ return 0;
+}
diff --git a/src/spdk/lib/util/Makefile b/src/spdk/lib/util/Makefile
new file mode 100644
index 000000000..23f8db6d0
--- /dev/null
+++ b/src/spdk/lib/util/Makefile
@@ -0,0 +1,47 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = base64.c bit_array.c cpuset.c crc16.c crc32.c crc32c.c crc32_ieee.c \
+ dif.c fd.c file.c iov.c math.c pipe.c strerror_tls.c string.c uuid.c
+LIBNAME = util
+LOCAL_SYS_LIBS = -luuid
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_util.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/util/base64.c b/src/spdk/lib/util/base64.c
new file mode 100644
index 000000000..adc5e15da
--- /dev/null
+++ b/src/spdk/lib/util/base64.c
@@ -0,0 +1,262 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/endian.h"
+#include "spdk/base64.h"
+
+#ifdef __aarch64__
+#include "base64_neon.c"
+#endif
+
+#define BASE64_ENC_BITMASK 0x3FUL
+#define BASE64_PADDING_CHAR '='
+
+static const char base64_enc_table[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+static const char base64_urlsafe_enc_table[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789-_";
+
+static const uint8_t
+base64_dec_table[] = {
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255,
+ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255,
+ 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+};
+
+static const uint8_t
+base64_urlsafe_dec_table[] = {
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255,
+ 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63,
+ 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+};
+
+static int
+base64_encode(char *dst, const char *enc_table, const void *src, size_t src_len)
+{
+ uint32_t raw_u32;
+
+ if (!dst || !src || src_len == 0) {
+ return -EINVAL;
+ }
+
+#ifdef __aarch64__
+ base64_encode_neon64(&dst, enc_table, &src, &src_len);
+#endif
+
+ while (src_len >= 4) {
+ raw_u32 = from_be32(src);
+
+ *dst++ = enc_table[(raw_u32 >> 26) & BASE64_ENC_BITMASK];
+ *dst++ = enc_table[(raw_u32 >> 20) & BASE64_ENC_BITMASK];
+ *dst++ = enc_table[(raw_u32 >> 14) & BASE64_ENC_BITMASK];
+ *dst++ = enc_table[(raw_u32 >> 8) & BASE64_ENC_BITMASK];
+
+ src_len -= 3;
+ src += 3;
+ }
+
+ if (src_len == 0) {
+ goto out;
+ }
+
+ raw_u32 = 0;
+ memcpy(&raw_u32, src, src_len);
+ raw_u32 = from_be32(&raw_u32);
+
+ *dst++ = enc_table[(raw_u32 >> 26) & BASE64_ENC_BITMASK];
+ *dst++ = enc_table[(raw_u32 >> 20) & BASE64_ENC_BITMASK];
+ *dst++ = (src_len >= 2) ? enc_table[(raw_u32 >> 14) & BASE64_ENC_BITMASK] : BASE64_PADDING_CHAR;
+ *dst++ = (src_len == 3) ? enc_table[(raw_u32 >> 8) & BASE64_ENC_BITMASK] : BASE64_PADDING_CHAR;
+
+out:
+ *dst = '\0';
+
+ return 0;
+}
+
+int
+spdk_base64_encode(char *dst, const void *src, size_t src_len)
+{
+ return base64_encode(dst, base64_enc_table, src, src_len);
+}
+
+int
+spdk_base64_urlsafe_encode(char *dst, const void *src, size_t src_len)
+{
+ return base64_encode(dst, base64_urlsafe_enc_table, src, src_len);
+}
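+
+/*
+ * Note: for both encoders dst must hold the encoded text plus the terminating
+ * NUL that base64_encode() writes, i.e. 4 * ((src_len + 2) / 3) + 1 bytes,
+ * since every group of up to 3 input bytes expands to 4 output characters.
+ */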
+
+#ifdef __aarch64__
+static int
+base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table,
+ const uint8_t *dec_table_opt, const char *src)
+#else
+static int
+base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table, const char *src)
+#endif
+{
+ size_t src_strlen;
+ size_t tail_len = 0;
+ const uint8_t *src_in;
+ uint32_t tmp[4];
+ int i;
+
+ if (!src) {
+ return -EINVAL;
+ }
+
+ src_strlen = strlen(src);
+
+ /* The encoded length must be a non-zero multiple of 4 */
+ if (src_strlen == 0 || src_strlen % 4 != 0) {
+ return -EINVAL;
+ }
+
+ /* Consider Base64 padding, it at most has 2 padding characters. */
+ for (i = 0; i < 2; i++) {
+ if (src[src_strlen - 1] != BASE64_PADDING_CHAR) {
+ break;
+ }
+ src_strlen--;
+ }
+
+ /* strlen of src without padding shouldn't be 4n+1 */
+ if (src_strlen == 0 || src_strlen % 4 == 1) {
+ return -EINVAL;
+ }
+
+ if (_dst_len) {
+ *_dst_len = spdk_base64_get_decoded_len(src_strlen);
+ }
+
+ /* If dst is NULL, the client is only concerned w/ _dst_len, return */
+ if (!dst) {
+ return 0;
+ }
+
+ src_in = (const uint8_t *) src;
+
+#ifdef __aarch64__
+ base64_decode_neon64(&dst, dec_table_opt, &src_in, &src_strlen);
+
+ if (src_strlen == 0) {
+ return 0;
+ }
+#endif
+
+ /* For all but the last group of 4 characters, dst has room for to_be32()'s full 4-byte store */
+ while (src_strlen > 4) {
+ tmp[0] = dec_table[*src_in++];
+ tmp[1] = dec_table[*src_in++];
+ tmp[2] = dec_table[*src_in++];
+ tmp[3] = dec_table[*src_in++];
+
+ if (tmp[0] == 255 || tmp[1] == 255 || tmp[2] == 255 || tmp[3] == 255) {
+ return -EINVAL;
+ }
+
+ to_be32(dst, tmp[3] << 8 | tmp[2] << 14 | tmp[1] << 20 | tmp[0] << 26);
+
+ dst += 3;
+ src_strlen -= 4;
+ }
+
+ /* Last group: dst may lack room for a full 4-byte store, so assemble in tmp and copy the tail */
+ tmp[0] = dec_table[src_in[0]];
+ tmp[1] = dec_table[src_in[1]];
+ tmp[2] = (src_strlen >= 3) ? dec_table[src_in[2]] : 0;
+ tmp[3] = (src_strlen == 4) ? dec_table[src_in[3]] : 0;
+ tail_len = src_strlen - 1;
+
+ if (tmp[0] == 255 || tmp[1] == 255 || tmp[2] == 255 || tmp[3] == 255) {
+ return -EINVAL;
+ }
+
+ to_be32(&tmp[3], tmp[3] << 8 | tmp[2] << 14 | tmp[1] << 20 | tmp[0] << 26);
+ memcpy(dst, (uint8_t *)&tmp[3], tail_len);
+
+ return 0;
+}
+
+int
+spdk_base64_decode(void *dst, size_t *dst_len, const char *src)
+{
+#ifdef __aarch64__
+ return base64_decode(dst, dst_len, base64_dec_table, base64_dec_table_neon64, src);
+#else
+ return base64_decode(dst, dst_len, base64_dec_table, src);
+#endif
+}
+
+int
+spdk_base64_urlsafe_decode(void *dst, size_t *dst_len, const char *src)
+{
+#ifdef __aarch64__
+ return base64_decode(dst, dst_len, base64_urlsafe_dec_table, base64_urlsafe_dec_table_neon64,
+ src);
+#else
+ return base64_decode(dst, dst_len, base64_urlsafe_dec_table, src);
+#endif
+}
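
The decoders above support a two-pass pattern: calling with dst == NULL reports only the decoded length, so a caller can size the buffer first and then decode. A hedged sketch, assuming only the spdk_base64_decode() signature shown in this file:

#include <stdio.h>
#include <stdlib.h>

#include "spdk/base64.h"

int
main(void)
{
	const char *text = "aGVsbG8gd29ybGQ=";	/* "hello world" */
	size_t dec_len = 0;
	uint8_t *raw;

	/* Pass 1: dst == NULL, only the decoded length is reported */
	if (spdk_base64_decode(NULL, &dec_len, text) != 0) {
		return 1;
	}

	raw = malloc(dec_len);
	if (raw == NULL) {
		return 1;
	}

	/* Pass 2: decode into the correctly sized buffer */
	if (spdk_base64_decode(raw, &dec_len, text) != 0) {
		free(raw);
		return 1;
	}

	printf("decoded %zu bytes\n", dec_len);
	free(raw);
	return 0;
}
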
diff --git a/src/spdk/lib/util/base64_neon.c b/src/spdk/lib/util/base64_neon.c
new file mode 100644
index 000000000..971cff06c
--- /dev/null
+++ b/src/spdk/lib/util/base64_neon.c
@@ -0,0 +1,225 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) 2005-2007, Nick Galbreath
+ * Copyright (c) 2013-2017, Alfred Klomp
+ * Copyright (c) 2015-2017, Wojciech Mula
+ * Copyright (c) 2016-2017, Matthieu Darbois
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __aarch64__
+#error Unsupported hardware
+#endif
+
+#include "spdk/stdinc.h"
+/*
+ * Encoding
+ * Use a 64-byte lookup to do the encoding.
+ * Reuse the existing base64_enc_table and base64_urlsafe_enc_table.
+
+ * Decoding
+ * The input consists of five valid character sets in the Base64 alphabet,
+ * which we need to map back to the 6-bit values they represent.
+ * There are three ranges, two singles, and then there's the rest.
+ *
+ * LUT1[0-63] = base64_dec_table_neon64[0-63]
+ * LUT2[0-63] = base64_dec_table_neon64[64-127]
+ * # From To LUT Characters
+ * 1 [0..42] [255] #1 invalid input
+ * 2 [43] [62] #1 +
+ * 3 [44..46] [255] #1 invalid input
+ * 4 [47] [63] #1 /
+ * 5 [48..57] [52..61] #1 0..9
+ * 6 [58..63] [255] #1 invalid input
+ * 7 [64] [255] #2 invalid input
+ * 8 [65..90] [0..25] #2 A..Z
+ * 9 [91..96] [255] #2 invalid input
+ * 10 [97..122] [26..51] #2 a..z
+ * 11 [123..126] [255] #2 invalid input
+ * (12) Everything else => invalid input
+ */
+static const uint8_t base64_dec_table_neon64[] = {
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255,
+ 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255,
+ 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255
+};
+
+/*
+ * LUT1[0-63] = base64_urlsafe_dec_table_neon64[0-63]
+ * LUT2[0-63] = base64_urlsafe_dec_table_neon64[64-127]
+ * # From To LUT Characters
+ * 1 [0..44] [255] #1 invalid input
+ * 2 [45] [62] #1 -
+ * 3 [46..47] [255] #1 invalid input
+ * 5 [48..57] [52..61] #1 0..9
+ * 6 [58..63] [255] #1 invalid input
+ * 7 [64] [255] #2 invalid input
+ * 8 [65..90] [0..25] #2 A..Z
+ * 9 [91..94] [255] #2 invalid input
+ * 10 [95] [63] #2 _
+ * 11 [96] [255] #2 invalid input
+ * 12 [97..122] [26..51] #2 a..z
+ * 13 [123..126] [255] #2 invalid input
+ * (14) Everything else => invalid input
+ */
+static const uint8_t base64_urlsafe_dec_table_neon64[] = {
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255,
+ 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255,
+ 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255
+};
+
+#include <arm_neon.h>
+#define CMPGT(s,n) vcgtq_u8((s), vdupq_n_u8(n))
+
+static inline uint8x16x4_t
+load_64byte_table(const uint8_t *p)
+{
+ uint8x16x4_t ret;
+ ret.val[0] = vld1q_u8(p + 0);
+ ret.val[1] = vld1q_u8(p + 16);
+ ret.val[2] = vld1q_u8(p + 32);
+ ret.val[3] = vld1q_u8(p + 48);
+ return ret;
+}
+
+static void
+base64_encode_neon64(char **dst, const char *enc_table, const void **src, size_t *src_len)
+{
+ const uint8x16x4_t tbl_enc = load_64byte_table((const uint8_t *)enc_table);
+
+ while (*src_len >= 48) {
+ uint8x16x3_t str;
+ uint8x16x4_t res;
+
+ /* Load 48 bytes and deinterleave */
+ str = vld3q_u8((uint8_t *)*src);
+
+ /* Divide bits of three input bytes over four output bytes and clear top two bits */
+ res.val[0] = vshrq_n_u8(str.val[0], 2);
+ res.val[1] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[1], 4), vshlq_n_u8(str.val[0], 4)),
+ vdupq_n_u8(0x3F));
+ res.val[2] = vandq_u8(vorrq_u8(vshrq_n_u8(str.val[2], 6), vshlq_n_u8(str.val[1], 2)),
+ vdupq_n_u8(0x3F));
+ res.val[3] = vandq_u8(str.val[2], vdupq_n_u8(0x3F));
+
+ /*
+ * The bits have now been shifted to the right locations;
+ * translate their values 0..63 to the Base64 alphabet.
+ * Use a 64-byte table lookup:
+ */
+ res.val[0] = vqtbl4q_u8(tbl_enc, res.val[0]);
+ res.val[1] = vqtbl4q_u8(tbl_enc, res.val[1]);
+ res.val[2] = vqtbl4q_u8(tbl_enc, res.val[2]);
+ res.val[3] = vqtbl4q_u8(tbl_enc, res.val[3]);
+
+ /* Interleave and store result */
+ vst4q_u8((uint8_t *)*dst, res);
+
+ *src += 48; /* 3 * 16 bytes of input */
+ *dst += 64; /* 4 * 16 bytes of output */
+ *src_len -= 48;
+ }
+}
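
The four res.val[] expressions above are the vectorized form of the usual 3-byte-to-4-sextet split. A scalar reference (illustrative only; the helper name is made up) that computes the same values for one group:

#include <stdint.h>

/* Mirrors res.val[0..3] in base64_encode_neon64() for a single 3-byte group */
static inline void
split_3_bytes_to_sextets(const uint8_t b[3], uint8_t sextet[4])
{
	sextet[0] = b[0] >> 2;                          /* res.val[0] */
	sextet[1] = ((b[0] << 4) | (b[1] >> 4)) & 0x3F; /* res.val[1] */
	sextet[2] = ((b[1] << 2) | (b[2] >> 6)) & 0x3F; /* res.val[2] */
	sextet[3] = b[2] & 0x3F;                        /* res.val[3] */
}
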
+
+static void
+base64_decode_neon64(void **dst, const uint8_t *dec_table_neon64, const uint8_t **src,
+ size_t *src_len)
+{
+ /*
+ * First LUT tbl_dec1 will use VTBL instruction (out of range indices are set to 0 in destination).
+ * Second LUT tbl_dec2 will use VTBX instruction (out of range indices will be unchanged in destination).
+ * Input [64..126] will be mapped to index [1..63] in tbl_dec2. Index 0 means that value comes from tbl_dec1.
+ */
+ const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_table_neon64);
+ const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_table_neon64 + 64);
+ const uint8x16_t offset = vdupq_n_u8(63U);
+
+ while (*src_len >= 64) {
+
+ uint8x16x4_t dec1, dec2;
+ uint8x16x3_t dec;
+
+ /* Load 64 bytes and deinterleave */
+ uint8x16x4_t str = vld4q_u8((uint8_t *)*src);
+
+ /* Get indices for 2nd LUT */
+ dec2.val[0] = vqsubq_u8(str.val[0], offset);
+ dec2.val[1] = vqsubq_u8(str.val[1], offset);
+ dec2.val[2] = vqsubq_u8(str.val[2], offset);
+ dec2.val[3] = vqsubq_u8(str.val[3], offset);
+
+ /* Get values from 1st LUT */
+ dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
+ dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
+ dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
+ dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);
+
+ /* Get values from 2nd LUT */
+ dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
+ dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
+ dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
+ dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);
+
+ /* Get final values */
+ str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
+ str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
+ str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
+ str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);
+
+ /* Check for invalid input, any value larger than 63 */
+ uint8x16_t classified = CMPGT(str.val[0], 63);
+ classified = vorrq_u8(classified, CMPGT(str.val[1], 63));
+ classified = vorrq_u8(classified, CMPGT(str.val[2], 63));
+ classified = vorrq_u8(classified, CMPGT(str.val[3], 63));
+
+ /* check that all bits are zero */
+ if (vmaxvq_u8(classified) != 0U) {
+ break;
+ }
+
+ /* Compress four bytes into three */
+ dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+ dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+ dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+ /* Interleave and store decoded result */
+ vst3q_u8((uint8_t *)*dst, dec);
+
+ *src += 64;
+ *dst += 48;
+ *src_len -= 64;
+ }
+}
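
A scalar sketch of the split-LUT scheme the decoder above implements: characters below 64 resolve through the first 64 table entries, characters 64 and above are shifted down by 63 and resolve through the second 64 entries, and the two results are OR'd (for valid input exactly one of them is nonzero). The helper name is illustrative; the table layout is the one defined at the top of this file.

#include <stdint.h>

/* dec_table layout: [0..63] = LUT1, [64..127] = LUT2; any result > 63 is invalid */
static inline uint8_t
decode_one_char_split_lut(const uint8_t *dec_table, uint8_t c)
{
	uint8_t lut1 = (c < 64) ? dec_table[c] : 0;       /* vqtbl4q_u8: out-of-range index reads as 0 */
	uint8_t idx2 = (c >= 64) ? (uint8_t)(c - 63) : 0; /* vqsubq_u8(str, 63) */
	/* vqtbx4q_u8: an out-of-range index (>= 64) leaves the destination (idx2) unchanged */
	uint8_t lut2 = (idx2 < 64) ? dec_table[64 + idx2] : idx2;

	return lut1 | lut2;
}
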
diff --git a/src/spdk/lib/util/bit_array.c b/src/spdk/lib/util/bit_array.c
new file mode 100644
index 000000000..43c1a4d9b
--- /dev/null
+++ b/src/spdk/lib/util/bit_array.c
@@ -0,0 +1,363 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/bit_array.h"
+#include "spdk/env.h"
+
+#include "spdk/likely.h"
+#include "spdk/util.h"
+
+typedef uint64_t spdk_bit_array_word;
+#define SPDK_BIT_ARRAY_WORD_TZCNT(x) (__builtin_ctzll(x))
+#define SPDK_BIT_ARRAY_WORD_POPCNT(x) (__builtin_popcountll(x))
+#define SPDK_BIT_ARRAY_WORD_C(x) ((spdk_bit_array_word)(x))
+#define SPDK_BIT_ARRAY_WORD_BYTES sizeof(spdk_bit_array_word)
+#define SPDK_BIT_ARRAY_WORD_BITS (SPDK_BIT_ARRAY_WORD_BYTES * 8)
+#define SPDK_BIT_ARRAY_WORD_INDEX_SHIFT spdk_u32log2(SPDK_BIT_ARRAY_WORD_BITS)
+#define SPDK_BIT_ARRAY_WORD_INDEX_MASK ((1u << SPDK_BIT_ARRAY_WORD_INDEX_SHIFT) - 1)
+
+struct spdk_bit_array {
+ uint32_t bit_count;
+ spdk_bit_array_word words[];
+};
+
+struct spdk_bit_array *
+spdk_bit_array_create(uint32_t num_bits)
+{
+ struct spdk_bit_array *ba = NULL;
+
+ spdk_bit_array_resize(&ba, num_bits);
+
+ return ba;
+}
+
+void
+spdk_bit_array_free(struct spdk_bit_array **bap)
+{
+ struct spdk_bit_array *ba;
+
+ if (!bap) {
+ return;
+ }
+
+ ba = *bap;
+ *bap = NULL;
+ spdk_free(ba);
+}
+
+static inline uint32_t
+bit_array_word_count(uint32_t num_bits)
+{
+ return (num_bits + SPDK_BIT_ARRAY_WORD_BITS - 1) >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT;
+}
+
+static inline spdk_bit_array_word
+bit_array_word_mask(uint32_t num_bits)
+{
+ assert(num_bits < SPDK_BIT_ARRAY_WORD_BITS);
+ return (SPDK_BIT_ARRAY_WORD_C(1) << num_bits) - 1;
+}
+
+int
+spdk_bit_array_resize(struct spdk_bit_array **bap, uint32_t num_bits)
+{
+ struct spdk_bit_array *new_ba;
+ uint32_t old_word_count, new_word_count;
+ size_t new_size;
+
+ /*
+ * Max number of bits allowed is UINT32_MAX - 1, because we use UINT32_MAX to denote
+ * when a set or cleared bit cannot be found.
+ */
+ if (!bap || num_bits == UINT32_MAX) {
+ return -EINVAL;
+ }
+
+ new_word_count = bit_array_word_count(num_bits);
+ new_size = offsetof(struct spdk_bit_array, words) + new_word_count * SPDK_BIT_ARRAY_WORD_BYTES;
+
+ /*
+ * Always keep one extra word with a 0 and a 1 past the actual required size so that the
+ * find_first functions can just keep going until they match.
+ */
+ new_size += SPDK_BIT_ARRAY_WORD_BYTES;
+
+ new_ba = (struct spdk_bit_array *)spdk_realloc(*bap, new_size, 64);
+ if (!new_ba) {
+ return -ENOMEM;
+ }
+
+ /*
+ * Set up special extra word (see above comment about find_first_clear).
+ *
+ * This is set to 0b10 so that find_first_clear will find a 0 at the very first
+ * bit past the end of the buffer, and find_first_set will find a 1 at the next bit
+ * past that.
+ */
+ new_ba->words[new_word_count] = 0x2;
+
+ if (*bap == NULL) {
+ old_word_count = 0;
+ new_ba->bit_count = 0;
+ } else {
+ old_word_count = bit_array_word_count(new_ba->bit_count);
+ }
+
+ if (new_word_count > old_word_count) {
+ /* Zero out new entries */
+ memset(&new_ba->words[old_word_count], 0,
+ (new_word_count - old_word_count) * SPDK_BIT_ARRAY_WORD_BYTES);
+ } else if (new_word_count == old_word_count && num_bits < new_ba->bit_count) {
+ /* Make sure any existing partial last word is cleared beyond the new num_bits. */
+ uint32_t last_word_bits;
+ spdk_bit_array_word mask;
+
+ last_word_bits = num_bits & SPDK_BIT_ARRAY_WORD_INDEX_MASK;
+ mask = bit_array_word_mask(last_word_bits);
+ new_ba->words[old_word_count - 1] &= mask;
+ }
+
+ new_ba->bit_count = num_bits;
+ *bap = new_ba;
+ return 0;
+}
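
A short lifecycle sketch for the create/resize path above. It assumes the SPDK environment is initialized (the array is backed by spdk_realloc()/spdk_free()) and uses only functions defined in this file:

#include <assert.h>

#include "spdk/bit_array.h"

static void
bit_array_resize_demo(void)
{
	struct spdk_bit_array *ba;

	ba = spdk_bit_array_create(8);
	assert(ba != NULL && spdk_bit_array_capacity(ba) == 8);

	spdk_bit_array_set(ba, 5);

	/* Growing preserves existing bits; newly added bits start cleared */
	spdk_bit_array_resize(&ba, 100);
	assert(spdk_bit_array_get(ba, 5) && !spdk_bit_array_get(ba, 99));

	/* After shrinking, bits past the new capacity are no longer visible */
	spdk_bit_array_resize(&ba, 4);
	assert(!spdk_bit_array_get(ba, 5));

	spdk_bit_array_free(&ba);
	assert(ba == NULL);
}
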
+
+uint32_t
+spdk_bit_array_capacity(const struct spdk_bit_array *ba)
+{
+ return ba->bit_count;
+}
+
+static inline int
+bit_array_get_word(const struct spdk_bit_array *ba, uint32_t bit_index,
+ uint32_t *word_index, uint32_t *word_bit_index)
+{
+ if (spdk_unlikely(bit_index >= ba->bit_count)) {
+ return -EINVAL;
+ }
+
+ *word_index = bit_index >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT;
+ *word_bit_index = bit_index & SPDK_BIT_ARRAY_WORD_INDEX_MASK;
+
+ return 0;
+}
+
+bool
+spdk_bit_array_get(const struct spdk_bit_array *ba, uint32_t bit_index)
+{
+ uint32_t word_index, word_bit_index;
+
+ if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) {
+ return false;
+ }
+
+ return (ba->words[word_index] >> word_bit_index) & 1U;
+}
+
+int
+spdk_bit_array_set(struct spdk_bit_array *ba, uint32_t bit_index)
+{
+ uint32_t word_index, word_bit_index;
+
+ if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) {
+ return -EINVAL;
+ }
+
+ ba->words[word_index] |= (SPDK_BIT_ARRAY_WORD_C(1) << word_bit_index);
+ return 0;
+}
+
+void
+spdk_bit_array_clear(struct spdk_bit_array *ba, uint32_t bit_index)
+{
+ uint32_t word_index, word_bit_index;
+
+ if (bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) {
+ /*
+ * Clearing past the end of the bit array is a no-op, since bits past the end
+ * are implicitly 0.
+ */
+ return;
+ }
+
+ ba->words[word_index] &= ~(SPDK_BIT_ARRAY_WORD_C(1) << word_bit_index);
+}
+
+static inline uint32_t
+bit_array_find_first(const struct spdk_bit_array *ba, uint32_t start_bit_index,
+ spdk_bit_array_word xor_mask)
+{
+ uint32_t word_index, first_word_bit_index;
+ spdk_bit_array_word word, first_word_mask;
+ const spdk_bit_array_word *words, *cur_word;
+
+ if (spdk_unlikely(start_bit_index >= ba->bit_count)) {
+ return ba->bit_count;
+ }
+
+ word_index = start_bit_index >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT;
+ words = ba->words;
+ cur_word = &words[word_index];
+
+ /*
+ * Special case for first word: skip start_bit_index % SPDK_BIT_ARRAY_WORD_BITS bits
+ * within the first word.
+ */
+ first_word_bit_index = start_bit_index & SPDK_BIT_ARRAY_WORD_INDEX_MASK;
+ first_word_mask = bit_array_word_mask(first_word_bit_index);
+
+ word = (*cur_word ^ xor_mask) & ~first_word_mask;
+
+ /*
+ * spdk_bit_array_resize() guarantees that an extra word with a 1 and a 0 will always be
+ * at the end of the words[] array, so just keep going until a word matches.
+ */
+ while (word == 0) {
+ word = *++cur_word ^ xor_mask;
+ }
+
+ return ((uintptr_t)cur_word - (uintptr_t)words) * 8 + SPDK_BIT_ARRAY_WORD_TZCNT(word);
+}
+
+
+uint32_t
+spdk_bit_array_find_first_set(const struct spdk_bit_array *ba, uint32_t start_bit_index)
+{
+ uint32_t bit_index;
+
+ bit_index = bit_array_find_first(ba, start_bit_index, 0);
+
+ /*
+ * If we ran off the end of the array and found the 1 bit in the extra word,
+ * return UINT32_MAX to indicate no actual 1 bits were found.
+ */
+ if (bit_index >= ba->bit_count) {
+ bit_index = UINT32_MAX;
+ }
+
+ return bit_index;
+}
+
+uint32_t
+spdk_bit_array_find_first_clear(const struct spdk_bit_array *ba, uint32_t start_bit_index)
+{
+ uint32_t bit_index;
+
+ bit_index = bit_array_find_first(ba, start_bit_index, SPDK_BIT_ARRAY_WORD_C(-1));
+
+ /*
+ * If we ran off the end of the array and found the 0 bit in the extra word,
+ * return UINT32_MAX to indicate no actual 0 bits were found.
+ */
+ if (bit_index >= ba->bit_count) {
+ bit_index = UINT32_MAX;
+ }
+
+ return bit_index;
+}
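
One common way to use the find_first helpers above is as a small ID allocator: find_first_clear() hands out the lowest free slot (UINT32_MAX when the array is full) and clear() releases it. A hedged sketch with illustrative helper names:

#include <stdint.h>

#include "spdk/bit_array.h"

/* Returns the allocated ID, or UINT32_MAX if every slot is in use */
static uint32_t
id_alloc(struct spdk_bit_array *ids)
{
	uint32_t id = spdk_bit_array_find_first_clear(ids, 0);

	if (id != UINT32_MAX) {
		spdk_bit_array_set(ids, id);
	}

	return id;
}

static void
id_free(struct spdk_bit_array *ids, uint32_t id)
{
	spdk_bit_array_clear(ids, id);
}
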
+
+uint32_t
+spdk_bit_array_count_set(const struct spdk_bit_array *ba)
+{
+ const spdk_bit_array_word *cur_word = ba->words;
+ uint32_t word_count = bit_array_word_count(ba->bit_count);
+ uint32_t set_count = 0;
+
+ while (word_count--) {
+ /*
+ * No special treatment is needed for the last (potentially partial) word, since
+ * spdk_bit_array_resize() makes sure the bits past bit_count are cleared.
+ */
+ set_count += SPDK_BIT_ARRAY_WORD_POPCNT(*cur_word++);
+ }
+
+ return set_count;
+}
+
+uint32_t
+spdk_bit_array_count_clear(const struct spdk_bit_array *ba)
+{
+ return ba->bit_count - spdk_bit_array_count_set(ba);
+}
+
+void
+spdk_bit_array_store_mask(const struct spdk_bit_array *ba, void *mask)
+{
+ uint32_t size, i;
+ uint32_t num_bits = spdk_bit_array_capacity(ba);
+
+ size = num_bits / CHAR_BIT;
+ memcpy(mask, ba->words, size);
+
+ for (i = 0; i < num_bits % CHAR_BIT; i++) {
+ if (spdk_bit_array_get(ba, i + size * CHAR_BIT)) {
+ ((uint8_t *)mask)[size] |= (1U << i);
+ } else {
+ ((uint8_t *)mask)[size] &= ~(1U << i);
+ }
+ }
+}
+
+void
+spdk_bit_array_load_mask(struct spdk_bit_array *ba, const void *mask)
+{
+ uint32_t size, i;
+ uint32_t num_bits = spdk_bit_array_capacity(ba);
+
+ size = num_bits / CHAR_BIT;
+ memcpy(ba->words, mask, size);
+
+ for (i = 0; i < num_bits % CHAR_BIT; i++) {
+ if (((uint8_t *)mask)[size] & (1U << i)) {
+ spdk_bit_array_set(ba, i + size * CHAR_BIT);
+ } else {
+ spdk_bit_array_clear(ba, i + size * CHAR_BIT);
+ }
+ }
+}
+
+void
+spdk_bit_array_clear_mask(struct spdk_bit_array *ba)
+{
+ uint32_t size, i;
+ uint32_t num_bits = spdk_bit_array_capacity(ba);
+
+ size = num_bits / CHAR_BIT;
+ memset(ba->words, 0, size);
+
+ for (i = 0; i < num_bits % CHAR_BIT; i++) {
+ spdk_bit_array_clear(ba, i + size * CHAR_BIT);
+ }
+}
diff --git a/src/spdk/lib/util/cpuset.c b/src/spdk/lib/util/cpuset.c
new file mode 100644
index 000000000..8d7c8dc89
--- /dev/null
+++ b/src/spdk/lib/util/cpuset.c
@@ -0,0 +1,336 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/cpuset.h"
+#include "spdk/log.h"
+
+struct spdk_cpuset *
+spdk_cpuset_alloc(void)
+{
+ return (struct spdk_cpuset *)calloc(sizeof(struct spdk_cpuset), 1);
+}
+
+void
+spdk_cpuset_free(struct spdk_cpuset *set)
+{
+ free(set);
+}
+
+bool
+spdk_cpuset_equal(const struct spdk_cpuset *set1, const struct spdk_cpuset *set2)
+{
+ assert(set1 != NULL);
+ assert(set2 != NULL);
+ return memcmp(set1->cpus, set2->cpus, sizeof(set2->cpus)) == 0;
+}
+
+void
+spdk_cpuset_copy(struct spdk_cpuset *dst, const struct spdk_cpuset *src)
+{
+ assert(dst != NULL);
+ assert(src != NULL);
+ memcpy(&dst->cpus, &src->cpus, sizeof(src->cpus));
+}
+
+void
+spdk_cpuset_negate(struct spdk_cpuset *set)
+{
+ unsigned int i;
+ assert(set != NULL);
+ for (i = 0; i < sizeof(set->cpus); i++) {
+ set->cpus[i] = ~set->cpus[i];
+ }
+}
+
+void
+spdk_cpuset_and(struct spdk_cpuset *dst, const struct spdk_cpuset *src)
+{
+ unsigned int i;
+ assert(dst != NULL);
+ assert(src != NULL);
+ for (i = 0; i < sizeof(src->cpus); i++) {
+ dst->cpus[i] &= src->cpus[i];
+ }
+}
+
+void
+spdk_cpuset_or(struct spdk_cpuset *dst, const struct spdk_cpuset *src)
+{
+ unsigned int i;
+ assert(dst != NULL);
+ assert(src != NULL);
+ for (i = 0; i < sizeof(src->cpus); i++) {
+ dst->cpus[i] |= src->cpus[i];
+ }
+}
+
+void
+spdk_cpuset_xor(struct spdk_cpuset *dst, const struct spdk_cpuset *src)
+{
+ unsigned int i;
+ assert(dst != NULL);
+ assert(src != NULL);
+ for (i = 0; i < sizeof(src->cpus); i++) {
+ dst->cpus[i] ^= src->cpus[i];
+ }
+}
+
+void
+spdk_cpuset_zero(struct spdk_cpuset *set)
+{
+ assert(set != NULL);
+ memset(set->cpus, 0, sizeof(set->cpus));
+}
+
+void
+spdk_cpuset_set_cpu(struct spdk_cpuset *set, uint32_t cpu, bool state)
+{
+ assert(set != NULL);
+ assert(cpu < sizeof(set->cpus) * 8);
+ if (state) {
+ set->cpus[cpu / 8] |= (1U << (cpu % 8));
+ } else {
+ set->cpus[cpu / 8] &= ~(1U << (cpu % 8));
+ }
+}
+
+bool
+spdk_cpuset_get_cpu(const struct spdk_cpuset *set, uint32_t cpu)
+{
+ assert(set != NULL);
+ assert(cpu < sizeof(set->cpus) * 8);
+ return (set->cpus[cpu / 8] >> (cpu % 8)) & 1U;
+}
+
+uint32_t
+spdk_cpuset_count(const struct spdk_cpuset *set)
+{
+ uint32_t count = 0;
+ uint8_t n;
+ unsigned int i;
+ for (i = 0; i < sizeof(set->cpus); i++) {
+ n = set->cpus[i];
+ while (n) {
+ n &= (n - 1);
+ count++;
+ }
+ }
+ return count;
+}
+
+const char *
+spdk_cpuset_fmt(struct spdk_cpuset *set)
+{
+ uint32_t lcore, lcore_max = 0;
+ int val, i, n;
+ char *ptr;
+ static const char *hex = "0123456789abcdef";
+
+ assert(set != NULL);
+
+ for (lcore = 0; lcore < sizeof(set->cpus) * 8; lcore++) {
+ if (spdk_cpuset_get_cpu(set, lcore)) {
+ lcore_max = lcore;
+ }
+ }
+
+ ptr = set->str;
+ n = lcore_max / 8;
+ val = set->cpus[n];
+
+ /* Store the first digit only if it is not a leading zero */
+ if ((val & 0xf0) != 0) {
+ *(ptr++) = hex[(val & 0xf0) >> 4];
+ }
+ *(ptr++) = hex[val & 0x0f];
+
+ for (i = n - 1; i >= 0; i--) {
+ val = set->cpus[i];
+ *(ptr++) = hex[(val & 0xf0) >> 4];
+ *(ptr++) = hex[val & 0x0f];
+ }
+ *ptr = '\0';
+
+ return set->str;
+}
+
+static int
+hex_value(uint8_t c)
+{
+#define V(x, y) [x] = y + 1
+ static const int8_t val[256] = {
+ V('0', 0), V('1', 1), V('2', 2), V('3', 3), V('4', 4),
+ V('5', 5), V('6', 6), V('7', 7), V('8', 8), V('9', 9),
+ V('A', 0xA), V('B', 0xB), V('C', 0xC), V('D', 0xD), V('E', 0xE), V('F', 0xF),
+ V('a', 0xA), V('b', 0xB), V('c', 0xC), V('d', 0xD), V('e', 0xE), V('f', 0xF),
+ };
+#undef V
+
+ return val[c] - 1;
+}
+
+static int
+parse_list(const char *mask, struct spdk_cpuset *set)
+{
+ char *end;
+ const char *ptr = mask;
+ uint32_t lcore;
+ uint32_t lcore_min, lcore_max;
+
+ spdk_cpuset_zero(set);
+ lcore_min = UINT32_MAX;
+
+ ptr++;
+ end = (char *)ptr;
+ do {
+ while (isblank(*ptr)) {
+ ptr++;
+ }
+ if (*ptr == '\0' || *ptr == ']' || *ptr == '-' || *ptr == ',') {
+ goto invalid_character;
+ }
+
+ errno = 0;
+ lcore = strtoul(ptr, &end, 10);
+ if (errno) {
+ SPDK_ERRLOG("Conversion of core mask in '%s' failed\n", mask);
+ return -1;
+ }
+
+ if (lcore >= sizeof(set->cpus) * 8) {
+ SPDK_ERRLOG("Core number %" PRIu32 " is out of range in '%s'\n", lcore, mask);
+ return -1;
+ }
+
+ while (isblank(*end)) {
+ end++;
+ }
+
+ if (*end == '-') {
+ lcore_min = lcore;
+ } else if (*end == ',' || *end == ']') {
+ lcore_max = lcore;
+ if (lcore_min == UINT32_MAX) {
+ lcore_min = lcore;
+ }
+ if (lcore_min > lcore_max) {
+ SPDK_ERRLOG("Invalid range of CPUs (%" PRIu32 " > %" PRIu32 ")\n",
+ lcore_min, lcore_max);
+ return -1;
+ }
+ for (lcore = lcore_min; lcore <= lcore_max; lcore++) {
+ spdk_cpuset_set_cpu(set, lcore, true);
+ }
+ lcore_min = UINT32_MAX;
+ } else {
+ goto invalid_character;
+ }
+
+ ptr = end + 1;
+
+ } while (*end != ']');
+
+ return 0;
+
+invalid_character:
+ if (*end == '\0') {
+ SPDK_ERRLOG("Unexpected end of core list '%s'\n", mask);
+ } else {
+ SPDK_ERRLOG("Parsing of core list '%s' failed on character '%c'\n", mask, *end);
+ }
+ return -1;
+}
+
+static int
+parse_mask(const char *mask, struct spdk_cpuset *set, size_t len)
+{
+ int i, j;
+ char c;
+ int val;
+ uint32_t lcore = 0;
+
+ if (mask[0] == '0' && (mask[1] == 'x' || mask[1] == 'X')) {
+ mask += 2;
+ len -= 2;
+ }
+
+ spdk_cpuset_zero(set);
+ for (i = len - 1; i >= 0; i--) {
+ c = mask[i];
+ val = hex_value(c);
+ if (val < 0) {
+ /* Invalid character */
+ SPDK_ERRLOG("Invalid character in core mask '%s' (%c)\n", mask, c);
+ return -1;
+ }
+ for (j = 0; j < 4 && lcore < sizeof(set->cpus); j++, lcore++) {
+ if ((1 << j) & val) {
+ spdk_cpuset_set_cpu(set, lcore, true);
+ }
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_cpuset_parse(struct spdk_cpuset *set, const char *mask)
+{
+ int ret;
+ size_t len;
+
+ if (mask == NULL || set == NULL) {
+ return -1;
+ }
+
+ while (isblank(*mask)) {
+ mask++;
+ }
+
+ len = strlen(mask);
+ while (len > 0 && isblank(mask[len - 1])) {
+ len--;
+ }
+
+ if (len == 0) {
+ return -1;
+ }
+
+ if (mask[0] == '[') {
+ ret = parse_list(mask, set);
+ } else {
+ ret = parse_mask(mask, set, len);
+ }
+
+ return ret;
+}
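
A usage sketch tying the two parsers above together: spdk_cpuset_parse() accepts either a hex mask ("0xf3") or a bracketed core list ("[0,2-4]"), and spdk_cpuset_fmt() renders the set back as a hex mask. Only functions defined in this file are assumed.

#include <stdio.h>

#include "spdk/cpuset.h"

int
main(void)
{
	struct spdk_cpuset *set = spdk_cpuset_alloc();

	if (set == NULL) {
		return 1;
	}

	if (spdk_cpuset_parse(set, "[0,2-4]") == 0) {
		/* Cores 0, 2, 3, 4 -> count 4, mask 0x1d */
		printf("cores: %u, mask: 0x%s\n", spdk_cpuset_count(set), spdk_cpuset_fmt(set));
	}

	if (spdk_cpuset_parse(set, "0xf3") == 0) {
		/* Cores 0, 1, 4, 5, 6, 7 -> count 6, mask 0xf3 */
		printf("cores: %u, mask: 0x%s\n", spdk_cpuset_count(set), spdk_cpuset_fmt(set));
	}

	spdk_cpuset_free(set);
	return 0;
}
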
diff --git a/src/spdk/lib/util/crc16.c b/src/spdk/lib/util/crc16.c
new file mode 100644
index 000000000..2ba168c4b
--- /dev/null
+++ b/src/spdk/lib/util/crc16.c
@@ -0,0 +1,668 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/crc16.h"
+#include "spdk/config.h"
+
+/*
+ * Use Intelligent Storage Acceleration Library for line speed CRC
+ */
+
+#ifdef SPDK_CONFIG_ISAL
+#include "isa-l/include/crc.h"
+
+uint16_t
+spdk_crc16_t10dif(uint16_t init_crc, const void *buf, size_t len)
+{
+ return (crc16_t10dif(init_crc, buf, len));
+}
+
+uint16_t
+spdk_crc16_t10dif_copy(uint16_t init_crc, uint8_t *dst, uint8_t *src,
+ size_t len)
+{
+ return (crc16_t10dif_copy(init_crc, dst, src, len));
+}
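
Both the ISA-L wrappers above and the table-driven fallback below expose the same spdk_crc16_t10dif() signature. A minimal call, seeding with 0 (the conventional T10-DIF initial value, used here purely for illustration):

#include <stdio.h>
#include <string.h>

#include "spdk/crc16.h"

int
main(void)
{
	const char buf[] = "123456789";
	uint16_t crc;

	/* CRC over the classic 9-byte check string */
	crc = spdk_crc16_t10dif(0, buf, strlen(buf));
	printf("crc16 t10dif: 0x%04x\n", crc);

	return 0;
}
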
+
+#else
+/*
+ * Use table-driven (somewhat faster) CRC
+ */
+
+/*
+ * Static tables used for the table_driven implementation.
+ */
+
+static const uint16_t crc_table_fast[16][256] = {
+ {
+ 0x0000u, 0x8BB7u, 0x9CD9u, 0x176Eu, 0xB205u, 0x39B2u, 0x2EDCu, 0xA56Bu,
+ 0xEFBDu, 0x640Au, 0x7364u, 0xF8D3u, 0x5DB8u, 0xD60Fu, 0xC161u, 0x4AD6u,
+ 0x54CDu, 0xDF7Au, 0xC814u, 0x43A3u, 0xE6C8u, 0x6D7Fu, 0x7A11u, 0xF1A6u,
+ 0xBB70u, 0x30C7u, 0x27A9u, 0xAC1Eu, 0x0975u, 0x82C2u, 0x95ACu, 0x1E1Bu,
+ 0xA99Au, 0x222Du, 0x3543u, 0xBEF4u, 0x1B9Fu, 0x9028u, 0x8746u, 0x0CF1u,
+ 0x4627u, 0xCD90u, 0xDAFEu, 0x5149u, 0xF422u, 0x7F95u, 0x68FBu, 0xE34Cu,
+ 0xFD57u, 0x76E0u, 0x618Eu, 0xEA39u, 0x4F52u, 0xC4E5u, 0xD38Bu, 0x583Cu,
+ 0x12EAu, 0x995Du, 0x8E33u, 0x0584u, 0xA0EFu, 0x2B58u, 0x3C36u, 0xB781u,
+ 0xD883u, 0x5334u, 0x445Au, 0xCFEDu, 0x6A86u, 0xE131u, 0xF65Fu, 0x7DE8u,
+ 0x373Eu, 0xBC89u, 0xABE7u, 0x2050u, 0x853Bu, 0x0E8Cu, 0x19E2u, 0x9255u,
+ 0x8C4Eu, 0x07F9u, 0x1097u, 0x9B20u, 0x3E4Bu, 0xB5FCu, 0xA292u, 0x2925u,
+ 0x63F3u, 0xE844u, 0xFF2Au, 0x749Du, 0xD1F6u, 0x5A41u, 0x4D2Fu, 0xC698u,
+ 0x7119u, 0xFAAEu, 0xEDC0u, 0x6677u, 0xC31Cu, 0x48ABu, 0x5FC5u, 0xD472u,
+ 0x9EA4u, 0x1513u, 0x027Du, 0x89CAu, 0x2CA1u, 0xA716u, 0xB078u, 0x3BCFu,
+ 0x25D4u, 0xAE63u, 0xB90Du, 0x32BAu, 0x97D1u, 0x1C66u, 0x0B08u, 0x80BFu,
+ 0xCA69u, 0x41DEu, 0x56B0u, 0xDD07u, 0x786Cu, 0xF3DBu, 0xE4B5u, 0x6F02u,
+ 0x3AB1u, 0xB106u, 0xA668u, 0x2DDFu, 0x88B4u, 0x0303u, 0x146Du, 0x9FDAu,
+ 0xD50Cu, 0x5EBBu, 0x49D5u, 0xC262u, 0x6709u, 0xECBEu, 0xFBD0u, 0x7067u,
+ 0x6E7Cu, 0xE5CBu, 0xF2A5u, 0x7912u, 0xDC79u, 0x57CEu, 0x40A0u, 0xCB17u,
+ 0x81C1u, 0x0A76u, 0x1D18u, 0x96AFu, 0x33C4u, 0xB873u, 0xAF1Du, 0x24AAu,
+ 0x932Bu, 0x189Cu, 0x0FF2u, 0x8445u, 0x212Eu, 0xAA99u, 0xBDF7u, 0x3640u,
+ 0x7C96u, 0xF721u, 0xE04Fu, 0x6BF8u, 0xCE93u, 0x4524u, 0x524Au, 0xD9FDu,
+ 0xC7E6u, 0x4C51u, 0x5B3Fu, 0xD088u, 0x75E3u, 0xFE54u, 0xE93Au, 0x628Du,
+ 0x285Bu, 0xA3ECu, 0xB482u, 0x3F35u, 0x9A5Eu, 0x11E9u, 0x0687u, 0x8D30u,
+ 0xE232u, 0x6985u, 0x7EEBu, 0xF55Cu, 0x5037u, 0xDB80u, 0xCCEEu, 0x4759u,
+ 0x0D8Fu, 0x8638u, 0x9156u, 0x1AE1u, 0xBF8Au, 0x343Du, 0x2353u, 0xA8E4u,
+ 0xB6FFu, 0x3D48u, 0x2A26u, 0xA191u, 0x04FAu, 0x8F4Du, 0x9823u, 0x1394u,
+ 0x5942u, 0xD2F5u, 0xC59Bu, 0x4E2Cu, 0xEB47u, 0x60F0u, 0x779Eu, 0xFC29u,
+ 0x4BA8u, 0xC01Fu, 0xD771u, 0x5CC6u, 0xF9ADu, 0x721Au, 0x6574u, 0xEEC3u,
+ 0xA415u, 0x2FA2u, 0x38CCu, 0xB37Bu, 0x1610u, 0x9DA7u, 0x8AC9u, 0x017Eu,
+ 0x1F65u, 0x94D2u, 0x83BCu, 0x080Bu, 0xAD60u, 0x26D7u, 0x31B9u, 0xBA0Eu,
+ 0xF0D8u, 0x7B6Fu, 0x6C01u, 0xE7B6u, 0x42DDu, 0xC96Au, 0xDE04u, 0x55B3u
+ },
+ {
+ 0x0000u, 0x7562u, 0xEAC4u, 0x9FA6u, 0x5E3Fu, 0x2B5Du, 0xB4FBu, 0xC199u,
+ 0xBC7Eu, 0xC91Cu, 0x56BAu, 0x23D8u, 0xE241u, 0x9723u, 0x0885u, 0x7DE7u,
+ 0xF34Bu, 0x8629u, 0x198Fu, 0x6CEDu, 0xAD74u, 0xD816u, 0x47B0u, 0x32D2u,
+ 0x4F35u, 0x3A57u, 0xA5F1u, 0xD093u, 0x110Au, 0x6468u, 0xFBCEu, 0x8EACu,
+ 0x6D21u, 0x1843u, 0x87E5u, 0xF287u, 0x331Eu, 0x467Cu, 0xD9DAu, 0xACB8u,
+ 0xD15Fu, 0xA43Du, 0x3B9Bu, 0x4EF9u, 0x8F60u, 0xFA02u, 0x65A4u, 0x10C6u,
+ 0x9E6Au, 0xEB08u, 0x74AEu, 0x01CCu, 0xC055u, 0xB537u, 0x2A91u, 0x5FF3u,
+ 0x2214u, 0x5776u, 0xC8D0u, 0xBDB2u, 0x7C2Bu, 0x0949u, 0x96EFu, 0xE38Du,
+ 0xDA42u, 0xAF20u, 0x3086u, 0x45E4u, 0x847Du, 0xF11Fu, 0x6EB9u, 0x1BDBu,
+ 0x663Cu, 0x135Eu, 0x8CF8u, 0xF99Au, 0x3803u, 0x4D61u, 0xD2C7u, 0xA7A5u,
+ 0x2909u, 0x5C6Bu, 0xC3CDu, 0xB6AFu, 0x7736u, 0x0254u, 0x9DF2u, 0xE890u,
+ 0x9577u, 0xE015u, 0x7FB3u, 0x0AD1u, 0xCB48u, 0xBE2Au, 0x218Cu, 0x54EEu,
+ 0xB763u, 0xC201u, 0x5DA7u, 0x28C5u, 0xE95Cu, 0x9C3Eu, 0x0398u, 0x76FAu,
+ 0x0B1Du, 0x7E7Fu, 0xE1D9u, 0x94BBu, 0x5522u, 0x2040u, 0xBFE6u, 0xCA84u,
+ 0x4428u, 0x314Au, 0xAEECu, 0xDB8Eu, 0x1A17u, 0x6F75u, 0xF0D3u, 0x85B1u,
+ 0xF856u, 0x8D34u, 0x1292u, 0x67F0u, 0xA669u, 0xD30Bu, 0x4CADu, 0x39CFu,
+ 0x3F33u, 0x4A51u, 0xD5F7u, 0xA095u, 0x610Cu, 0x146Eu, 0x8BC8u, 0xFEAAu,
+ 0x834Du, 0xF62Fu, 0x6989u, 0x1CEBu, 0xDD72u, 0xA810u, 0x37B6u, 0x42D4u,
+ 0xCC78u, 0xB91Au, 0x26BCu, 0x53DEu, 0x9247u, 0xE725u, 0x7883u, 0x0DE1u,
+ 0x7006u, 0x0564u, 0x9AC2u, 0xEFA0u, 0x2E39u, 0x5B5Bu, 0xC4FDu, 0xB19Fu,
+ 0x5212u, 0x2770u, 0xB8D6u, 0xCDB4u, 0x0C2Du, 0x794Fu, 0xE6E9u, 0x938Bu,
+ 0xEE6Cu, 0x9B0Eu, 0x04A8u, 0x71CAu, 0xB053u, 0xC531u, 0x5A97u, 0x2FF5u,
+ 0xA159u, 0xD43Bu, 0x4B9Du, 0x3EFFu, 0xFF66u, 0x8A04u, 0x15A2u, 0x60C0u,
+ 0x1D27u, 0x6845u, 0xF7E3u, 0x8281u, 0x4318u, 0x367Au, 0xA9DCu, 0xDCBEu,
+ 0xE571u, 0x9013u, 0x0FB5u, 0x7AD7u, 0xBB4Eu, 0xCE2Cu, 0x518Au, 0x24E8u,
+ 0x590Fu, 0x2C6Du, 0xB3CBu, 0xC6A9u, 0x0730u, 0x7252u, 0xEDF4u, 0x9896u,
+ 0x163Au, 0x6358u, 0xFCFEu, 0x899Cu, 0x4805u, 0x3D67u, 0xA2C1u, 0xD7A3u,
+ 0xAA44u, 0xDF26u, 0x4080u, 0x35E2u, 0xF47Bu, 0x8119u, 0x1EBFu, 0x6BDDu,
+ 0x8850u, 0xFD32u, 0x6294u, 0x17F6u, 0xD66Fu, 0xA30Du, 0x3CABu, 0x49C9u,
+ 0x342Eu, 0x414Cu, 0xDEEAu, 0xAB88u, 0x6A11u, 0x1F73u, 0x80D5u, 0xF5B7u,
+ 0x7B1Bu, 0x0E79u, 0x91DFu, 0xE4BDu, 0x2524u, 0x5046u, 0xCFE0u, 0xBA82u,
+ 0xC765u, 0xB207u, 0x2DA1u, 0x58C3u, 0x995Au, 0xEC38u, 0x739Eu, 0x06FCu
+ },
+ {
+ 0x0000u, 0x7E66u, 0xFCCCu, 0x82AAu, 0x722Fu, 0x0C49u, 0x8EE3u, 0xF085u,
+ 0xE45Eu, 0x9A38u, 0x1892u, 0x66F4u, 0x9671u, 0xE817u, 0x6ABDu, 0x14DBu,
+ 0x430Bu, 0x3D6Du, 0xBFC7u, 0xC1A1u, 0x3124u, 0x4F42u, 0xCDE8u, 0xB38Eu,
+ 0xA755u, 0xD933u, 0x5B99u, 0x25FFu, 0xD57Au, 0xAB1Cu, 0x29B6u, 0x57D0u,
+ 0x8616u, 0xF870u, 0x7ADAu, 0x04BCu, 0xF439u, 0x8A5Fu, 0x08F5u, 0x7693u,
+ 0x6248u, 0x1C2Eu, 0x9E84u, 0xE0E2u, 0x1067u, 0x6E01u, 0xECABu, 0x92CDu,
+ 0xC51Du, 0xBB7Bu, 0x39D1u, 0x47B7u, 0xB732u, 0xC954u, 0x4BFEu, 0x3598u,
+ 0x2143u, 0x5F25u, 0xDD8Fu, 0xA3E9u, 0x536Cu, 0x2D0Au, 0xAFA0u, 0xD1C6u,
+ 0x879Bu, 0xF9FDu, 0x7B57u, 0x0531u, 0xF5B4u, 0x8BD2u, 0x0978u, 0x771Eu,
+ 0x63C5u, 0x1DA3u, 0x9F09u, 0xE16Fu, 0x11EAu, 0x6F8Cu, 0xED26u, 0x9340u,
+ 0xC490u, 0xBAF6u, 0x385Cu, 0x463Au, 0xB6BFu, 0xC8D9u, 0x4A73u, 0x3415u,
+ 0x20CEu, 0x5EA8u, 0xDC02u, 0xA264u, 0x52E1u, 0x2C87u, 0xAE2Du, 0xD04Bu,
+ 0x018Du, 0x7FEBu, 0xFD41u, 0x8327u, 0x73A2u, 0x0DC4u, 0x8F6Eu, 0xF108u,
+ 0xE5D3u, 0x9BB5u, 0x191Fu, 0x6779u, 0x97FCu, 0xE99Au, 0x6B30u, 0x1556u,
+ 0x4286u, 0x3CE0u, 0xBE4Au, 0xC02Cu, 0x30A9u, 0x4ECFu, 0xCC65u, 0xB203u,
+ 0xA6D8u, 0xD8BEu, 0x5A14u, 0x2472u, 0xD4F7u, 0xAA91u, 0x283Bu, 0x565Du,
+ 0x8481u, 0xFAE7u, 0x784Du, 0x062Bu, 0xF6AEu, 0x88C8u, 0x0A62u, 0x7404u,
+ 0x60DFu, 0x1EB9u, 0x9C13u, 0xE275u, 0x12F0u, 0x6C96u, 0xEE3Cu, 0x905Au,
+ 0xC78Au, 0xB9ECu, 0x3B46u, 0x4520u, 0xB5A5u, 0xCBC3u, 0x4969u, 0x370Fu,
+ 0x23D4u, 0x5DB2u, 0xDF18u, 0xA17Eu, 0x51FBu, 0x2F9Du, 0xAD37u, 0xD351u,
+ 0x0297u, 0x7CF1u, 0xFE5Bu, 0x803Du, 0x70B8u, 0x0EDEu, 0x8C74u, 0xF212u,
+ 0xE6C9u, 0x98AFu, 0x1A05u, 0x6463u, 0x94E6u, 0xEA80u, 0x682Au, 0x164Cu,
+ 0x419Cu, 0x3FFAu, 0xBD50u, 0xC336u, 0x33B3u, 0x4DD5u, 0xCF7Fu, 0xB119u,
+ 0xA5C2u, 0xDBA4u, 0x590Eu, 0x2768u, 0xD7EDu, 0xA98Bu, 0x2B21u, 0x5547u,
+ 0x031Au, 0x7D7Cu, 0xFFD6u, 0x81B0u, 0x7135u, 0x0F53u, 0x8DF9u, 0xF39Fu,
+ 0xE744u, 0x9922u, 0x1B88u, 0x65EEu, 0x956Bu, 0xEB0Du, 0x69A7u, 0x17C1u,
+ 0x4011u, 0x3E77u, 0xBCDDu, 0xC2BBu, 0x323Eu, 0x4C58u, 0xCEF2u, 0xB094u,
+ 0xA44Fu, 0xDA29u, 0x5883u, 0x26E5u, 0xD660u, 0xA806u, 0x2AACu, 0x54CAu,
+ 0x850Cu, 0xFB6Au, 0x79C0u, 0x07A6u, 0xF723u, 0x8945u, 0x0BEFu, 0x7589u,
+ 0x6152u, 0x1F34u, 0x9D9Eu, 0xE3F8u, 0x137Du, 0x6D1Bu, 0xEFB1u, 0x91D7u,
+ 0xC607u, 0xB861u, 0x3ACBu, 0x44ADu, 0xB428u, 0xCA4Eu, 0x48E4u, 0x3682u,
+ 0x2259u, 0x5C3Fu, 0xDE95u, 0xA0F3u, 0x5076u, 0x2E10u, 0xACBAu, 0xD2DCu
+ },
+ {
+ 0x0000u, 0x82B5u, 0x8EDDu, 0x0C68u, 0x960Du, 0x14B8u, 0x18D0u, 0x9A65u,
+ 0xA7ADu, 0x2518u, 0x2970u, 0xABC5u, 0x31A0u, 0xB315u, 0xBF7Du, 0x3DC8u,
+ 0xC4EDu, 0x4658u, 0x4A30u, 0xC885u, 0x52E0u, 0xD055u, 0xDC3Du, 0x5E88u,
+ 0x6340u, 0xE1F5u, 0xED9Du, 0x6F28u, 0xF54Du, 0x77F8u, 0x7B90u, 0xF925u,
+ 0x026Du, 0x80D8u, 0x8CB0u, 0x0E05u, 0x9460u, 0x16D5u, 0x1ABDu, 0x9808u,
+ 0xA5C0u, 0x2775u, 0x2B1Du, 0xA9A8u, 0x33CDu, 0xB178u, 0xBD10u, 0x3FA5u,
+ 0xC680u, 0x4435u, 0x485Du, 0xCAE8u, 0x508Du, 0xD238u, 0xDE50u, 0x5CE5u,
+ 0x612Du, 0xE398u, 0xEFF0u, 0x6D45u, 0xF720u, 0x7595u, 0x79FDu, 0xFB48u,
+ 0x04DAu, 0x866Fu, 0x8A07u, 0x08B2u, 0x92D7u, 0x1062u, 0x1C0Au, 0x9EBFu,
+ 0xA377u, 0x21C2u, 0x2DAAu, 0xAF1Fu, 0x357Au, 0xB7CFu, 0xBBA7u, 0x3912u,
+ 0xC037u, 0x4282u, 0x4EEAu, 0xCC5Fu, 0x563Au, 0xD48Fu, 0xD8E7u, 0x5A52u,
+ 0x679Au, 0xE52Fu, 0xE947u, 0x6BF2u, 0xF197u, 0x7322u, 0x7F4Au, 0xFDFFu,
+ 0x06B7u, 0x8402u, 0x886Au, 0x0ADFu, 0x90BAu, 0x120Fu, 0x1E67u, 0x9CD2u,
+ 0xA11Au, 0x23AFu, 0x2FC7u, 0xAD72u, 0x3717u, 0xB5A2u, 0xB9CAu, 0x3B7Fu,
+ 0xC25Au, 0x40EFu, 0x4C87u, 0xCE32u, 0x5457u, 0xD6E2u, 0xDA8Au, 0x583Fu,
+ 0x65F7u, 0xE742u, 0xEB2Au, 0x699Fu, 0xF3FAu, 0x714Fu, 0x7D27u, 0xFF92u,
+ 0x09B4u, 0x8B01u, 0x8769u, 0x05DCu, 0x9FB9u, 0x1D0Cu, 0x1164u, 0x93D1u,
+ 0xAE19u, 0x2CACu, 0x20C4u, 0xA271u, 0x3814u, 0xBAA1u, 0xB6C9u, 0x347Cu,
+ 0xCD59u, 0x4FECu, 0x4384u, 0xC131u, 0x5B54u, 0xD9E1u, 0xD589u, 0x573Cu,
+ 0x6AF4u, 0xE841u, 0xE429u, 0x669Cu, 0xFCF9u, 0x7E4Cu, 0x7224u, 0xF091u,
+ 0x0BD9u, 0x896Cu, 0x8504u, 0x07B1u, 0x9DD4u, 0x1F61u, 0x1309u, 0x91BCu,
+ 0xAC74u, 0x2EC1u, 0x22A9u, 0xA01Cu, 0x3A79u, 0xB8CCu, 0xB4A4u, 0x3611u,
+ 0xCF34u, 0x4D81u, 0x41E9u, 0xC35Cu, 0x5939u, 0xDB8Cu, 0xD7E4u, 0x5551u,
+ 0x6899u, 0xEA2Cu, 0xE644u, 0x64F1u, 0xFE94u, 0x7C21u, 0x7049u, 0xF2FCu,
+ 0x0D6Eu, 0x8FDBu, 0x83B3u, 0x0106u, 0x9B63u, 0x19D6u, 0x15BEu, 0x970Bu,
+ 0xAAC3u, 0x2876u, 0x241Eu, 0xA6ABu, 0x3CCEu, 0xBE7Bu, 0xB213u, 0x30A6u,
+ 0xC983u, 0x4B36u, 0x475Eu, 0xC5EBu, 0x5F8Eu, 0xDD3Bu, 0xD153u, 0x53E6u,
+ 0x6E2Eu, 0xEC9Bu, 0xE0F3u, 0x6246u, 0xF823u, 0x7A96u, 0x76FEu, 0xF44Bu,
+ 0x0F03u, 0x8DB6u, 0x81DEu, 0x036Bu, 0x990Eu, 0x1BBBu, 0x17D3u, 0x9566u,
+ 0xA8AEu, 0x2A1Bu, 0x2673u, 0xA4C6u, 0x3EA3u, 0xBC16u, 0xB07Eu, 0x32CBu,
+ 0xCBEEu, 0x495Bu, 0x4533u, 0xC786u, 0x5DE3u, 0xDF56u, 0xD33Eu, 0x518Bu,
+ 0x6C43u, 0xEEF6u, 0xE29Eu, 0x602Bu, 0xFA4Eu, 0x78FBu, 0x7493u, 0xF626u
+ },
+ {
+ 0x0000u, 0x1368u, 0x26D0u, 0x35B8u, 0x4DA0u, 0x5EC8u, 0x6B70u, 0x7818u,
+ 0x9B40u, 0x8828u, 0xBD90u, 0xAEF8u, 0xD6E0u, 0xC588u, 0xF030u, 0xE358u,
+ 0xBD37u, 0xAE5Fu, 0x9BE7u, 0x888Fu, 0xF097u, 0xE3FFu, 0xD647u, 0xC52Fu,
+ 0x2677u, 0x351Fu, 0x00A7u, 0x13CFu, 0x6BD7u, 0x78BFu, 0x4D07u, 0x5E6Fu,
+ 0xF1D9u, 0xE2B1u, 0xD709u, 0xC461u, 0xBC79u, 0xAF11u, 0x9AA9u, 0x89C1u,
+ 0x6A99u, 0x79F1u, 0x4C49u, 0x5F21u, 0x2739u, 0x3451u, 0x01E9u, 0x1281u,
+ 0x4CEEu, 0x5F86u, 0x6A3Eu, 0x7956u, 0x014Eu, 0x1226u, 0x279Eu, 0x34F6u,
+ 0xD7AEu, 0xC4C6u, 0xF17Eu, 0xE216u, 0x9A0Eu, 0x8966u, 0xBCDEu, 0xAFB6u,
+ 0x6805u, 0x7B6Du, 0x4ED5u, 0x5DBDu, 0x25A5u, 0x36CDu, 0x0375u, 0x101Du,
+ 0xF345u, 0xE02Du, 0xD595u, 0xC6FDu, 0xBEE5u, 0xAD8Du, 0x9835u, 0x8B5Du,
+ 0xD532u, 0xC65Au, 0xF3E2u, 0xE08Au, 0x9892u, 0x8BFAu, 0xBE42u, 0xAD2Au,
+ 0x4E72u, 0x5D1Au, 0x68A2u, 0x7BCAu, 0x03D2u, 0x10BAu, 0x2502u, 0x366Au,
+ 0x99DCu, 0x8AB4u, 0xBF0Cu, 0xAC64u, 0xD47Cu, 0xC714u, 0xF2ACu, 0xE1C4u,
+ 0x029Cu, 0x11F4u, 0x244Cu, 0x3724u, 0x4F3Cu, 0x5C54u, 0x69ECu, 0x7A84u,
+ 0x24EBu, 0x3783u, 0x023Bu, 0x1153u, 0x694Bu, 0x7A23u, 0x4F9Bu, 0x5CF3u,
+ 0xBFABu, 0xACC3u, 0x997Bu, 0x8A13u, 0xF20Bu, 0xE163u, 0xD4DBu, 0xC7B3u,
+ 0xD00Au, 0xC362u, 0xF6DAu, 0xE5B2u, 0x9DAAu, 0x8EC2u, 0xBB7Au, 0xA812u,
+ 0x4B4Au, 0x5822u, 0x6D9Au, 0x7EF2u, 0x06EAu, 0x1582u, 0x203Au, 0x3352u,
+ 0x6D3Du, 0x7E55u, 0x4BEDu, 0x5885u, 0x209Du, 0x33F5u, 0x064Du, 0x1525u,
+ 0xF67Du, 0xE515u, 0xD0ADu, 0xC3C5u, 0xBBDDu, 0xA8B5u, 0x9D0Du, 0x8E65u,
+ 0x21D3u, 0x32BBu, 0x0703u, 0x146Bu, 0x6C73u, 0x7F1Bu, 0x4AA3u, 0x59CBu,
+ 0xBA93u, 0xA9FBu, 0x9C43u, 0x8F2Bu, 0xF733u, 0xE45Bu, 0xD1E3u, 0xC28Bu,
+ 0x9CE4u, 0x8F8Cu, 0xBA34u, 0xA95Cu, 0xD144u, 0xC22Cu, 0xF794u, 0xE4FCu,
+ 0x07A4u, 0x14CCu, 0x2174u, 0x321Cu, 0x4A04u, 0x596Cu, 0x6CD4u, 0x7FBCu,
+ 0xB80Fu, 0xAB67u, 0x9EDFu, 0x8DB7u, 0xF5AFu, 0xE6C7u, 0xD37Fu, 0xC017u,
+ 0x234Fu, 0x3027u, 0x059Fu, 0x16F7u, 0x6EEFu, 0x7D87u, 0x483Fu, 0x5B57u,
+ 0x0538u, 0x1650u, 0x23E8u, 0x3080u, 0x4898u, 0x5BF0u, 0x6E48u, 0x7D20u,
+ 0x9E78u, 0x8D10u, 0xB8A8u, 0xABC0u, 0xD3D8u, 0xC0B0u, 0xF508u, 0xE660u,
+ 0x49D6u, 0x5ABEu, 0x6F06u, 0x7C6Eu, 0x0476u, 0x171Eu, 0x22A6u, 0x31CEu,
+ 0xD296u, 0xC1FEu, 0xF446u, 0xE72Eu, 0x9F36u, 0x8C5Eu, 0xB9E6u, 0xAA8Eu,
+ 0xF4E1u, 0xE789u, 0xD231u, 0xC159u, 0xB941u, 0xAA29u, 0x9F91u, 0x8CF9u,
+ 0x6FA1u, 0x7CC9u, 0x4971u, 0x5A19u, 0x2201u, 0x3169u, 0x04D1u, 0x17B9u
+ },
+ {
+ 0x0000u, 0x2BA3u, 0x5746u, 0x7CE5u, 0xAE8Cu, 0x852Fu, 0xF9CAu, 0xD269u,
+ 0xD6AFu, 0xFD0Cu, 0x81E9u, 0xAA4Au, 0x7823u, 0x5380u, 0x2F65u, 0x04C6u,
+ 0x26E9u, 0x0D4Au, 0x71AFu, 0x5A0Cu, 0x8865u, 0xA3C6u, 0xDF23u, 0xF480u,
+ 0xF046u, 0xDBE5u, 0xA700u, 0x8CA3u, 0x5ECAu, 0x7569u, 0x098Cu, 0x222Fu,
+ 0x4DD2u, 0x6671u, 0x1A94u, 0x3137u, 0xE35Eu, 0xC8FDu, 0xB418u, 0x9FBBu,
+ 0x9B7Du, 0xB0DEu, 0xCC3Bu, 0xE798u, 0x35F1u, 0x1E52u, 0x62B7u, 0x4914u,
+ 0x6B3Bu, 0x4098u, 0x3C7Du, 0x17DEu, 0xC5B7u, 0xEE14u, 0x92F1u, 0xB952u,
+ 0xBD94u, 0x9637u, 0xEAD2u, 0xC171u, 0x1318u, 0x38BBu, 0x445Eu, 0x6FFDu,
+ 0x9BA4u, 0xB007u, 0xCCE2u, 0xE741u, 0x3528u, 0x1E8Bu, 0x626Eu, 0x49CDu,
+ 0x4D0Bu, 0x66A8u, 0x1A4Du, 0x31EEu, 0xE387u, 0xC824u, 0xB4C1u, 0x9F62u,
+ 0xBD4Du, 0x96EEu, 0xEA0Bu, 0xC1A8u, 0x13C1u, 0x3862u, 0x4487u, 0x6F24u,
+ 0x6BE2u, 0x4041u, 0x3CA4u, 0x1707u, 0xC56Eu, 0xEECDu, 0x9228u, 0xB98Bu,
+ 0xD676u, 0xFDD5u, 0x8130u, 0xAA93u, 0x78FAu, 0x5359u, 0x2FBCu, 0x041Fu,
+ 0x00D9u, 0x2B7Au, 0x579Fu, 0x7C3Cu, 0xAE55u, 0x85F6u, 0xF913u, 0xD2B0u,
+ 0xF09Fu, 0xDB3Cu, 0xA7D9u, 0x8C7Au, 0x5E13u, 0x75B0u, 0x0955u, 0x22F6u,
+ 0x2630u, 0x0D93u, 0x7176u, 0x5AD5u, 0x88BCu, 0xA31Fu, 0xDFFAu, 0xF459u,
+ 0xBCFFu, 0x975Cu, 0xEBB9u, 0xC01Au, 0x1273u, 0x39D0u, 0x4535u, 0x6E96u,
+ 0x6A50u, 0x41F3u, 0x3D16u, 0x16B5u, 0xC4DCu, 0xEF7Fu, 0x939Au, 0xB839u,
+ 0x9A16u, 0xB1B5u, 0xCD50u, 0xE6F3u, 0x349Au, 0x1F39u, 0x63DCu, 0x487Fu,
+ 0x4CB9u, 0x671Au, 0x1BFFu, 0x305Cu, 0xE235u, 0xC996u, 0xB573u, 0x9ED0u,
+ 0xF12Du, 0xDA8Eu, 0xA66Bu, 0x8DC8u, 0x5FA1u, 0x7402u, 0x08E7u, 0x2344u,
+ 0x2782u, 0x0C21u, 0x70C4u, 0x5B67u, 0x890Eu, 0xA2ADu, 0xDE48u, 0xF5EBu,
+ 0xD7C4u, 0xFC67u, 0x8082u, 0xAB21u, 0x7948u, 0x52EBu, 0x2E0Eu, 0x05ADu,
+ 0x016Bu, 0x2AC8u, 0x562Du, 0x7D8Eu, 0xAFE7u, 0x8444u, 0xF8A1u, 0xD302u,
+ 0x275Bu, 0x0CF8u, 0x701Du, 0x5BBEu, 0x89D7u, 0xA274u, 0xDE91u, 0xF532u,
+ 0xF1F4u, 0xDA57u, 0xA6B2u, 0x8D11u, 0x5F78u, 0x74DBu, 0x083Eu, 0x239Du,
+ 0x01B2u, 0x2A11u, 0x56F4u, 0x7D57u, 0xAF3Eu, 0x849Du, 0xF878u, 0xD3DBu,
+ 0xD71Du, 0xFCBEu, 0x805Bu, 0xABF8u, 0x7991u, 0x5232u, 0x2ED7u, 0x0574u,
+ 0x6A89u, 0x412Au, 0x3DCFu, 0x166Cu, 0xC405u, 0xEFA6u, 0x9343u, 0xB8E0u,
+ 0xBC26u, 0x9785u, 0xEB60u, 0xC0C3u, 0x12AAu, 0x3909u, 0x45ECu, 0x6E4Fu,
+ 0x4C60u, 0x67C3u, 0x1B26u, 0x3085u, 0xE2ECu, 0xC94Fu, 0xB5AAu, 0x9E09u,
+ 0x9ACFu, 0xB16Cu, 0xCD89u, 0xE62Au, 0x3443u, 0x1FE0u, 0x6305u, 0x48A6u
+ },
+ {
+ 0x0000u, 0xF249u, 0x6F25u, 0x9D6Cu, 0xDE4Au, 0x2C03u, 0xB16Fu, 0x4326u,
+ 0x3723u, 0xC56Au, 0x5806u, 0xAA4Fu, 0xE969u, 0x1B20u, 0x864Cu, 0x7405u,
+ 0x6E46u, 0x9C0Fu, 0x0163u, 0xF32Au, 0xB00Cu, 0x4245u, 0xDF29u, 0x2D60u,
+ 0x5965u, 0xAB2Cu, 0x3640u, 0xC409u, 0x872Fu, 0x7566u, 0xE80Au, 0x1A43u,
+ 0xDC8Cu, 0x2EC5u, 0xB3A9u, 0x41E0u, 0x02C6u, 0xF08Fu, 0x6DE3u, 0x9FAAu,
+ 0xEBAFu, 0x19E6u, 0x848Au, 0x76C3u, 0x35E5u, 0xC7ACu, 0x5AC0u, 0xA889u,
+ 0xB2CAu, 0x4083u, 0xDDEFu, 0x2FA6u, 0x6C80u, 0x9EC9u, 0x03A5u, 0xF1ECu,
+ 0x85E9u, 0x77A0u, 0xEACCu, 0x1885u, 0x5BA3u, 0xA9EAu, 0x3486u, 0xC6CFu,
+ 0x32AFu, 0xC0E6u, 0x5D8Au, 0xAFC3u, 0xECE5u, 0x1EACu, 0x83C0u, 0x7189u,
+ 0x058Cu, 0xF7C5u, 0x6AA9u, 0x98E0u, 0xDBC6u, 0x298Fu, 0xB4E3u, 0x46AAu,
+ 0x5CE9u, 0xAEA0u, 0x33CCu, 0xC185u, 0x82A3u, 0x70EAu, 0xED86u, 0x1FCFu,
+ 0x6BCAu, 0x9983u, 0x04EFu, 0xF6A6u, 0xB580u, 0x47C9u, 0xDAA5u, 0x28ECu,
+ 0xEE23u, 0x1C6Au, 0x8106u, 0x734Fu, 0x3069u, 0xC220u, 0x5F4Cu, 0xAD05u,
+ 0xD900u, 0x2B49u, 0xB625u, 0x446Cu, 0x074Au, 0xF503u, 0x686Fu, 0x9A26u,
+ 0x8065u, 0x722Cu, 0xEF40u, 0x1D09u, 0x5E2Fu, 0xAC66u, 0x310Au, 0xC343u,
+ 0xB746u, 0x450Fu, 0xD863u, 0x2A2Au, 0x690Cu, 0x9B45u, 0x0629u, 0xF460u,
+ 0x655Eu, 0x9717u, 0x0A7Bu, 0xF832u, 0xBB14u, 0x495Du, 0xD431u, 0x2678u,
+ 0x527Du, 0xA034u, 0x3D58u, 0xCF11u, 0x8C37u, 0x7E7Eu, 0xE312u, 0x115Bu,
+ 0x0B18u, 0xF951u, 0x643Du, 0x9674u, 0xD552u, 0x271Bu, 0xBA77u, 0x483Eu,
+ 0x3C3Bu, 0xCE72u, 0x531Eu, 0xA157u, 0xE271u, 0x1038u, 0x8D54u, 0x7F1Du,
+ 0xB9D2u, 0x4B9Bu, 0xD6F7u, 0x24BEu, 0x6798u, 0x95D1u, 0x08BDu, 0xFAF4u,
+ 0x8EF1u, 0x7CB8u, 0xE1D4u, 0x139Du, 0x50BBu, 0xA2F2u, 0x3F9Eu, 0xCDD7u,
+ 0xD794u, 0x25DDu, 0xB8B1u, 0x4AF8u, 0x09DEu, 0xFB97u, 0x66FBu, 0x94B2u,
+ 0xE0B7u, 0x12FEu, 0x8F92u, 0x7DDBu, 0x3EFDu, 0xCCB4u, 0x51D8u, 0xA391u,
+ 0x57F1u, 0xA5B8u, 0x38D4u, 0xCA9Du, 0x89BBu, 0x7BF2u, 0xE69Eu, 0x14D7u,
+ 0x60D2u, 0x929Bu, 0x0FF7u, 0xFDBEu, 0xBE98u, 0x4CD1u, 0xD1BDu, 0x23F4u,
+ 0x39B7u, 0xCBFEu, 0x5692u, 0xA4DBu, 0xE7FDu, 0x15B4u, 0x88D8u, 0x7A91u,
+ 0x0E94u, 0xFCDDu, 0x61B1u, 0x93F8u, 0xD0DEu, 0x2297u, 0xBFFBu, 0x4DB2u,
+ 0x8B7Du, 0x7934u, 0xE458u, 0x1611u, 0x5537u, 0xA77Eu, 0x3A12u, 0xC85Bu,
+ 0xBC5Eu, 0x4E17u, 0xD37Bu, 0x2132u, 0x6214u, 0x905Du, 0x0D31u, 0xFF78u,
+ 0xE53Bu, 0x1772u, 0x8A1Eu, 0x7857u, 0x3B71u, 0xC938u, 0x5454u, 0xA61Du,
+ 0xD218u, 0x2051u, 0xBD3Du, 0x4F74u, 0x0C52u, 0xFE1Bu, 0x6377u, 0x913Eu
+ },
+ {
+ 0x0000u, 0xCABCu, 0x1ECFu, 0xD473u, 0x3D9Eu, 0xF722u, 0x2351u, 0xE9EDu,
+ 0x7B3Cu, 0xB180u, 0x65F3u, 0xAF4Fu, 0x46A2u, 0x8C1Eu, 0x586Du, 0x92D1u,
+ 0xF678u, 0x3CC4u, 0xE8B7u, 0x220Bu, 0xCBE6u, 0x015Au, 0xD529u, 0x1F95u,
+ 0x8D44u, 0x47F8u, 0x938Bu, 0x5937u, 0xB0DAu, 0x7A66u, 0xAE15u, 0x64A9u,
+ 0x6747u, 0xADFBu, 0x7988u, 0xB334u, 0x5AD9u, 0x9065u, 0x4416u, 0x8EAAu,
+ 0x1C7Bu, 0xD6C7u, 0x02B4u, 0xC808u, 0x21E5u, 0xEB59u, 0x3F2Au, 0xF596u,
+ 0x913Fu, 0x5B83u, 0x8FF0u, 0x454Cu, 0xACA1u, 0x661Du, 0xB26Eu, 0x78D2u,
+ 0xEA03u, 0x20BFu, 0xF4CCu, 0x3E70u, 0xD79Du, 0x1D21u, 0xC952u, 0x03EEu,
+ 0xCE8Eu, 0x0432u, 0xD041u, 0x1AFDu, 0xF310u, 0x39ACu, 0xEDDFu, 0x2763u,
+ 0xB5B2u, 0x7F0Eu, 0xAB7Du, 0x61C1u, 0x882Cu, 0x4290u, 0x96E3u, 0x5C5Fu,
+ 0x38F6u, 0xF24Au, 0x2639u, 0xEC85u, 0x0568u, 0xCFD4u, 0x1BA7u, 0xD11Bu,
+ 0x43CAu, 0x8976u, 0x5D05u, 0x97B9u, 0x7E54u, 0xB4E8u, 0x609Bu, 0xAA27u,
+ 0xA9C9u, 0x6375u, 0xB706u, 0x7DBAu, 0x9457u, 0x5EEBu, 0x8A98u, 0x4024u,
+ 0xD2F5u, 0x1849u, 0xCC3Au, 0x0686u, 0xEF6Bu, 0x25D7u, 0xF1A4u, 0x3B18u,
+ 0x5FB1u, 0x950Du, 0x417Eu, 0x8BC2u, 0x622Fu, 0xA893u, 0x7CE0u, 0xB65Cu,
+ 0x248Du, 0xEE31u, 0x3A42u, 0xF0FEu, 0x1913u, 0xD3AFu, 0x07DCu, 0xCD60u,
+ 0x16ABu, 0xDC17u, 0x0864u, 0xC2D8u, 0x2B35u, 0xE189u, 0x35FAu, 0xFF46u,
+ 0x6D97u, 0xA72Bu, 0x7358u, 0xB9E4u, 0x5009u, 0x9AB5u, 0x4EC6u, 0x847Au,
+ 0xE0D3u, 0x2A6Fu, 0xFE1Cu, 0x34A0u, 0xDD4Du, 0x17F1u, 0xC382u, 0x093Eu,
+ 0x9BEFu, 0x5153u, 0x8520u, 0x4F9Cu, 0xA671u, 0x6CCDu, 0xB8BEu, 0x7202u,
+ 0x71ECu, 0xBB50u, 0x6F23u, 0xA59Fu, 0x4C72u, 0x86CEu, 0x52BDu, 0x9801u,
+ 0x0AD0u, 0xC06Cu, 0x141Fu, 0xDEA3u, 0x374Eu, 0xFDF2u, 0x2981u, 0xE33Du,
+ 0x8794u, 0x4D28u, 0x995Bu, 0x53E7u, 0xBA0Au, 0x70B6u, 0xA4C5u, 0x6E79u,
+ 0xFCA8u, 0x3614u, 0xE267u, 0x28DBu, 0xC136u, 0x0B8Au, 0xDFF9u, 0x1545u,
+ 0xD825u, 0x1299u, 0xC6EAu, 0x0C56u, 0xE5BBu, 0x2F07u, 0xFB74u, 0x31C8u,
+ 0xA319u, 0x69A5u, 0xBDD6u, 0x776Au, 0x9E87u, 0x543Bu, 0x8048u, 0x4AF4u,
+ 0x2E5Du, 0xE4E1u, 0x3092u, 0xFA2Eu, 0x13C3u, 0xD97Fu, 0x0D0Cu, 0xC7B0u,
+ 0x5561u, 0x9FDDu, 0x4BAEu, 0x8112u, 0x68FFu, 0xA243u, 0x7630u, 0xBC8Cu,
+ 0xBF62u, 0x75DEu, 0xA1ADu, 0x6B11u, 0x82FCu, 0x4840u, 0x9C33u, 0x568Fu,
+ 0xC45Eu, 0x0EE2u, 0xDA91u, 0x102Du, 0xF9C0u, 0x337Cu, 0xE70Fu, 0x2DB3u,
+ 0x491Au, 0x83A6u, 0x57D5u, 0x9D69u, 0x7484u, 0xBE38u, 0x6A4Bu, 0xA0F7u,
+ 0x3226u, 0xF89Au, 0x2CE9u, 0xE655u, 0x0FB8u, 0xC504u, 0x1177u, 0xDBCBu
+ },
+ {
+ 0x0000u, 0x2D56u, 0x5AACu, 0x77FAu, 0xB558u, 0x980Eu, 0xEFF4u, 0xC2A2u,
+ 0xE107u, 0xCC51u, 0xBBABu, 0x96FDu, 0x545Fu, 0x7909u, 0x0EF3u, 0x23A5u,
+ 0x49B9u, 0x64EFu, 0x1315u, 0x3E43u, 0xFCE1u, 0xD1B7u, 0xA64Du, 0x8B1Bu,
+ 0xA8BEu, 0x85E8u, 0xF212u, 0xDF44u, 0x1DE6u, 0x30B0u, 0x474Au, 0x6A1Cu,
+ 0x9372u, 0xBE24u, 0xC9DEu, 0xE488u, 0x262Au, 0x0B7Cu, 0x7C86u, 0x51D0u,
+ 0x7275u, 0x5F23u, 0x28D9u, 0x058Fu, 0xC72Du, 0xEA7Bu, 0x9D81u, 0xB0D7u,
+ 0xDACBu, 0xF79Du, 0x8067u, 0xAD31u, 0x6F93u, 0x42C5u, 0x353Fu, 0x1869u,
+ 0x3BCCu, 0x169Au, 0x6160u, 0x4C36u, 0x8E94u, 0xA3C2u, 0xD438u, 0xF96Eu,
+ 0xAD53u, 0x8005u, 0xF7FFu, 0xDAA9u, 0x180Bu, 0x355Du, 0x42A7u, 0x6FF1u,
+ 0x4C54u, 0x6102u, 0x16F8u, 0x3BAEu, 0xF90Cu, 0xD45Au, 0xA3A0u, 0x8EF6u,
+ 0xE4EAu, 0xC9BCu, 0xBE46u, 0x9310u, 0x51B2u, 0x7CE4u, 0x0B1Eu, 0x2648u,
+ 0x05EDu, 0x28BBu, 0x5F41u, 0x7217u, 0xB0B5u, 0x9DE3u, 0xEA19u, 0xC74Fu,
+ 0x3E21u, 0x1377u, 0x648Du, 0x49DBu, 0x8B79u, 0xA62Fu, 0xD1D5u, 0xFC83u,
+ 0xDF26u, 0xF270u, 0x858Au, 0xA8DCu, 0x6A7Eu, 0x4728u, 0x30D2u, 0x1D84u,
+ 0x7798u, 0x5ACEu, 0x2D34u, 0x0062u, 0xC2C0u, 0xEF96u, 0x986Cu, 0xB53Au,
+ 0x969Fu, 0xBBC9u, 0xCC33u, 0xE165u, 0x23C7u, 0x0E91u, 0x796Bu, 0x543Du,
+ 0xD111u, 0xFC47u, 0x8BBDu, 0xA6EBu, 0x6449u, 0x491Fu, 0x3EE5u, 0x13B3u,
+ 0x3016u, 0x1D40u, 0x6ABAu, 0x47ECu, 0x854Eu, 0xA818u, 0xDFE2u, 0xF2B4u,
+ 0x98A8u, 0xB5FEu, 0xC204u, 0xEF52u, 0x2DF0u, 0x00A6u, 0x775Cu, 0x5A0Au,
+ 0x79AFu, 0x54F9u, 0x2303u, 0x0E55u, 0xCCF7u, 0xE1A1u, 0x965Bu, 0xBB0Du,
+ 0x4263u, 0x6F35u, 0x18CFu, 0x3599u, 0xF73Bu, 0xDA6Du, 0xAD97u, 0x80C1u,
+ 0xA364u, 0x8E32u, 0xF9C8u, 0xD49Eu, 0x163Cu, 0x3B6Au, 0x4C90u, 0x61C6u,
+ 0x0BDAu, 0x268Cu, 0x5176u, 0x7C20u, 0xBE82u, 0x93D4u, 0xE42Eu, 0xC978u,
+ 0xEADDu, 0xC78Bu, 0xB071u, 0x9D27u, 0x5F85u, 0x72D3u, 0x0529u, 0x287Fu,
+ 0x7C42u, 0x5114u, 0x26EEu, 0x0BB8u, 0xC91Au, 0xE44Cu, 0x93B6u, 0xBEE0u,
+ 0x9D45u, 0xB013u, 0xC7E9u, 0xEABFu, 0x281Du, 0x054Bu, 0x72B1u, 0x5FE7u,
+ 0x35FBu, 0x18ADu, 0x6F57u, 0x4201u, 0x80A3u, 0xADF5u, 0xDA0Fu, 0xF759u,
+ 0xD4FCu, 0xF9AAu, 0x8E50u, 0xA306u, 0x61A4u, 0x4CF2u, 0x3B08u, 0x165Eu,
+ 0xEF30u, 0xC266u, 0xB59Cu, 0x98CAu, 0x5A68u, 0x773Eu, 0x00C4u, 0x2D92u,
+ 0x0E37u, 0x2361u, 0x549Bu, 0x79CDu, 0xBB6Fu, 0x9639u, 0xE1C3u, 0xCC95u,
+ 0xA689u, 0x8BDFu, 0xFC25u, 0xD173u, 0x13D1u, 0x3E87u, 0x497Du, 0x642Bu,
+ 0x478Eu, 0x6AD8u, 0x1D22u, 0x3074u, 0xF2D6u, 0xDF80u, 0xA87Au, 0x852Cu
+ },
+ {
+ 0x0000u, 0x2995u, 0x532Au, 0x7ABFu, 0xA654u, 0x8FC1u, 0xF57Eu, 0xDCEBu,
+ 0xC71Fu, 0xEE8Au, 0x9435u, 0xBDA0u, 0x614Bu, 0x48DEu, 0x3261u, 0x1BF4u,
+ 0x0589u, 0x2C1Cu, 0x56A3u, 0x7F36u, 0xA3DDu, 0x8A48u, 0xF0F7u, 0xD962u,
+ 0xC296u, 0xEB03u, 0x91BCu, 0xB829u, 0x64C2u, 0x4D57u, 0x37E8u, 0x1E7Du,
+ 0x0B12u, 0x2287u, 0x5838u, 0x71ADu, 0xAD46u, 0x84D3u, 0xFE6Cu, 0xD7F9u,
+ 0xCC0Du, 0xE598u, 0x9F27u, 0xB6B2u, 0x6A59u, 0x43CCu, 0x3973u, 0x10E6u,
+ 0x0E9Bu, 0x270Eu, 0x5DB1u, 0x7424u, 0xA8CFu, 0x815Au, 0xFBE5u, 0xD270u,
+ 0xC984u, 0xE011u, 0x9AAEu, 0xB33Bu, 0x6FD0u, 0x4645u, 0x3CFAu, 0x156Fu,
+ 0x1624u, 0x3FB1u, 0x450Eu, 0x6C9Bu, 0xB070u, 0x99E5u, 0xE35Au, 0xCACFu,
+ 0xD13Bu, 0xF8AEu, 0x8211u, 0xAB84u, 0x776Fu, 0x5EFAu, 0x2445u, 0x0DD0u,
+ 0x13ADu, 0x3A38u, 0x4087u, 0x6912u, 0xB5F9u, 0x9C6Cu, 0xE6D3u, 0xCF46u,
+ 0xD4B2u, 0xFD27u, 0x8798u, 0xAE0Du, 0x72E6u, 0x5B73u, 0x21CCu, 0x0859u,
+ 0x1D36u, 0x34A3u, 0x4E1Cu, 0x6789u, 0xBB62u, 0x92F7u, 0xE848u, 0xC1DDu,
+ 0xDA29u, 0xF3BCu, 0x8903u, 0xA096u, 0x7C7Du, 0x55E8u, 0x2F57u, 0x06C2u,
+ 0x18BFu, 0x312Au, 0x4B95u, 0x6200u, 0xBEEBu, 0x977Eu, 0xEDC1u, 0xC454u,
+ 0xDFA0u, 0xF635u, 0x8C8Au, 0xA51Fu, 0x79F4u, 0x5061u, 0x2ADEu, 0x034Bu,
+ 0x2C48u, 0x05DDu, 0x7F62u, 0x56F7u, 0x8A1Cu, 0xA389u, 0xD936u, 0xF0A3u,
+ 0xEB57u, 0xC2C2u, 0xB87Du, 0x91E8u, 0x4D03u, 0x6496u, 0x1E29u, 0x37BCu,
+ 0x29C1u, 0x0054u, 0x7AEBu, 0x537Eu, 0x8F95u, 0xA600u, 0xDCBFu, 0xF52Au,
+ 0xEEDEu, 0xC74Bu, 0xBDF4u, 0x9461u, 0x488Au, 0x611Fu, 0x1BA0u, 0x3235u,
+ 0x275Au, 0x0ECFu, 0x7470u, 0x5DE5u, 0x810Eu, 0xA89Bu, 0xD224u, 0xFBB1u,
+ 0xE045u, 0xC9D0u, 0xB36Fu, 0x9AFAu, 0x4611u, 0x6F84u, 0x153Bu, 0x3CAEu,
+ 0x22D3u, 0x0B46u, 0x71F9u, 0x586Cu, 0x8487u, 0xAD12u, 0xD7ADu, 0xFE38u,
+ 0xE5CCu, 0xCC59u, 0xB6E6u, 0x9F73u, 0x4398u, 0x6A0Du, 0x10B2u, 0x3927u,
+ 0x3A6Cu, 0x13F9u, 0x6946u, 0x40D3u, 0x9C38u, 0xB5ADu, 0xCF12u, 0xE687u,
+ 0xFD73u, 0xD4E6u, 0xAE59u, 0x87CCu, 0x5B27u, 0x72B2u, 0x080Du, 0x2198u,
+ 0x3FE5u, 0x1670u, 0x6CCFu, 0x455Au, 0x99B1u, 0xB024u, 0xCA9Bu, 0xE30Eu,
+ 0xF8FAu, 0xD16Fu, 0xABD0u, 0x8245u, 0x5EAEu, 0x773Bu, 0x0D84u, 0x2411u,
+ 0x317Eu, 0x18EBu, 0x6254u, 0x4BC1u, 0x972Au, 0xBEBFu, 0xC400u, 0xED95u,
+ 0xF661u, 0xDFF4u, 0xA54Bu, 0x8CDEu, 0x5035u, 0x79A0u, 0x031Fu, 0x2A8Au,
+ 0x34F7u, 0x1D62u, 0x67DDu, 0x4E48u, 0x92A3u, 0xBB36u, 0xC189u, 0xE81Cu,
+ 0xF3E8u, 0xDA7Du, 0xA0C2u, 0x8957u, 0x55BCu, 0x7C29u, 0x0696u, 0x2F03u
+ },
+ {
+ 0x0000u, 0x5890u, 0xB120u, 0xE9B0u, 0xE9F7u, 0xB167u, 0x58D7u, 0x0047u,
+ 0x5859u, 0x00C9u, 0xE979u, 0xB1E9u, 0xB1AEu, 0xE93Eu, 0x008Eu, 0x581Eu,
+ 0xB0B2u, 0xE822u, 0x0192u, 0x5902u, 0x5945u, 0x01D5u, 0xE865u, 0xB0F5u,
+ 0xE8EBu, 0xB07Bu, 0x59CBu, 0x015Bu, 0x011Cu, 0x598Cu, 0xB03Cu, 0xE8ACu,
+ 0xEAD3u, 0xB243u, 0x5BF3u, 0x0363u, 0x0324u, 0x5BB4u, 0xB204u, 0xEA94u,
+ 0xB28Au, 0xEA1Au, 0x03AAu, 0x5B3Au, 0x5B7Du, 0x03EDu, 0xEA5Du, 0xB2CDu,
+ 0x5A61u, 0x02F1u, 0xEB41u, 0xB3D1u, 0xB396u, 0xEB06u, 0x02B6u, 0x5A26u,
+ 0x0238u, 0x5AA8u, 0xB318u, 0xEB88u, 0xEBCFu, 0xB35Fu, 0x5AEFu, 0x027Fu,
+ 0x5E11u, 0x0681u, 0xEF31u, 0xB7A1u, 0xB7E6u, 0xEF76u, 0x06C6u, 0x5E56u,
+ 0x0648u, 0x5ED8u, 0xB768u, 0xEFF8u, 0xEFBFu, 0xB72Fu, 0x5E9Fu, 0x060Fu,
+ 0xEEA3u, 0xB633u, 0x5F83u, 0x0713u, 0x0754u, 0x5FC4u, 0xB674u, 0xEEE4u,
+ 0xB6FAu, 0xEE6Au, 0x07DAu, 0x5F4Au, 0x5F0Du, 0x079Du, 0xEE2Du, 0xB6BDu,
+ 0xB4C2u, 0xEC52u, 0x05E2u, 0x5D72u, 0x5D35u, 0x05A5u, 0xEC15u, 0xB485u,
+ 0xEC9Bu, 0xB40Bu, 0x5DBBu, 0x052Bu, 0x056Cu, 0x5DFCu, 0xB44Cu, 0xECDCu,
+ 0x0470u, 0x5CE0u, 0xB550u, 0xEDC0u, 0xED87u, 0xB517u, 0x5CA7u, 0x0437u,
+ 0x5C29u, 0x04B9u, 0xED09u, 0xB599u, 0xB5DEu, 0xED4Eu, 0x04FEu, 0x5C6Eu,
+ 0xBC22u, 0xE4B2u, 0x0D02u, 0x5592u, 0x55D5u, 0x0D45u, 0xE4F5u, 0xBC65u,
+ 0xE47Bu, 0xBCEBu, 0x555Bu, 0x0DCBu, 0x0D8Cu, 0x551Cu, 0xBCACu, 0xE43Cu,
+ 0x0C90u, 0x5400u, 0xBDB0u, 0xE520u, 0xE567u, 0xBDF7u, 0x5447u, 0x0CD7u,
+ 0x54C9u, 0x0C59u, 0xE5E9u, 0xBD79u, 0xBD3Eu, 0xE5AEu, 0x0C1Eu, 0x548Eu,
+ 0x56F1u, 0x0E61u, 0xE7D1u, 0xBF41u, 0xBF06u, 0xE796u, 0x0E26u, 0x56B6u,
+ 0x0EA8u, 0x5638u, 0xBF88u, 0xE718u, 0xE75Fu, 0xBFCFu, 0x567Fu, 0x0EEFu,
+ 0xE643u, 0xBED3u, 0x5763u, 0x0FF3u, 0x0FB4u, 0x5724u, 0xBE94u, 0xE604u,
+ 0xBE1Au, 0xE68Au, 0x0F3Au, 0x57AAu, 0x57EDu, 0x0F7Du, 0xE6CDu, 0xBE5Du,
+ 0xE233u, 0xBAA3u, 0x5313u, 0x0B83u, 0x0BC4u, 0x5354u, 0xBAE4u, 0xE274u,
+ 0xBA6Au, 0xE2FAu, 0x0B4Au, 0x53DAu, 0x539Du, 0x0B0Du, 0xE2BDu, 0xBA2Du,
+ 0x5281u, 0x0A11u, 0xE3A1u, 0xBB31u, 0xBB76u, 0xE3E6u, 0x0A56u, 0x52C6u,
+ 0x0AD8u, 0x5248u, 0xBBF8u, 0xE368u, 0xE32Fu, 0xBBBFu, 0x520Fu, 0x0A9Fu,
+ 0x08E0u, 0x5070u, 0xB9C0u, 0xE150u, 0xE117u, 0xB987u, 0x5037u, 0x08A7u,
+ 0x50B9u, 0x0829u, 0xE199u, 0xB909u, 0xB94Eu, 0xE1DEu, 0x086Eu, 0x50FEu,
+ 0xB852u, 0xE0C2u, 0x0972u, 0x51E2u, 0x51A5u, 0x0935u, 0xE085u, 0xB815u,
+ 0xE00Bu, 0xB89Bu, 0x512Bu, 0x09BBu, 0x09FCu, 0x516Cu, 0xB8DCu, 0xE04Cu
+ },
+ {
+ 0x0000u, 0xF3F3u, 0x6C51u, 0x9FA2u, 0xD8A2u, 0x2B51u, 0xB4F3u, 0x4700u,
+ 0x3AF3u, 0xC900u, 0x56A2u, 0xA551u, 0xE251u, 0x11A2u, 0x8E00u, 0x7DF3u,
+ 0x75E6u, 0x8615u, 0x19B7u, 0xEA44u, 0xAD44u, 0x5EB7u, 0xC115u, 0x32E6u,
+ 0x4F15u, 0xBCE6u, 0x2344u, 0xD0B7u, 0x97B7u, 0x6444u, 0xFBE6u, 0x0815u,
+ 0xEBCCu, 0x183Fu, 0x879Du, 0x746Eu, 0x336Eu, 0xC09Du, 0x5F3Fu, 0xACCCu,
+ 0xD13Fu, 0x22CCu, 0xBD6Eu, 0x4E9Du, 0x099Du, 0xFA6Eu, 0x65CCu, 0x963Fu,
+ 0x9E2Au, 0x6DD9u, 0xF27Bu, 0x0188u, 0x4688u, 0xB57Bu, 0x2AD9u, 0xD92Au,
+ 0xA4D9u, 0x572Au, 0xC888u, 0x3B7Bu, 0x7C7Bu, 0x8F88u, 0x102Au, 0xE3D9u,
+ 0x5C2Fu, 0xAFDCu, 0x307Eu, 0xC38Du, 0x848Du, 0x777Eu, 0xE8DCu, 0x1B2Fu,
+ 0x66DCu, 0x952Fu, 0x0A8Du, 0xF97Eu, 0xBE7Eu, 0x4D8Du, 0xD22Fu, 0x21DCu,
+ 0x29C9u, 0xDA3Au, 0x4598u, 0xB66Bu, 0xF16Bu, 0x0298u, 0x9D3Au, 0x6EC9u,
+ 0x133Au, 0xE0C9u, 0x7F6Bu, 0x8C98u, 0xCB98u, 0x386Bu, 0xA7C9u, 0x543Au,
+ 0xB7E3u, 0x4410u, 0xDBB2u, 0x2841u, 0x6F41u, 0x9CB2u, 0x0310u, 0xF0E3u,
+ 0x8D10u, 0x7EE3u, 0xE141u, 0x12B2u, 0x55B2u, 0xA641u, 0x39E3u, 0xCA10u,
+ 0xC205u, 0x31F6u, 0xAE54u, 0x5DA7u, 0x1AA7u, 0xE954u, 0x76F6u, 0x8505u,
+ 0xF8F6u, 0x0B05u, 0x94A7u, 0x6754u, 0x2054u, 0xD3A7u, 0x4C05u, 0xBFF6u,
+ 0xB85Eu, 0x4BADu, 0xD40Fu, 0x27FCu, 0x60FCu, 0x930Fu, 0x0CADu, 0xFF5Eu,
+ 0x82ADu, 0x715Eu, 0xEEFCu, 0x1D0Fu, 0x5A0Fu, 0xA9FCu, 0x365Eu, 0xC5ADu,
+ 0xCDB8u, 0x3E4Bu, 0xA1E9u, 0x521Au, 0x151Au, 0xE6E9u, 0x794Bu, 0x8AB8u,
+ 0xF74Bu, 0x04B8u, 0x9B1Au, 0x68E9u, 0x2FE9u, 0xDC1Au, 0x43B8u, 0xB04Bu,
+ 0x5392u, 0xA061u, 0x3FC3u, 0xCC30u, 0x8B30u, 0x78C3u, 0xE761u, 0x1492u,
+ 0x6961u, 0x9A92u, 0x0530u, 0xF6C3u, 0xB1C3u, 0x4230u, 0xDD92u, 0x2E61u,
+ 0x2674u, 0xD587u, 0x4A25u, 0xB9D6u, 0xFED6u, 0x0D25u, 0x9287u, 0x6174u,
+ 0x1C87u, 0xEF74u, 0x70D6u, 0x8325u, 0xC425u, 0x37D6u, 0xA874u, 0x5B87u,
+ 0xE471u, 0x1782u, 0x8820u, 0x7BD3u, 0x3CD3u, 0xCF20u, 0x5082u, 0xA371u,
+ 0xDE82u, 0x2D71u, 0xB2D3u, 0x4120u, 0x0620u, 0xF5D3u, 0x6A71u, 0x9982u,
+ 0x9197u, 0x6264u, 0xFDC6u, 0x0E35u, 0x4935u, 0xBAC6u, 0x2564u, 0xD697u,
+ 0xAB64u, 0x5897u, 0xC735u, 0x34C6u, 0x73C6u, 0x8035u, 0x1F97u, 0xEC64u,
+ 0x0FBDu, 0xFC4Eu, 0x63ECu, 0x901Fu, 0xD71Fu, 0x24ECu, 0xBB4Eu, 0x48BDu,
+ 0x354Eu, 0xC6BDu, 0x591Fu, 0xAAECu, 0xEDECu, 0x1E1Fu, 0x81BDu, 0x724Eu,
+ 0x7A5Bu, 0x89A8u, 0x160Au, 0xE5F9u, 0xA2F9u, 0x510Au, 0xCEA8u, 0x3D5Bu,
+ 0x40A8u, 0xB35Bu, 0x2CF9u, 0xDF0Au, 0x980Au, 0x6BF9u, 0xF45Bu, 0x07A8u
+ },
+ {
+ 0x0000u, 0xFB0Bu, 0x7DA1u, 0x86AAu, 0xFB42u, 0x0049u, 0x86E3u, 0x7DE8u,
+ 0x7D33u, 0x8638u, 0x0092u, 0xFB99u, 0x8671u, 0x7D7Au, 0xFBD0u, 0x00DBu,
+ 0xFA66u, 0x016Du, 0x87C7u, 0x7CCCu, 0x0124u, 0xFA2Fu, 0x7C85u, 0x878Eu,
+ 0x8755u, 0x7C5Eu, 0xFAF4u, 0x01FFu, 0x7C17u, 0x871Cu, 0x01B6u, 0xFABDu,
+ 0x7F7Bu, 0x8470u, 0x02DAu, 0xF9D1u, 0x8439u, 0x7F32u, 0xF998u, 0x0293u,
+ 0x0248u, 0xF943u, 0x7FE9u, 0x84E2u, 0xF90Au, 0x0201u, 0x84ABu, 0x7FA0u,
+ 0x851Du, 0x7E16u, 0xF8BCu, 0x03B7u, 0x7E5Fu, 0x8554u, 0x03FEu, 0xF8F5u,
+ 0xF82Eu, 0x0325u, 0x858Fu, 0x7E84u, 0x036Cu, 0xF867u, 0x7ECDu, 0x85C6u,
+ 0xFEF6u, 0x05FDu, 0x8357u, 0x785Cu, 0x05B4u, 0xFEBFu, 0x7815u, 0x831Eu,
+ 0x83C5u, 0x78CEu, 0xFE64u, 0x056Fu, 0x7887u, 0x838Cu, 0x0526u, 0xFE2Du,
+ 0x0490u, 0xFF9Bu, 0x7931u, 0x823Au, 0xFFD2u, 0x04D9u, 0x8273u, 0x7978u,
+ 0x79A3u, 0x82A8u, 0x0402u, 0xFF09u, 0x82E1u, 0x79EAu, 0xFF40u, 0x044Bu,
+ 0x818Du, 0x7A86u, 0xFC2Cu, 0x0727u, 0x7ACFu, 0x81C4u, 0x076Eu, 0xFC65u,
+ 0xFCBEu, 0x07B5u, 0x811Fu, 0x7A14u, 0x07FCu, 0xFCF7u, 0x7A5Du, 0x8156u,
+ 0x7BEBu, 0x80E0u, 0x064Au, 0xFD41u, 0x80A9u, 0x7BA2u, 0xFD08u, 0x0603u,
+ 0x06D8u, 0xFDD3u, 0x7B79u, 0x8072u, 0xFD9Au, 0x0691u, 0x803Bu, 0x7B30u,
+ 0x765Bu, 0x8D50u, 0x0BFAu, 0xF0F1u, 0x8D19u, 0x7612u, 0xF0B8u, 0x0BB3u,
+ 0x0B68u, 0xF063u, 0x76C9u, 0x8DC2u, 0xF02Au, 0x0B21u, 0x8D8Bu, 0x7680u,
+ 0x8C3Du, 0x7736u, 0xF19Cu, 0x0A97u, 0x777Fu, 0x8C74u, 0x0ADEu, 0xF1D5u,
+ 0xF10Eu, 0x0A05u, 0x8CAFu, 0x77A4u, 0x0A4Cu, 0xF147u, 0x77EDu, 0x8CE6u,
+ 0x0920u, 0xF22Bu, 0x7481u, 0x8F8Au, 0xF262u, 0x0969u, 0x8FC3u, 0x74C8u,
+ 0x7413u, 0x8F18u, 0x09B2u, 0xF2B9u, 0x8F51u, 0x745Au, 0xF2F0u, 0x09FBu,
+ 0xF346u, 0x084Du, 0x8EE7u, 0x75ECu, 0x0804u, 0xF30Fu, 0x75A5u, 0x8EAEu,
+ 0x8E75u, 0x757Eu, 0xF3D4u, 0x08DFu, 0x7537u, 0x8E3Cu, 0x0896u, 0xF39Du,
+ 0x88ADu, 0x73A6u, 0xF50Cu, 0x0E07u, 0x73EFu, 0x88E4u, 0x0E4Eu, 0xF545u,
+ 0xF59Eu, 0x0E95u, 0x883Fu, 0x7334u, 0x0EDCu, 0xF5D7u, 0x737Du, 0x8876u,
+ 0x72CBu, 0x89C0u, 0x0F6Au, 0xF461u, 0x8989u, 0x7282u, 0xF428u, 0x0F23u,
+ 0x0FF8u, 0xF4F3u, 0x7259u, 0x8952u, 0xF4BAu, 0x0FB1u, 0x891Bu, 0x7210u,
+ 0xF7D6u, 0x0CDDu, 0x8A77u, 0x717Cu, 0x0C94u, 0xF79Fu, 0x7135u, 0x8A3Eu,
+ 0x8AE5u, 0x71EEu, 0xF744u, 0x0C4Fu, 0x71A7u, 0x8AACu, 0x0C06u, 0xF70Du,
+ 0x0DB0u, 0xF6BBu, 0x7011u, 0x8B1Au, 0xF6F2u, 0x0DF9u, 0x8B53u, 0x7058u,
+ 0x7083u, 0x8B88u, 0x0D22u, 0xF629u, 0x8BC1u, 0x70CAu, 0xF660u, 0x0D6Bu
+ },
+ {
+ 0x0000u, 0xECB6u, 0x52DBu, 0xBE6Du, 0xA5B6u, 0x4900u, 0xF76Du, 0x1BDBu,
+ 0xC0DBu, 0x2C6Du, 0x9200u, 0x7EB6u, 0x656Du, 0x89DBu, 0x37B6u, 0xDB00u,
+ 0x0A01u, 0xE6B7u, 0x58DAu, 0xB46Cu, 0xAFB7u, 0x4301u, 0xFD6Cu, 0x11DAu,
+ 0xCADAu, 0x266Cu, 0x9801u, 0x74B7u, 0x6F6Cu, 0x83DAu, 0x3DB7u, 0xD101u,
+ 0x1402u, 0xF8B4u, 0x46D9u, 0xAA6Fu, 0xB1B4u, 0x5D02u, 0xE36Fu, 0x0FD9u,
+ 0xD4D9u, 0x386Fu, 0x8602u, 0x6AB4u, 0x716Fu, 0x9DD9u, 0x23B4u, 0xCF02u,
+ 0x1E03u, 0xF2B5u, 0x4CD8u, 0xA06Eu, 0xBBB5u, 0x5703u, 0xE96Eu, 0x05D8u,
+ 0xDED8u, 0x326Eu, 0x8C03u, 0x60B5u, 0x7B6Eu, 0x97D8u, 0x29B5u, 0xC503u,
+ 0x2804u, 0xC4B2u, 0x7ADFu, 0x9669u, 0x8DB2u, 0x6104u, 0xDF69u, 0x33DFu,
+ 0xE8DFu, 0x0469u, 0xBA04u, 0x56B2u, 0x4D69u, 0xA1DFu, 0x1FB2u, 0xF304u,
+ 0x2205u, 0xCEB3u, 0x70DEu, 0x9C68u, 0x87B3u, 0x6B05u, 0xD568u, 0x39DEu,
+ 0xE2DEu, 0x0E68u, 0xB005u, 0x5CB3u, 0x4768u, 0xABDEu, 0x15B3u, 0xF905u,
+ 0x3C06u, 0xD0B0u, 0x6EDDu, 0x826Bu, 0x99B0u, 0x7506u, 0xCB6Bu, 0x27DDu,
+ 0xFCDDu, 0x106Bu, 0xAE06u, 0x42B0u, 0x596Bu, 0xB5DDu, 0x0BB0u, 0xE706u,
+ 0x3607u, 0xDAB1u, 0x64DCu, 0x886Au, 0x93B1u, 0x7F07u, 0xC16Au, 0x2DDCu,
+ 0xF6DCu, 0x1A6Au, 0xA407u, 0x48B1u, 0x536Au, 0xBFDCu, 0x01B1u, 0xED07u,
+ 0x5008u, 0xBCBEu, 0x02D3u, 0xEE65u, 0xF5BEu, 0x1908u, 0xA765u, 0x4BD3u,
+ 0x90D3u, 0x7C65u, 0xC208u, 0x2EBEu, 0x3565u, 0xD9D3u, 0x67BEu, 0x8B08u,
+ 0x5A09u, 0xB6BFu, 0x08D2u, 0xE464u, 0xFFBFu, 0x1309u, 0xAD64u, 0x41D2u,
+ 0x9AD2u, 0x7664u, 0xC809u, 0x24BFu, 0x3F64u, 0xD3D2u, 0x6DBFu, 0x8109u,
+ 0x440Au, 0xA8BCu, 0x16D1u, 0xFA67u, 0xE1BCu, 0x0D0Au, 0xB367u, 0x5FD1u,
+ 0x84D1u, 0x6867u, 0xD60Au, 0x3ABCu, 0x2167u, 0xCDD1u, 0x73BCu, 0x9F0Au,
+ 0x4E0Bu, 0xA2BDu, 0x1CD0u, 0xF066u, 0xEBBDu, 0x070Bu, 0xB966u, 0x55D0u,
+ 0x8ED0u, 0x6266u, 0xDC0Bu, 0x30BDu, 0x2B66u, 0xC7D0u, 0x79BDu, 0x950Bu,
+ 0x780Cu, 0x94BAu, 0x2AD7u, 0xC661u, 0xDDBAu, 0x310Cu, 0x8F61u, 0x63D7u,
+ 0xB8D7u, 0x5461u, 0xEA0Cu, 0x06BAu, 0x1D61u, 0xF1D7u, 0x4FBAu, 0xA30Cu,
+ 0x720Du, 0x9EBBu, 0x20D6u, 0xCC60u, 0xD7BBu, 0x3B0Du, 0x8560u, 0x69D6u,
+ 0xB2D6u, 0x5E60u, 0xE00Du, 0x0CBBu, 0x1760u, 0xFBD6u, 0x45BBu, 0xA90Du,
+ 0x6C0Eu, 0x80B8u, 0x3ED5u, 0xD263u, 0xC9B8u, 0x250Eu, 0x9B63u, 0x77D5u,
+ 0xACD5u, 0x4063u, 0xFE0Eu, 0x12B8u, 0x0963u, 0xE5D5u, 0x5BB8u, 0xB70Eu,
+ 0x660Fu, 0x8AB9u, 0x34D4u, 0xD862u, 0xC3B9u, 0x2F0Fu, 0x9162u, 0x7DD4u,
+ 0xA6D4u, 0x4A62u, 0xF40Fu, 0x18B9u, 0x0362u, 0xEFD4u, 0x51B9u, 0xBD0Fu
+ },
+ {
+ 0x0000u, 0xA010u, 0xCB97u, 0x6B87u, 0x1C99u, 0xBC89u, 0xD70Eu, 0x771Eu,
+ 0x3932u, 0x9922u, 0xF2A5u, 0x52B5u, 0x25ABu, 0x85BBu, 0xEE3Cu, 0x4E2Cu,
+ 0x7264u, 0xD274u, 0xB9F3u, 0x19E3u, 0x6EFDu, 0xCEEDu, 0xA56Au, 0x057Au,
+ 0x4B56u, 0xEB46u, 0x80C1u, 0x20D1u, 0x57CFu, 0xF7DFu, 0x9C58u, 0x3C48u,
+ 0xE4C8u, 0x44D8u, 0x2F5Fu, 0x8F4Fu, 0xF851u, 0x5841u, 0x33C6u, 0x93D6u,
+ 0xDDFAu, 0x7DEAu, 0x166Du, 0xB67Du, 0xC163u, 0x6173u, 0x0AF4u, 0xAAE4u,
+ 0x96ACu, 0x36BCu, 0x5D3Bu, 0xFD2Bu, 0x8A35u, 0x2A25u, 0x41A2u, 0xE1B2u,
+ 0xAF9Eu, 0x0F8Eu, 0x6409u, 0xC419u, 0xB307u, 0x1317u, 0x7890u, 0xD880u,
+ 0x4227u, 0xE237u, 0x89B0u, 0x29A0u, 0x5EBEu, 0xFEAEu, 0x9529u, 0x3539u,
+ 0x7B15u, 0xDB05u, 0xB082u, 0x1092u, 0x678Cu, 0xC79Cu, 0xAC1Bu, 0x0C0Bu,
+ 0x3043u, 0x9053u, 0xFBD4u, 0x5BC4u, 0x2CDAu, 0x8CCAu, 0xE74Du, 0x475Du,
+ 0x0971u, 0xA961u, 0xC2E6u, 0x62F6u, 0x15E8u, 0xB5F8u, 0xDE7Fu, 0x7E6Fu,
+ 0xA6EFu, 0x06FFu, 0x6D78u, 0xCD68u, 0xBA76u, 0x1A66u, 0x71E1u, 0xD1F1u,
+ 0x9FDDu, 0x3FCDu, 0x544Au, 0xF45Au, 0x8344u, 0x2354u, 0x48D3u, 0xE8C3u,
+ 0xD48Bu, 0x749Bu, 0x1F1Cu, 0xBF0Cu, 0xC812u, 0x6802u, 0x0385u, 0xA395u,
+ 0xEDB9u, 0x4DA9u, 0x262Eu, 0x863Eu, 0xF120u, 0x5130u, 0x3AB7u, 0x9AA7u,
+ 0x844Eu, 0x245Eu, 0x4FD9u, 0xEFC9u, 0x98D7u, 0x38C7u, 0x5340u, 0xF350u,
+ 0xBD7Cu, 0x1D6Cu, 0x76EBu, 0xD6FBu, 0xA1E5u, 0x01F5u, 0x6A72u, 0xCA62u,
+ 0xF62Au, 0x563Au, 0x3DBDu, 0x9DADu, 0xEAB3u, 0x4AA3u, 0x2124u, 0x8134u,
+ 0xCF18u, 0x6F08u, 0x048Fu, 0xA49Fu, 0xD381u, 0x7391u, 0x1816u, 0xB806u,
+ 0x6086u, 0xC096u, 0xAB11u, 0x0B01u, 0x7C1Fu, 0xDC0Fu, 0xB788u, 0x1798u,
+ 0x59B4u, 0xF9A4u, 0x9223u, 0x3233u, 0x452Du, 0xE53Du, 0x8EBAu, 0x2EAAu,
+ 0x12E2u, 0xB2F2u, 0xD975u, 0x7965u, 0x0E7Bu, 0xAE6Bu, 0xC5ECu, 0x65FCu,
+ 0x2BD0u, 0x8BC0u, 0xE047u, 0x4057u, 0x3749u, 0x9759u, 0xFCDEu, 0x5CCEu,
+ 0xC669u, 0x6679u, 0x0DFEu, 0xADEEu, 0xDAF0u, 0x7AE0u, 0x1167u, 0xB177u,
+ 0xFF5Bu, 0x5F4Bu, 0x34CCu, 0x94DCu, 0xE3C2u, 0x43D2u, 0x2855u, 0x8845u,
+ 0xB40Du, 0x141Du, 0x7F9Au, 0xDF8Au, 0xA894u, 0x0884u, 0x6303u, 0xC313u,
+ 0x8D3Fu, 0x2D2Fu, 0x46A8u, 0xE6B8u, 0x91A6u, 0x31B6u, 0x5A31u, 0xFA21u,
+ 0x22A1u, 0x82B1u, 0xE936u, 0x4926u, 0x3E38u, 0x9E28u, 0xF5AFu, 0x55BFu,
+ 0x1B93u, 0xBB83u, 0xD004u, 0x7014u, 0x070Au, 0xA71Au, 0xCC9Du, 0x6C8Du,
+ 0x50C5u, 0xF0D5u, 0x9B52u, 0x3B42u, 0x4C5Cu, 0xEC4Cu, 0x87CBu, 0x27DBu,
+ 0x69F7u, 0xC9E7u, 0xA260u, 0x0270u, 0x756Eu, 0xD57Eu, 0xBEF9u, 0x1EE9u
+ },
+ {
+ 0x0000u, 0x832Bu, 0x8DE1u, 0x0ECAu, 0x9075u, 0x135Eu, 0x1D94u, 0x9EBFu,
+ 0xAB5Du, 0x2876u, 0x26BCu, 0xA597u, 0x3B28u, 0xB803u, 0xB6C9u, 0x35E2u,
+ 0xDD0Du, 0x5E26u, 0x50ECu, 0xD3C7u, 0x4D78u, 0xCE53u, 0xC099u, 0x43B2u,
+ 0x7650u, 0xF57Bu, 0xFBB1u, 0x789Au, 0xE625u, 0x650Eu, 0x6BC4u, 0xE8EFu,
+ 0x31ADu, 0xB286u, 0xBC4Cu, 0x3F67u, 0xA1D8u, 0x22F3u, 0x2C39u, 0xAF12u,
+ 0x9AF0u, 0x19DBu, 0x1711u, 0x943Au, 0x0A85u, 0x89AEu, 0x8764u, 0x044Fu,
+ 0xECA0u, 0x6F8Bu, 0x6141u, 0xE26Au, 0x7CD5u, 0xFFFEu, 0xF134u, 0x721Fu,
+ 0x47FDu, 0xC4D6u, 0xCA1Cu, 0x4937u, 0xD788u, 0x54A3u, 0x5A69u, 0xD942u,
+ 0x635Au, 0xE071u, 0xEEBBu, 0x6D90u, 0xF32Fu, 0x7004u, 0x7ECEu, 0xFDE5u,
+ 0xC807u, 0x4B2Cu, 0x45E6u, 0xC6CDu, 0x5872u, 0xDB59u, 0xD593u, 0x56B8u,
+ 0xBE57u, 0x3D7Cu, 0x33B6u, 0xB09Du, 0x2E22u, 0xAD09u, 0xA3C3u, 0x20E8u,
+ 0x150Au, 0x9621u, 0x98EBu, 0x1BC0u, 0x857Fu, 0x0654u, 0x089Eu, 0x8BB5u,
+ 0x52F7u, 0xD1DCu, 0xDF16u, 0x5C3Du, 0xC282u, 0x41A9u, 0x4F63u, 0xCC48u,
+ 0xF9AAu, 0x7A81u, 0x744Bu, 0xF760u, 0x69DFu, 0xEAF4u, 0xE43Eu, 0x6715u,
+ 0x8FFAu, 0x0CD1u, 0x021Bu, 0x8130u, 0x1F8Fu, 0x9CA4u, 0x926Eu, 0x1145u,
+ 0x24A7u, 0xA78Cu, 0xA946u, 0x2A6Du, 0xB4D2u, 0x37F9u, 0x3933u, 0xBA18u,
+ 0xC6B4u, 0x459Fu, 0x4B55u, 0xC87Eu, 0x56C1u, 0xD5EAu, 0xDB20u, 0x580Bu,
+ 0x6DE9u, 0xEEC2u, 0xE008u, 0x6323u, 0xFD9Cu, 0x7EB7u, 0x707Du, 0xF356u,
+ 0x1BB9u, 0x9892u, 0x9658u, 0x1573u, 0x8BCCu, 0x08E7u, 0x062Du, 0x8506u,
+ 0xB0E4u, 0x33CFu, 0x3D05u, 0xBE2Eu, 0x2091u, 0xA3BAu, 0xAD70u, 0x2E5Bu,
+ 0xF719u, 0x7432u, 0x7AF8u, 0xF9D3u, 0x676Cu, 0xE447u, 0xEA8Du, 0x69A6u,
+ 0x5C44u, 0xDF6Fu, 0xD1A5u, 0x528Eu, 0xCC31u, 0x4F1Au, 0x41D0u, 0xC2FBu,
+ 0x2A14u, 0xA93Fu, 0xA7F5u, 0x24DEu, 0xBA61u, 0x394Au, 0x3780u, 0xB4ABu,
+ 0x8149u, 0x0262u, 0x0CA8u, 0x8F83u, 0x113Cu, 0x9217u, 0x9CDDu, 0x1FF6u,
+ 0xA5EEu, 0x26C5u, 0x280Fu, 0xAB24u, 0x359Bu, 0xB6B0u, 0xB87Au, 0x3B51u,
+ 0x0EB3u, 0x8D98u, 0x8352u, 0x0079u, 0x9EC6u, 0x1DEDu, 0x1327u, 0x900Cu,
+ 0x78E3u, 0xFBC8u, 0xF502u, 0x7629u, 0xE896u, 0x6BBDu, 0x6577u, 0xE65Cu,
+ 0xD3BEu, 0x5095u, 0x5E5Fu, 0xDD74u, 0x43CBu, 0xC0E0u, 0xCE2Au, 0x4D01u,
+ 0x9443u, 0x1768u, 0x19A2u, 0x9A89u, 0x0436u, 0x871Du, 0x89D7u, 0x0AFCu,
+ 0x3F1Eu, 0xBC35u, 0xB2FFu, 0x31D4u, 0xAF6Bu, 0x2C40u, 0x228Au, 0xA1A1u,
+ 0x494Eu, 0xCA65u, 0xC4AFu, 0x4784u, 0xD93Bu, 0x5A10u, 0x54DAu, 0xD7F1u,
+ 0xE213u, 0x6138u, 0x6FF2u, 0xECD9u, 0x7266u, 0xF14Du, 0xFF87u, 0x7CACu
+ }
+};
+
+static inline uint16_t
+crc_update_fast(uint16_t crc, const void *data, size_t data_len)
+{
+ const unsigned char *d = (const unsigned char *)data;
+ const unsigned char *d_end = d + data_len;
+ const unsigned char *d_last16 = d + (data_len & ~0x0F);
+
+ for (; d < d_last16 ; d += 16) {
+ crc = crc_table_fast[15][d[0] ^ (uint8_t)(crc >> 8)] ^
+ crc_table_fast[14][d[1] ^ (uint8_t)(crc >> 0)] ^
+ crc_table_fast[13][d[2]] ^
+ crc_table_fast[12][d[3]] ^
+ crc_table_fast[11][d[4]] ^
+ crc_table_fast[10][d[5]] ^
+ crc_table_fast[9][d[6]] ^
+ crc_table_fast[8][d[7]] ^
+ crc_table_fast[7][d[8]] ^
+ crc_table_fast[6][d[9]] ^
+ crc_table_fast[5][d[10]] ^
+ crc_table_fast[4][d[11]] ^
+ crc_table_fast[3][d[12]] ^
+ crc_table_fast[2][d[13]] ^
+ crc_table_fast[1][d[14]] ^
+ crc_table_fast[0][d[15]];
+ }
+ for (; d < d_end ; d++) {
+ crc = (crc << 8) ^ crc_table_fast[0][((uint8_t)(crc >> 8) ^ *d)];
+ }
+ return crc & 0xffff;
+}
+
+static inline uint16_t
+crc16_table_t10dif(uint16_t init_crc, const void *buf, size_t len)
+{
+ uint16_t crc;
+ const uint8_t *data = (const uint8_t *)buf;
+
+ crc = init_crc;
+ crc = crc_update_fast(crc, data, len);
+ return crc;
+}
+
+uint16_t
+spdk_crc16_t10dif(uint16_t init_crc, const void *buf, size_t len)
+{
+ return (crc16_table_t10dif(init_crc, buf, len));
+}
+
+uint16_t
+spdk_crc16_t10dif_copy(uint16_t init_crc, uint8_t *dst, uint8_t *src, size_t len)
+{
+ memcpy(dst, src, len);
+ return (crc16_table_t10dif(init_crc, src, len));
+}
+
+#endif
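
The tables and helpers above implement the T10-DIF CRC-16 (polynomial 0x8BB7) with a slice-by-16 lookup, consuming 16 bytes per iteration and finishing with a byte-at-a-time tail loop. As a usage sketch only (the wrapper name and the zero seed are illustrative assumptions, not part of the patch), computing the guard tag for one 512-byte data block looks like this:

    #include <stdint.h>
    #include "spdk/crc16.h"

    /* Sketch: guard tag for one 512-byte data block. A zero seed is the
     * common T10-DIF convention; use the configured guard seed instead if
     * the DIF context was initialized with a non-zero one. */
    static uint16_t
    example_guard_for_block(const uint8_t block[512])
    {
            return spdk_crc16_t10dif(0, block, 512);
    }
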
diff --git a/src/spdk/lib/util/crc32.c b/src/spdk/lib/util/crc32.c
new file mode 100644
index 000000000..34bb60b78
--- /dev/null
+++ b/src/spdk/lib/util/crc32.c
@@ -0,0 +1,95 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "util_internal.h"
+#include "spdk/crc32.h"
+
+void
+crc32_table_init(struct spdk_crc32_table *table, uint32_t polynomial_reflect)
+{
+ int i, j;
+ uint32_t val;
+
+ for (i = 0; i < 256; i++) {
+ val = i;
+ for (j = 0; j < 8; j++) {
+ if (val & 1) {
+ val = (val >> 1) ^ polynomial_reflect;
+ } else {
+ val = (val >> 1);
+ }
+ }
+ table->table[i] = val;
+ }
+}
+
+#ifdef SPDK_HAVE_ARM_CRC
+
+uint32_t
+crc32_update(const struct spdk_crc32_table *table, const void *buf, size_t len, uint32_t crc)
+{
+ size_t count;
+ const uint64_t *dword_buf;
+
+ count = len & 7;
+ while (count--) {
+ crc = __crc32b(crc, *(const uint8_t *)buf);
+ buf++;
+ }
+ dword_buf = (const uint64_t *)buf;
+
+ count = len / 8;
+ while (count--) {
+ crc = __crc32d(crc, *dword_buf);
+ dword_buf++;
+ }
+
+ return crc;
+}
+
+#else
+
+uint32_t
+crc32_update(const struct spdk_crc32_table *table, const void *buf, size_t len, uint32_t crc)
+{
+ const uint8_t *buf_u8 = buf;
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ crc = (crc >> 8) ^ table->table[(crc ^ buf_u8[i]) & 0xff];
+ }
+
+ return crc;
+}
+
+#endif
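
crc32_table_init() builds the standard 256-entry table for a reflected (LSB-first) CRC-32 from the supplied reflected polynomial, and the generic crc32_update() folds one byte per step through that table; the ARM path replaces the table with the __crc32b/__crc32d instructions. A minimal standalone sketch of the same table-driven technique (hypothetical names; 0xEDB88320 is the reflected IEEE polynomial):

    #include <stddef.h>
    #include <stdint.h>

    static uint32_t g_tbl[256];

    /* Build the byte-wise table for a reflected CRC-32. */
    static void
    tbl_init(uint32_t poly_reflect)
    {
            for (uint32_t i = 0; i < 256; i++) {
                    uint32_t v = i;

                    for (int j = 0; j < 8; j++) {
                            v = (v & 1) ? (v >> 1) ^ poly_reflect : v >> 1;
                    }
                    g_tbl[i] = v;
            }
    }

    /* One table lookup per input byte, mirroring crc32_update() above. */
    static uint32_t
    crc32_sketch(const void *buf, size_t len, uint32_t crc)
    {
            const uint8_t *p = buf;

            for (size_t i = 0; i < len; i++) {
                    crc = (crc >> 8) ^ g_tbl[(crc ^ p[i]) & 0xff];
            }
            return crc;
    }

    /* Example: tbl_init(0xEDB88320); crc = crc32_sketch(buf, len, ~0U) ^ ~0U; */
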
diff --git a/src/spdk/lib/util/crc32_ieee.c b/src/spdk/lib/util/crc32_ieee.c
new file mode 100644
index 000000000..ddc3c9901
--- /dev/null
+++ b/src/spdk/lib/util/crc32_ieee.c
@@ -0,0 +1,49 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "util_internal.h"
+#include "spdk/crc32.h"
+
+static struct spdk_crc32_table g_crc32_ieee_table;
+
+__attribute__((constructor)) static void
+crc32_ieee_init(void)
+{
+ crc32_table_init(&g_crc32_ieee_table, SPDK_CRC32_POLYNOMIAL_REFLECT);
+}
+
+uint32_t
+spdk_crc32_ieee_update(const void *buf, size_t len, uint32_t crc)
+{
+ return crc32_update(&g_crc32_ieee_table, buf, len, crc);
+}
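
spdk_crc32_ieee_update() exposes only the raw table update: it applies neither an initial value nor a final inversion. To obtain the conventional IEEE CRC-32 (the zlib/GPT-style value), callers typically seed with all ones and complement the result; a hedged sketch with an illustrative wrapper name:

    #include <stddef.h>
    #include <stdint.h>
    #include "spdk/crc32.h"

    /* Sketch: conventional IEEE CRC-32, assuming the usual ~0 seed and
     * final complement are applied by the caller. */
    static uint32_t
    example_crc32_ieee(const void *buf, size_t len)
    {
            return spdk_crc32_ieee_update(buf, len, ~0U) ^ ~0U;
    }
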
diff --git a/src/spdk/lib/util/crc32c.c b/src/spdk/lib/util/crc32c.c
new file mode 100644
index 000000000..9acd8d80f
--- /dev/null
+++ b/src/spdk/lib/util/crc32c.c
@@ -0,0 +1,133 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "util_internal.h"
+#include "spdk/crc32.h"
+
+#ifdef SPDK_CONFIG_ISAL
+#define SPDK_HAVE_ISAL
+#include <isa-l/include/crc.h>
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+#define SPDK_HAVE_ARM_CRC
+#include <arm_acle.h>
+#elif defined(__x86_64__) && defined(__SSE4_2__)
+#define SPDK_HAVE_SSE4_2
+#include <x86intrin.h>
+#endif
+
+#ifdef SPDK_HAVE_ISAL
+
+uint32_t
+spdk_crc32c_update(const void *buf, size_t len, uint32_t crc)
+{
+ return crc32_iscsi((unsigned char *)buf, len, crc);
+}
+
+#elif defined(SPDK_HAVE_SSE4_2)
+
+uint32_t
+spdk_crc32c_update(const void *buf, size_t len, uint32_t crc)
+{
+ uint64_t crc_tmp64;
+ size_t count;
+
+ /* _mm_crc32_u64() needs a 64-bit intermediate value */
+ crc_tmp64 = crc;
+
+ /* Process as much of the buffer as possible in 64-bit blocks. */
+ count = len / 8;
+ while (count--) {
+ uint64_t block;
+
+ /*
+ * Use memcpy() to avoid unaligned loads, which are undefined behavior in C.
+ * The compiler will optimize out the memcpy() in release builds.
+ */
+ memcpy(&block, buf, sizeof(block));
+ crc_tmp64 = _mm_crc32_u64(crc_tmp64, block);
+ buf += sizeof(block);
+ }
+ crc = (uint32_t)crc_tmp64;
+
+ /* Handle any trailing bytes. */
+ count = len & 7;
+ while (count--) {
+ crc = _mm_crc32_u8(crc, *(const uint8_t *)buf);
+ buf++;
+ }
+
+ return crc;
+}
+
+#elif defined(SPDK_HAVE_ARM_CRC)
+
+uint32_t
+spdk_crc32c_update(const void *buf, size_t len, uint32_t crc)
+{
+ size_t count;
+
+ count = len / 8;
+ while (count--) {
+ uint64_t block;
+
+ memcpy(&block, buf, sizeof(block));
+ crc = __crc32cd(crc, block);
+ buf += sizeof(block);
+ }
+
+ count = len & 7;
+ while (count--) {
+ crc = __crc32cb(crc, *(const uint8_t *)buf);
+ buf++;
+ }
+
+ return crc;
+}
+
+#else /* Neither SSE 4.2 nor ARM CRC32 instructions available */
+
+static struct spdk_crc32_table g_crc32c_table;
+
+__attribute__((constructor)) static void
+crc32c_init(void)
+{
+ crc32_table_init(&g_crc32c_table, SPDK_CRC32C_POLYNOMIAL_REFLECT);
+}
+
+uint32_t
+spdk_crc32c_update(const void *buf, size_t len, uint32_t crc)
+{
+ return crc32_update(&g_crc32c_table, buf, len, crc);
+}
+
+#endif
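
Whichever backend is selected (ISA-L, SSE4.2, ARMv8 CRC, or the table fallback), spdk_crc32c_update() keeps the same raw-update semantics, so a running CRC can be chained across several buffers. A hedged sketch with illustrative names, using the common CRC-32C convention of an all-ones seed and a final complement (as in iSCSI data digests):

    #include <stdint.h>
    #include <sys/uio.h>
    #include "spdk/crc32.h"

    /* Sketch: CRC-32C chained across the elements of an iovec array. */
    static uint32_t
    example_crc32c_iov(const struct iovec *iov, int iovcnt)
    {
            uint32_t crc = ~0U;

            for (int i = 0; i < iovcnt; i++) {
                    crc = spdk_crc32c_update(iov[i].iov_base, iov[i].iov_len, crc);
            }
            return crc ^ ~0U;
    }
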
diff --git a/src/spdk/lib/util/dif.c b/src/spdk/lib/util/dif.c
new file mode 100644
index 000000000..64bce1487
--- /dev/null
+++ b/src/spdk/lib/util/dif.c
@@ -0,0 +1,1999 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/dif.h"
+#include "spdk/crc16.h"
+#include "spdk/crc32.h"
+#include "spdk/endian.h"
+#include "spdk/log.h"
+#include "spdk/util.h"
+
+/* Context used to iterate over or to build an iovec array.
+ * A given SGL is used either for iteration or for building, not both.
+ */
+struct _dif_sgl {
+ /* Current iovec in the iteration or creation */
+ struct iovec *iov;
+
+ /* Remaining count of iovecs in the iteration or creation. */
+ int iovcnt;
+
+ /* Current offset in the iovec */
+ uint32_t iov_offset;
+
+ /* Size of the created iovec array in bytes */
+ uint32_t total_size;
+};
+
+static inline void
+_dif_sgl_init(struct _dif_sgl *s, struct iovec *iovs, int iovcnt)
+{
+ s->iov = iovs;
+ s->iovcnt = iovcnt;
+ s->iov_offset = 0;
+ s->total_size = 0;
+}
+
+static void
+_dif_sgl_advance(struct _dif_sgl *s, uint32_t step)
+{
+ s->iov_offset += step;
+ while (s->iovcnt != 0) {
+ if (s->iov_offset < s->iov->iov_len) {
+ break;
+ }
+
+ s->iov_offset -= s->iov->iov_len;
+ s->iov++;
+ s->iovcnt--;
+ }
+}
+
+static inline void
+_dif_sgl_get_buf(struct _dif_sgl *s, void **_buf, uint32_t *_buf_len)
+{
+ if (_buf != NULL) {
+ *_buf = s->iov->iov_base + s->iov_offset;
+ }
+ if (_buf_len != NULL) {
+ *_buf_len = s->iov->iov_len - s->iov_offset;
+ }
+}
+
+static inline bool
+_dif_sgl_append(struct _dif_sgl *s, uint8_t *data, uint32_t data_len)
+{
+ assert(s->iovcnt > 0);
+ s->iov->iov_base = data;
+ s->iov->iov_len = data_len;
+ s->total_size += data_len;
+ s->iov++;
+ s->iovcnt--;
+
+ if (s->iovcnt > 0) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+static inline bool
+_dif_sgl_append_split(struct _dif_sgl *dst, struct _dif_sgl *src, uint32_t data_len)
+{
+ uint8_t *buf;
+ uint32_t buf_len;
+
+ while (data_len != 0) {
+ _dif_sgl_get_buf(src, (void *)&buf, &buf_len);
+ buf_len = spdk_min(buf_len, data_len);
+
+ if (!_dif_sgl_append(dst, buf, buf_len)) {
+ return false;
+ }
+
+ _dif_sgl_advance(src, buf_len);
+ data_len -= buf_len;
+ }
+
+ return true;
+}
+
+/* This function must be used before starting iteration. */
+static bool
+_dif_sgl_is_bytes_multiple(struct _dif_sgl *s, uint32_t bytes)
+{
+ int i;
+
+ for (i = 0; i < s->iovcnt; i++) {
+ if (s->iov[i].iov_len % bytes) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* This function must be used before starting iteration. */
+static bool
+_dif_sgl_is_valid(struct _dif_sgl *s, uint32_t bytes)
+{
+ uint64_t total = 0;
+ int i;
+
+ for (i = 0; i < s->iovcnt; i++) {
+ total += s->iov[i].iov_len;
+ }
+
+ return total >= bytes;
+}
+
+static void
+_dif_sgl_copy(struct _dif_sgl *to, struct _dif_sgl *from)
+{
+ memcpy(to, from, sizeof(struct _dif_sgl));
+}
+
+static bool
+_dif_type_is_valid(enum spdk_dif_type dif_type, uint32_t dif_flags)
+{
+ switch (dif_type) {
+ case SPDK_DIF_TYPE1:
+ case SPDK_DIF_TYPE2:
+ case SPDK_DIF_DISABLE:
+ break;
+ case SPDK_DIF_TYPE3:
+ if (dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) {
+ SPDK_ERRLOG("Reference Tag should not be checked for Type 3\n");
+ return false;
+ }
+ break;
+ default:
+ SPDK_ERRLOG("Unknown DIF Type: %d\n", dif_type);
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+_dif_is_disabled(enum spdk_dif_type dif_type)
+{
+ if (dif_type == SPDK_DIF_DISABLE) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+
+static uint32_t
+_get_guard_interval(uint32_t block_size, uint32_t md_size, bool dif_loc, bool md_interleave)
+{
+ if (!dif_loc) {
+ /* For metadata formats with more than 8 bytes, if the DIF is
+ * contained in the last 8 bytes of metadata, then the CRC
+ * covers all metadata up to but excluding these last 8 bytes.
+ */
+ if (md_interleave) {
+ return block_size - sizeof(struct spdk_dif);
+ } else {
+ return md_size - sizeof(struct spdk_dif);
+ }
+ } else {
+ /* For metadata formats with more than 8 bytes, if the DIF is
+ * contained in the first 8 bytes of metadata, then the CRC
+ * does not cover any metadata.
+ */
+ if (md_interleave) {
+ return block_size - md_size;
+ } else {
+ return 0;
+ }
+ }
+}
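
Worked example of the rule above: for an interleaved 512+8 format (block_size = 520, md_size = 8) with the DIF in the last 8 bytes of metadata (dif_loc == false), the guard interval is 520 - 8 = 512, so the guard CRC covers exactly the data bytes. With 16 bytes of interleaved metadata (block_size = 528) the same setting gives 528 - 8 = 520, so the CRC also covers the first 8 metadata bytes, whereas placing the DIF in the first 8 metadata bytes (dif_loc == true) gives 528 - 16 = 512 and leaves the extra metadata outside the guard.
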
+
+int
+spdk_dif_ctx_init(struct spdk_dif_ctx *ctx, uint32_t block_size, uint32_t md_size,
+ bool md_interleave, bool dif_loc, enum spdk_dif_type dif_type, uint32_t dif_flags,
+ uint32_t init_ref_tag, uint16_t apptag_mask, uint16_t app_tag,
+ uint32_t data_offset, uint16_t guard_seed)
+{
+ uint32_t data_block_size;
+
+ if (md_size < sizeof(struct spdk_dif)) {
+ SPDK_ERRLOG("Metadata size is smaller than DIF size.\n");
+ return -EINVAL;
+ }
+
+ if (md_interleave) {
+ if (block_size < md_size) {
+ SPDK_ERRLOG("Block size is smaller than metadata size.\n");
+ return -EINVAL;
+ }
+ data_block_size = block_size - md_size;
+ } else {
+ if (block_size == 0 || (block_size % 512) != 0) {
+ SPDK_ERRLOG("Zero or non-512B-multiple block size is not allowed\n");
+ return -EINVAL;
+ }
+ data_block_size = block_size;
+ }
+
+ if (!_dif_type_is_valid(dif_type, dif_flags)) {
+ SPDK_ERRLOG("DIF type is invalid.\n");
+ return -EINVAL;
+ }
+
+ ctx->block_size = block_size;
+ ctx->md_size = md_size;
+ ctx->md_interleave = md_interleave;
+ ctx->guard_interval = _get_guard_interval(block_size, md_size, dif_loc, md_interleave);
+ ctx->dif_type = dif_type;
+ ctx->dif_flags = dif_flags;
+ ctx->init_ref_tag = init_ref_tag;
+ ctx->apptag_mask = apptag_mask;
+ ctx->app_tag = app_tag;
+ ctx->data_offset = data_offset;
+ ctx->ref_tag_offset = data_offset / data_block_size;
+ ctx->last_guard = guard_seed;
+ ctx->guard_seed = guard_seed;
+ ctx->remapped_init_ref_tag = 0;
+
+ return 0;
+}
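
A hedged initialization sketch for the context set up above: a 520-byte interleaved block (512 B data + 8 B metadata), DIF kept in the last 8 bytes of metadata, Type 1 protection with all three checks enabled. The tag values and the wrapper name are illustrative assumptions, not taken from the patch:

    #include <stdint.h>
    #include "spdk/dif.h"

    /* Sketch: DIF context for a 512+8 interleaved Type 1 format. */
    static int
    example_dif_ctx_init(struct spdk_dif_ctx *ctx, uint32_t start_lba)
    {
            uint32_t flags = SPDK_DIF_FLAGS_GUARD_CHECK |
                             SPDK_DIF_FLAGS_APPTAG_CHECK |
                             SPDK_DIF_FLAGS_REFTAG_CHECK;

            return spdk_dif_ctx_init(ctx,
                                     520,            /* block_size */
                                     8,              /* md_size */
                                     true,           /* md_interleave */
                                     false,          /* dif_loc: DIF in last 8 bytes of md */
                                     SPDK_DIF_TYPE1,
                                     flags,
                                     start_lba,      /* init_ref_tag */
                                     0xFFFF,         /* apptag_mask (illustrative) */
                                     0x1234,         /* app_tag (illustrative) */
                                     0,              /* data_offset */
                                     0);             /* guard_seed */
    }
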
+
+void
+spdk_dif_ctx_set_data_offset(struct spdk_dif_ctx *ctx, uint32_t data_offset)
+{
+ uint32_t data_block_size;
+
+ if (ctx->md_interleave) {
+ data_block_size = ctx->block_size - ctx->md_size;
+ } else {
+ data_block_size = ctx->block_size;
+ }
+
+ ctx->data_offset = data_offset;
+ ctx->ref_tag_offset = data_offset / data_block_size;
+}
+
+void
+spdk_dif_ctx_set_remapped_init_ref_tag(struct spdk_dif_ctx *ctx,
+ uint32_t remapped_init_ref_tag)
+{
+ ctx->remapped_init_ref_tag = remapped_init_ref_tag;
+}
+
+static void
+_dif_generate(void *_dif, uint16_t guard, uint32_t offset_blocks,
+ const struct spdk_dif_ctx *ctx)
+{
+ struct spdk_dif *dif = _dif;
+ uint32_t ref_tag;
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ to_be16(&dif->guard, guard);
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) {
+ to_be16(&dif->app_tag, ctx->app_tag);
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) {
+ /* For type 1 and 2, the reference tag is incremented for each
+ * subsequent logical block. For type 3, the reference tag
+ * remains the same as the initial reference tag.
+ */
+ if (ctx->dif_type != SPDK_DIF_TYPE3) {
+ ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks;
+ } else {
+ ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset;
+ }
+
+ to_be32(&dif->ref_tag, ref_tag);
+ }
+}
+
+static void
+dif_generate(struct _dif_sgl *sgl, uint32_t num_blocks, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_blocks = 0;
+ void *buf;
+ uint16_t guard = 0;
+
+ while (offset_blocks < num_blocks) {
+ _dif_sgl_get_buf(sgl, &buf, NULL);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(ctx->guard_seed, buf, ctx->guard_interval);
+ }
+
+ _dif_generate(buf + ctx->guard_interval, guard, offset_blocks, ctx);
+
+ _dif_sgl_advance(sgl, ctx->block_size);
+ offset_blocks++;
+ }
+}
+
+static uint16_t
+_dif_generate_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len,
+ uint16_t guard, uint32_t offset_blocks, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_in_dif, buf_len;
+ void *buf;
+ struct spdk_dif dif = {};
+
+ assert(offset_in_block < ctx->guard_interval);
+ assert(offset_in_block + data_len < ctx->guard_interval ||
+ offset_in_block + data_len == ctx->block_size);
+
+ /* Compute CRC over split logical block data. */
+ while (data_len != 0 && offset_in_block < ctx->guard_interval) {
+ _dif_sgl_get_buf(sgl, &buf, &buf_len);
+ buf_len = spdk_min(buf_len, data_len);
+ buf_len = spdk_min(buf_len, ctx->guard_interval - offset_in_block);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(guard, buf, buf_len);
+ }
+
+ _dif_sgl_advance(sgl, buf_len);
+ offset_in_block += buf_len;
+ data_len -= buf_len;
+ }
+
+ if (offset_in_block < ctx->guard_interval) {
+ return guard;
+ }
+
+ /* If the data of a whole logical block has been parsed, generate the DIF
+ * and save it to the temporary DIF area.
+ */
+ _dif_generate(&dif, guard, offset_blocks, ctx);
+
+ /* Copy generated DIF field to the split DIF field, and then
+ * skip metadata field after DIF field (if any).
+ */
+ while (offset_in_block < ctx->block_size) {
+ _dif_sgl_get_buf(sgl, &buf, &buf_len);
+
+ if (offset_in_block < ctx->guard_interval + sizeof(struct spdk_dif)) {
+ offset_in_dif = offset_in_block - ctx->guard_interval;
+ buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset_in_dif);
+
+ memcpy(buf, ((uint8_t *)&dif) + offset_in_dif, buf_len);
+ } else {
+ buf_len = spdk_min(buf_len, ctx->block_size - offset_in_block);
+ }
+
+ _dif_sgl_advance(sgl, buf_len);
+ offset_in_block += buf_len;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->guard_seed;
+ }
+
+ return guard;
+}
+
+static void
+dif_generate_split(struct _dif_sgl *sgl, uint32_t num_blocks,
+ const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_blocks;
+ uint16_t guard = 0;
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->guard_seed;
+ }
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ _dif_generate_split(sgl, 0, ctx->block_size, guard, offset_blocks, ctx);
+ }
+}
+
+int
+spdk_dif_generate(struct iovec *iovs, int iovcnt, uint32_t num_blocks,
+ const struct spdk_dif_ctx *ctx)
+{
+ struct _dif_sgl sgl;
+
+ _dif_sgl_init(&sgl, iovs, iovcnt);
+
+ if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) {
+ SPDK_ERRLOG("Size of iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_is_disabled(ctx->dif_type)) {
+ return 0;
+ }
+
+ if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) {
+ dif_generate(&sgl, num_blocks, ctx);
+ } else {
+ dif_generate_split(&sgl, num_blocks, ctx);
+ }
+
+ return 0;
+}
+
+static void
+_dif_error_set(struct spdk_dif_error *err_blk, uint8_t err_type,
+ uint32_t expected, uint32_t actual, uint32_t err_offset)
+{
+ if (err_blk) {
+ err_blk->err_type = err_type;
+ err_blk->expected = expected;
+ err_blk->actual = actual;
+ err_blk->err_offset = err_offset;
+ }
+}
+
+static int
+_dif_verify(void *_dif, uint16_t guard, uint32_t offset_blocks,
+ const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk)
+{
+ struct spdk_dif *dif = _dif;
+ uint16_t _guard;
+ uint16_t _app_tag;
+ uint32_t ref_tag, _ref_tag;
+
+ switch (ctx->dif_type) {
+ case SPDK_DIF_TYPE1:
+ case SPDK_DIF_TYPE2:
+ /* If Type 1 or 2 is used, then all DIF checks are disabled when
+ * the Application Tag is 0xFFFF.
+ */
+ if (dif->app_tag == 0xFFFF) {
+ return 0;
+ }
+ break;
+ case SPDK_DIF_TYPE3:
+ /* If Type 3 is used, then all DIF checks are disabled when the
+ * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF.
+ */
+ if (dif->app_tag == 0xFFFF && dif->ref_tag == 0xFFFFFFFF) {
+ return 0;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* For type 1 and 2, the reference tag is incremented for each
+ * subsequent logical block. For type 3, the reference tag
+ * remains the same as the initial reference tag.
+ */
+ if (ctx->dif_type != SPDK_DIF_TYPE3) {
+ ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks;
+ } else {
+ ref_tag = ctx->init_ref_tag + ctx->ref_tag_offset;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ /* Compare the DIF Guard field to the CRC computed over the logical
+ * block data.
+ */
+ _guard = from_be16(&dif->guard);
+ if (_guard != guard) {
+ _dif_error_set(err_blk, SPDK_DIF_GUARD_ERROR, _guard, guard,
+ offset_blocks);
+ SPDK_ERRLOG("Failed to compare Guard: LBA=%" PRIu32 "," \
+ " Expected=%x, Actual=%x\n",
+ ref_tag, _guard, guard);
+ return -1;
+ }
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) {
+ /* Compare unmasked bits in the DIF Application Tag field to the
+ * passed Application Tag.
+ */
+ _app_tag = from_be16(&dif->app_tag);
+ if ((_app_tag & ctx->apptag_mask) != ctx->app_tag) {
+ _dif_error_set(err_blk, SPDK_DIF_APPTAG_ERROR, ctx->app_tag,
+ (_app_tag & ctx->apptag_mask), offset_blocks);
+ SPDK_ERRLOG("Failed to compare App Tag: LBA=%" PRIu32 "," \
+ " Expected=%x, Actual=%x\n",
+ ref_tag, ctx->app_tag, (_app_tag & ctx->apptag_mask));
+ return -1;
+ }
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) {
+ switch (ctx->dif_type) {
+ case SPDK_DIF_TYPE1:
+ case SPDK_DIF_TYPE2:
+ /* Compare the DIF Reference Tag field to the passed Reference Tag.
+ * The passed Reference Tag will be the least significant 4 bytes
+ * of the LBA when Type 1 is used, and an application-specific value
+ * if Type 2 is used.
+ */
+ _ref_tag = from_be32(&dif->ref_tag);
+ if (_ref_tag != ref_tag) {
+ _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, ref_tag,
+ _ref_tag, offset_blocks);
+ SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \
+ " Expected=%x, Actual=%x\n",
+ ref_tag, ref_tag, _ref_tag);
+ return -1;
+ }
+ break;
+ case SPDK_DIF_TYPE3:
+ /* For Type 3, computed Reference Tag remains unchanged.
+ * Hence ignore the Reference Tag field.
+ */
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int
+dif_verify(struct _dif_sgl *sgl, uint32_t num_blocks,
+ const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_blocks = 0;
+ int rc;
+ void *buf;
+ uint16_t guard = 0;
+
+ while (offset_blocks < num_blocks) {
+ _dif_sgl_get_buf(sgl, &buf, NULL);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(ctx->guard_seed, buf, ctx->guard_interval);
+ }
+
+ rc = _dif_verify(buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+
+ _dif_sgl_advance(sgl, ctx->block_size);
+ offset_blocks++;
+ }
+
+ return 0;
+}
+
+static int
+_dif_verify_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len,
+ uint16_t *_guard, uint32_t offset_blocks,
+ const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_in_dif, buf_len;
+ void *buf;
+ uint16_t guard;
+ struct spdk_dif dif = {};
+ int rc;
+
+ assert(_guard != NULL);
+ assert(offset_in_block < ctx->guard_interval);
+ assert(offset_in_block + data_len < ctx->guard_interval ||
+ offset_in_block + data_len == ctx->block_size);
+
+ guard = *_guard;
+
+ /* Compute CRC over split logical block data. */
+ while (data_len != 0 && offset_in_block < ctx->guard_interval) {
+ _dif_sgl_get_buf(sgl, &buf, &buf_len);
+ buf_len = spdk_min(buf_len, data_len);
+ buf_len = spdk_min(buf_len, ctx->guard_interval - offset_in_block);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(guard, buf, buf_len);
+ }
+
+ _dif_sgl_advance(sgl, buf_len);
+ offset_in_block += buf_len;
+ data_len -= buf_len;
+ }
+
+ if (offset_in_block < ctx->guard_interval) {
+ *_guard = guard;
+ return 0;
+ }
+
+ /* Copy the split DIF field to the temporary DIF buffer, and then
+ * skip metadata field after DIF field (if any). */
+ while (offset_in_block < ctx->block_size) {
+ _dif_sgl_get_buf(sgl, &buf, &buf_len);
+
+ if (offset_in_block < ctx->guard_interval + sizeof(struct spdk_dif)) {
+ offset_in_dif = offset_in_block - ctx->guard_interval;
+ buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset_in_dif);
+
+ memcpy((uint8_t *)&dif + offset_in_dif, buf, buf_len);
+ } else {
+ buf_len = spdk_min(buf_len, ctx->block_size - offset_in_block);
+ }
+ _dif_sgl_advance(sgl, buf_len);
+ offset_in_block += buf_len;
+ }
+
+ rc = _dif_verify(&dif, guard, offset_blocks, ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->guard_seed;
+ }
+
+ *_guard = guard;
+ return 0;
+}
+
+static int
+dif_verify_split(struct _dif_sgl *sgl, uint32_t num_blocks,
+ const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_blocks;
+ uint16_t guard = 0;
+ int rc;
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->guard_seed;
+ }
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ rc = _dif_verify_split(sgl, 0, ctx->block_size, &guard, offset_blocks,
+ ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_dif_verify(struct iovec *iovs, int iovcnt, uint32_t num_blocks,
+ const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk)
+{
+ struct _dif_sgl sgl;
+
+ _dif_sgl_init(&sgl, iovs, iovcnt);
+
+ if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) {
+ SPDK_ERRLOG("Size of iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_is_disabled(ctx->dif_type)) {
+ return 0;
+ }
+
+ if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) {
+ return dif_verify(&sgl, num_blocks, ctx, err_blk);
+ } else {
+ return dif_verify_split(&sgl, num_blocks, ctx, err_blk);
+ }
+}
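
A hedged round-trip sketch of the two entry points above: generate DIF into an interleaved buffer and then verify it with the same context. The 520-byte block size and the wrapper name are illustrative and assume a context like the earlier spdk_dif_ctx_init() sketch:

    #include <stdint.h>
    #include <sys/uio.h>
    #include "spdk/dif.h"

    /* Sketch: protect and re-check num_blocks interleaved 520-byte blocks
     * held in one contiguous buffer. */
    static int
    example_dif_round_trip(uint8_t *buf, uint32_t num_blocks,
                           const struct spdk_dif_ctx *ctx)
    {
            struct iovec iov = { .iov_base = buf,
                                 .iov_len = (size_t)520 * num_blocks };
            struct spdk_dif_error err = {};
            int rc;

            rc = spdk_dif_generate(&iov, 1, num_blocks, ctx);
            if (rc != 0) {
                    return rc;
            }
            return spdk_dif_verify(&iov, 1, num_blocks, ctx, &err);
    }
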
+
+static uint32_t
+dif_update_crc32c(struct _dif_sgl *sgl, uint32_t num_blocks,
+ uint32_t crc32c, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_blocks;
+ void *buf;
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ _dif_sgl_get_buf(sgl, &buf, NULL);
+
+ crc32c = spdk_crc32c_update(buf, ctx->block_size - ctx->md_size, crc32c);
+
+ _dif_sgl_advance(sgl, ctx->block_size);
+ }
+
+ return crc32c;
+}
+
+static uint32_t
+_dif_update_crc32c_split(struct _dif_sgl *sgl, uint32_t offset_in_block, uint32_t data_len,
+ uint32_t crc32c, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t data_block_size, buf_len;
+ void *buf;
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ assert(offset_in_block + data_len <= ctx->block_size);
+
+ while (data_len != 0) {
+ _dif_sgl_get_buf(sgl, &buf, &buf_len);
+ buf_len = spdk_min(buf_len, data_len);
+
+ if (offset_in_block < data_block_size) {
+ buf_len = spdk_min(buf_len, data_block_size - offset_in_block);
+ crc32c = spdk_crc32c_update(buf, buf_len, crc32c);
+ }
+
+ _dif_sgl_advance(sgl, buf_len);
+ offset_in_block += buf_len;
+ data_len -= buf_len;
+ }
+
+ return crc32c;
+}
+
+static uint32_t
+dif_update_crc32c_split(struct _dif_sgl *sgl, uint32_t num_blocks,
+ uint32_t crc32c, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_blocks;
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ crc32c = _dif_update_crc32c_split(sgl, 0, ctx->block_size, crc32c, ctx);
+ }
+
+ return crc32c;
+}
+
+int
+spdk_dif_update_crc32c(struct iovec *iovs, int iovcnt, uint32_t num_blocks,
+ uint32_t *_crc32c, const struct spdk_dif_ctx *ctx)
+{
+ struct _dif_sgl sgl;
+
+ if (_crc32c == NULL) {
+ return -EINVAL;
+ }
+
+ _dif_sgl_init(&sgl, iovs, iovcnt);
+
+ if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) {
+ SPDK_ERRLOG("Size of iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_sgl_is_bytes_multiple(&sgl, ctx->block_size)) {
+ *_crc32c = dif_update_crc32c(&sgl, num_blocks, *_crc32c, ctx);
+ } else {
+ *_crc32c = dif_update_crc32c_split(&sgl, num_blocks, *_crc32c, ctx);
+ }
+
+ return 0;
+}
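
spdk_dif_update_crc32c() folds only the data portion of each interleaved block into a running CRC-32C and skips the per-block metadata. A hedged usage sketch (wrapper name illustrative), again assuming the caller applies the usual all-ones seed and final complement:

    #include <stdint.h>
    #include <sys/uio.h>
    #include "spdk/dif.h"

    /* Sketch: CRC-32C over the data bytes only of an interleaved buffer. */
    static int
    example_data_crc32c(struct iovec *iovs, int iovcnt, uint32_t num_blocks,
                        const struct spdk_dif_ctx *ctx, uint32_t *out)
    {
            uint32_t crc = ~0U;
            int rc;

            rc = spdk_dif_update_crc32c(iovs, iovcnt, num_blocks, &crc, ctx);
            if (rc != 0) {
                    return rc;
            }
            *out = crc ^ ~0U;
            return 0;
    }
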
+
+static void
+dif_generate_copy(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_blocks = 0, data_block_size;
+ void *src, *dst;
+ uint16_t guard;
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ while (offset_blocks < num_blocks) {
+ _dif_sgl_get_buf(src_sgl, &src, NULL);
+ _dif_sgl_get_buf(dst_sgl, &dst, NULL);
+
+ guard = 0;
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif_copy(ctx->guard_seed, dst, src, data_block_size);
+ guard = spdk_crc16_t10dif(guard, dst + data_block_size,
+ ctx->guard_interval - data_block_size);
+ } else {
+ memcpy(dst, src, data_block_size);
+ }
+
+ _dif_generate(dst + ctx->guard_interval, guard, offset_blocks, ctx);
+
+ _dif_sgl_advance(src_sgl, data_block_size);
+ _dif_sgl_advance(dst_sgl, ctx->block_size);
+ offset_blocks++;
+ }
+}
+
+static void
+_dif_generate_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl,
+ uint32_t offset_blocks, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_in_block, src_len, data_block_size;
+ uint16_t guard = 0;
+ void *src, *dst;
+
+ _dif_sgl_get_buf(dst_sgl, &dst, NULL);
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->guard_seed;
+ }
+ offset_in_block = 0;
+
+ while (offset_in_block < data_block_size) {
+ /* Compute CRC over split logical block data and copy
+ * data to bounce buffer.
+ */
+ _dif_sgl_get_buf(src_sgl, &src, &src_len);
+ src_len = spdk_min(src_len, data_block_size - offset_in_block);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif_copy(guard, dst + offset_in_block,
+ src, src_len);
+ } else {
+ memcpy(dst + offset_in_block, src, src_len);
+ }
+
+ _dif_sgl_advance(src_sgl, src_len);
+ offset_in_block += src_len;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(guard, dst + data_block_size,
+ ctx->guard_interval - data_block_size);
+ }
+
+ _dif_sgl_advance(dst_sgl, ctx->block_size);
+
+ _dif_generate(dst + ctx->guard_interval, guard, offset_blocks, ctx);
+}
+
+static void
+dif_generate_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_blocks;
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ _dif_generate_copy_split(src_sgl, dst_sgl, offset_blocks, ctx);
+ }
+}
+
+int
+spdk_dif_generate_copy(struct iovec *iovs, int iovcnt, struct iovec *bounce_iov,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx)
+{
+ struct _dif_sgl src_sgl, dst_sgl;
+ uint32_t data_block_size;
+
+ _dif_sgl_init(&src_sgl, iovs, iovcnt);
+ _dif_sgl_init(&dst_sgl, bounce_iov, 1);
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ if (!_dif_sgl_is_valid(&src_sgl, data_block_size * num_blocks) ||
+ !_dif_sgl_is_valid(&dst_sgl, ctx->block_size * num_blocks)) {
+ SPDK_ERRLOG("Sizes of the iovec arrays are not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_is_disabled(ctx->dif_type)) {
+ return 0;
+ }
+
+ if (_dif_sgl_is_bytes_multiple(&src_sgl, data_block_size)) {
+ dif_generate_copy(&src_sgl, &dst_sgl, num_blocks, ctx);
+ } else {
+ dif_generate_copy_split(&src_sgl, &dst_sgl, num_blocks, ctx);
+ }
+
+ return 0;
+}
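
spdk_dif_generate_copy() is the write-path helper: it copies data-only iovecs into a single interleaved bounce buffer and inserts the DIF while copying (via spdk_crc16_t10dif_copy() when the guard check is enabled). A hedged sketch with illustrative 512/520 sizes and wrapper name; the bounce buffer is assumed to hold at least 520 bytes per block:

    #include <stdint.h>
    #include <sys/uio.h>
    #include "spdk/dif.h"

    /* Sketch: copy num_blocks x 512 B of payload into an interleaved
     * 520 B-per-block bounce buffer, generating DIF during the copy. */
    static int
    example_generate_copy(struct iovec *data_iovs, int iovcnt,
                          uint8_t *bounce, uint32_t num_blocks,
                          const struct spdk_dif_ctx *ctx)
    {
            struct iovec bounce_iov = {
                    .iov_base = bounce,
                    .iov_len = (size_t)520 * num_blocks,
            };

            return spdk_dif_generate_copy(data_iovs, iovcnt, &bounce_iov,
                                          num_blocks, ctx);
    }
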
+
+static int
+dif_verify_copy(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_blocks = 0, data_block_size;
+ void *src, *dst;
+ int rc;
+ uint16_t guard;
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ while (offset_blocks < num_blocks) {
+ _dif_sgl_get_buf(src_sgl, &src, NULL);
+ _dif_sgl_get_buf(dst_sgl, &dst, NULL);
+
+ guard = 0;
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif_copy(ctx->guard_seed, dst, src, data_block_size);
+ guard = spdk_crc16_t10dif(guard, src + data_block_size,
+ ctx->guard_interval - data_block_size);
+ } else {
+ memcpy(dst, src, data_block_size);
+ }
+
+ rc = _dif_verify(src + ctx->guard_interval, guard, offset_blocks, ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+
+ _dif_sgl_advance(src_sgl, ctx->block_size);
+ _dif_sgl_advance(dst_sgl, data_block_size);
+ offset_blocks++;
+ }
+
+ return 0;
+}
+
+static int
+_dif_verify_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl,
+ uint32_t offset_blocks, const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_in_block, dst_len, data_block_size;
+ uint16_t guard = 0;
+ void *src, *dst;
+
+ _dif_sgl_get_buf(src_sgl, &src, NULL);
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->guard_seed;
+ }
+ offset_in_block = 0;
+
+ while (offset_in_block < data_block_size) {
+ /* Compute CRC over split logical block data and copy
+ * data to bounce buffer.
+ */
+ _dif_sgl_get_buf(dst_sgl, &dst, &dst_len);
+ dst_len = spdk_min(dst_len, data_block_size - offset_in_block);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif_copy(guard, dst,
+ src + offset_in_block, dst_len);
+ } else {
+ memcpy(dst, src + offset_in_block, dst_len);
+ }
+
+ _dif_sgl_advance(dst_sgl, dst_len);
+ offset_in_block += dst_len;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(guard, src + data_block_size,
+ ctx->guard_interval - data_block_size);
+ }
+
+ _dif_sgl_advance(src_sgl, ctx->block_size);
+
+ return _dif_verify(src + ctx->guard_interval, guard, offset_blocks, ctx, err_blk);
+}
+
+static int
+dif_verify_copy_split(struct _dif_sgl *src_sgl, struct _dif_sgl *dst_sgl,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_blocks;
+ int rc;
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ rc = _dif_verify_copy_split(src_sgl, dst_sgl, offset_blocks, ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_dif_verify_copy(struct iovec *iovs, int iovcnt, struct iovec *bounce_iov,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ struct _dif_sgl src_sgl, dst_sgl;
+ uint32_t data_block_size;
+
+ _dif_sgl_init(&src_sgl, bounce_iov, 1);
+ _dif_sgl_init(&dst_sgl, iovs, iovcnt);
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ if (!_dif_sgl_is_valid(&dst_sgl, data_block_size * num_blocks) ||
+ !_dif_sgl_is_valid(&src_sgl, ctx->block_size * num_blocks)) {
+ SPDK_ERRLOG("Sizes of the iovec arrays are not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_is_disabled(ctx->dif_type)) {
+ return 0;
+ }
+
+ if (_dif_sgl_is_bytes_multiple(&dst_sgl, data_block_size)) {
+ return dif_verify_copy(&src_sgl, &dst_sgl, num_blocks, ctx, err_blk);
+ } else {
+ return dif_verify_copy_split(&src_sgl, &dst_sgl, num_blocks, ctx, err_blk);
+ }
+}
+
+static void
+_bit_flip(uint8_t *buf, uint32_t flip_bit)
+{
+ uint8_t byte;
+
+ byte = *buf;
+ byte ^= 1 << flip_bit;
+ *buf = byte;
+}
+
+static int
+_dif_inject_error(struct _dif_sgl *sgl,
+ uint32_t block_size, uint32_t num_blocks,
+ uint32_t inject_offset_blocks,
+ uint32_t inject_offset_bytes,
+ uint32_t inject_offset_bits)
+{
+ uint32_t offset_in_block, buf_len;
+ void *buf;
+
+ _dif_sgl_advance(sgl, block_size * inject_offset_blocks);
+
+ offset_in_block = 0;
+
+ while (offset_in_block < block_size) {
+ _dif_sgl_get_buf(sgl, &buf, &buf_len);
+ buf_len = spdk_min(buf_len, block_size - offset_in_block);
+
+ if (inject_offset_bytes >= offset_in_block &&
+ inject_offset_bytes < offset_in_block + buf_len) {
+ buf += inject_offset_bytes - offset_in_block;
+ _bit_flip(buf, inject_offset_bits);
+ return 0;
+ }
+
+ _dif_sgl_advance(sgl, buf_len);
+ offset_in_block += buf_len;
+ }
+
+ return -1;
+}
+
+static int
+dif_inject_error(struct _dif_sgl *sgl, uint32_t block_size, uint32_t num_blocks,
+ uint32_t start_inject_bytes, uint32_t inject_range_bytes,
+ uint32_t *inject_offset)
+{
+ uint32_t inject_offset_blocks, inject_offset_bytes, inject_offset_bits;
+ uint32_t offset_blocks;
+ int rc;
+
+ srand(time(0));
+
+ inject_offset_blocks = rand() % num_blocks;
+ inject_offset_bytes = start_inject_bytes + (rand() % inject_range_bytes);
+ inject_offset_bits = rand() % 8;
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ if (offset_blocks == inject_offset_blocks) {
+ rc = _dif_inject_error(sgl, block_size, num_blocks,
+ inject_offset_blocks,
+ inject_offset_bytes,
+ inject_offset_bits);
+ if (rc == 0) {
+ *inject_offset = inject_offset_blocks;
+ }
+ return rc;
+ }
+ }
+
+ return -1;
+}
+
+#define _member_size(type, member) sizeof(((type *)0)->member)
+
+int
+spdk_dif_inject_error(struct iovec *iovs, int iovcnt, uint32_t num_blocks,
+ const struct spdk_dif_ctx *ctx, uint32_t inject_flags,
+ uint32_t *inject_offset)
+{
+ struct _dif_sgl sgl;
+ int rc;
+
+ _dif_sgl_init(&sgl, iovs, iovcnt);
+
+ if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) {
+ SPDK_ERRLOG("Size of iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (inject_flags & SPDK_DIF_REFTAG_ERROR) {
+ rc = dif_inject_error(&sgl, ctx->block_size, num_blocks,
+ ctx->guard_interval + offsetof(struct spdk_dif, ref_tag),
+ _member_size(struct spdk_dif, ref_tag),
+ inject_offset);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to inject error to Reference Tag.\n");
+ return rc;
+ }
+ }
+
+ if (inject_flags & SPDK_DIF_APPTAG_ERROR) {
+ rc = dif_inject_error(&sgl, ctx->block_size, num_blocks,
+ ctx->guard_interval + offsetof(struct spdk_dif, app_tag),
+ _member_size(struct spdk_dif, app_tag),
+ inject_offset);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to inject error to Application Tag.\n");
+ return rc;
+ }
+ }
+ if (inject_flags & SPDK_DIF_GUARD_ERROR) {
+ rc = dif_inject_error(&sgl, ctx->block_size, num_blocks,
+ ctx->guard_interval,
+ _member_size(struct spdk_dif, guard),
+ inject_offset);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to inject error to Guard.\n");
+ return rc;
+ }
+ }
+
+ if (inject_flags & SPDK_DIF_DATA_ERROR) {
+ /* If the DIF information is contained within the last 8 bytes of
+ * metadata, then the CRC covers all metadata bytes up to but excluding
+ * the last 8 bytes. However, error injection does not cover this
+ * metadata region because its classification has not been determined yet.
+ *
+ * Note: an error injected into the data block is expected to be
+ * detected as a guard error.
+ */
+ rc = dif_inject_error(&sgl, ctx->block_size, num_blocks,
+ 0,
+ ctx->block_size - ctx->md_size,
+ inject_offset);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to inject error to data block.\n");
+ return rc;
+ }
+ }
+
+ return 0;
+}
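
spdk_dif_inject_error() is aimed at test code: for each requested flag it flips one random bit inside the corresponding field of one randomly chosen block. A hedged test sketch (assuming, as in the code above, that the same SPDK_DIF_GUARD_ERROR value is reported back in err_type):

    #include <assert.h>
    #include <sys/uio.h>
    #include "spdk/dif.h"

    /* Sketch (test-only): corrupt the guard of one random block and expect
     * spdk_dif_verify() to report a guard error. */
    static void
    example_inject_guard_error(struct iovec *iovs, int iovcnt,
                               uint32_t num_blocks, const struct spdk_dif_ctx *ctx)
    {
            struct spdk_dif_error err = {};
            uint32_t bad_block = 0;
            int rc;

            rc = spdk_dif_inject_error(iovs, iovcnt, num_blocks, ctx,
                                       SPDK_DIF_GUARD_ERROR, &bad_block);
            assert(rc == 0);

            rc = spdk_dif_verify(iovs, iovcnt, num_blocks, ctx, &err);
            assert(rc != 0 && err.err_type == SPDK_DIF_GUARD_ERROR);
    }
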
+
+static void
+dix_generate(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_blocks = 0;
+ uint16_t guard;
+ void *data_buf, *md_buf;
+
+ while (offset_blocks < num_blocks) {
+ _dif_sgl_get_buf(data_sgl, &data_buf, NULL);
+ _dif_sgl_get_buf(md_sgl, &md_buf, NULL);
+
+ guard = 0;
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(ctx->guard_seed, data_buf, ctx->block_size);
+ guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval);
+ }
+
+ _dif_generate(md_buf + ctx->guard_interval, guard, offset_blocks, ctx);
+
+ _dif_sgl_advance(data_sgl, ctx->block_size);
+ _dif_sgl_advance(md_sgl, ctx->md_size);
+ offset_blocks++;
+ }
+}
+
+static void
+_dix_generate_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl,
+ uint32_t offset_blocks, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_in_block, data_buf_len;
+ uint16_t guard = 0;
+ void *data_buf, *md_buf;
+
+ _dif_sgl_get_buf(md_sgl, &md_buf, NULL);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->guard_seed;
+ }
+ offset_in_block = 0;
+
+ while (offset_in_block < ctx->block_size) {
+ _dif_sgl_get_buf(data_sgl, &data_buf, &data_buf_len);
+ data_buf_len = spdk_min(data_buf_len, ctx->block_size - offset_in_block);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(guard, data_buf, data_buf_len);
+ }
+
+ _dif_sgl_advance(data_sgl, data_buf_len);
+ offset_in_block += data_buf_len;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval);
+ }
+
+ _dif_sgl_advance(md_sgl, ctx->md_size);
+
+ _dif_generate(md_buf + ctx->guard_interval, guard, offset_blocks, ctx);
+}
+
+static void
+dix_generate_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t offset_blocks;
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ _dix_generate_split(data_sgl, md_sgl, offset_blocks, ctx);
+ }
+}
+
+int
+spdk_dix_generate(struct iovec *iovs, int iovcnt, struct iovec *md_iov,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx)
+{
+ struct _dif_sgl data_sgl, md_sgl;
+
+ _dif_sgl_init(&data_sgl, iovs, iovcnt);
+ _dif_sgl_init(&md_sgl, md_iov, 1);
+
+ if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) ||
+ !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) {
+ SPDK_ERRLOG("Size of iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_is_disabled(ctx->dif_type)) {
+ return 0;
+ }
+
+ if (_dif_sgl_is_bytes_multiple(&data_sgl, ctx->block_size)) {
+ dix_generate(&data_sgl, &md_sgl, num_blocks, ctx);
+ } else {
+ dix_generate_split(&data_sgl, &md_sgl, num_blocks, ctx);
+ }
+
+ return 0;
+}
+
+static int
+dix_verify(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_blocks = 0;
+ uint16_t guard;
+ void *data_buf, *md_buf;
+ int rc;
+
+ while (offset_blocks < num_blocks) {
+ _dif_sgl_get_buf(data_sgl, &data_buf, NULL);
+ _dif_sgl_get_buf(md_sgl, &md_buf, NULL);
+
+ guard = 0;
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(ctx->guard_seed, data_buf, ctx->block_size);
+ guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval);
+ }
+
+ rc = _dif_verify(md_buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+
+ _dif_sgl_advance(data_sgl, ctx->block_size);
+ _dif_sgl_advance(md_sgl, ctx->md_size);
+ offset_blocks++;
+ }
+
+ return 0;
+}
+
+static int
+_dix_verify_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl,
+ uint32_t offset_blocks, const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_in_block, data_buf_len;
+ uint16_t guard = 0;
+ void *data_buf, *md_buf;
+
+ _dif_sgl_get_buf(md_sgl, &md_buf, NULL);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->guard_seed;
+ }
+ offset_in_block = 0;
+
+ while (offset_in_block < ctx->block_size) {
+ _dif_sgl_get_buf(data_sgl, &data_buf, &data_buf_len);
+ data_buf_len = spdk_min(data_buf_len, ctx->block_size - offset_in_block);
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(guard, data_buf, data_buf_len);
+ }
+
+ _dif_sgl_advance(data_sgl, data_buf_len);
+ offset_in_block += data_buf_len;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = spdk_crc16_t10dif(guard, md_buf, ctx->guard_interval);
+ }
+
+ _dif_sgl_advance(md_sgl, ctx->md_size);
+
+ return _dif_verify(md_buf + ctx->guard_interval, guard, offset_blocks, ctx, err_blk);
+}
+
+static int
+dix_verify_split(struct _dif_sgl *data_sgl, struct _dif_sgl *md_sgl,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ uint32_t offset_blocks;
+ int rc;
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ rc = _dix_verify_split(data_sgl, md_sgl, offset_blocks, ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_dix_verify(struct iovec *iovs, int iovcnt, struct iovec *md_iov,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ struct _dif_sgl data_sgl, md_sgl;
+
+ _dif_sgl_init(&data_sgl, iovs, iovcnt);
+ _dif_sgl_init(&md_sgl, md_iov, 1);
+
+ if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) ||
+ !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) {
+ SPDK_ERRLOG("Size of iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_is_disabled(ctx->dif_type)) {
+ return 0;
+ }
+
+ if (_dif_sgl_is_bytes_multiple(&data_sgl, ctx->block_size)) {
+ return dix_verify(&data_sgl, &md_sgl, num_blocks, ctx, err_blk);
+ } else {
+ return dix_verify_split(&data_sgl, &md_sgl, num_blocks, ctx, err_blk);
+ }
+}
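
A minimal round-trip sketch for the two DIX entry points above; the spdk_dif_ctx is assumed to have been initialized elsewhere with spdk_dif_ctx_init() for separate metadata (illustrative sizes: 4096-byte data blocks, 8 bytes of metadata per block, 8 blocks):

    #include "spdk/dif.h"

    static int
    dix_round_trip(const struct spdk_dif_ctx *ctx, void *data, void *md)
    {
        /* One contiguous data buffer and one separate metadata buffer. */
        struct iovec iov = { .iov_base = data, .iov_len = 8 * 4096 };
        struct iovec md_iov = { .iov_base = md, .iov_len = 8 * 8 };
        struct spdk_dif_error err = { 0 };
        int rc;

        rc = spdk_dix_generate(&iov, 1, &md_iov, 8, ctx);
        if (rc != 0) {
            return rc;
        }

        /* Returns 0 unless the data or metadata was corrupted in between. */
        return spdk_dix_verify(&iov, 1, &md_iov, 8, ctx, &err);
    }
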
+
+int
+spdk_dix_inject_error(struct iovec *iovs, int iovcnt, struct iovec *md_iov,
+ uint32_t num_blocks, const struct spdk_dif_ctx *ctx,
+ uint32_t inject_flags, uint32_t *inject_offset)
+{
+ struct _dif_sgl data_sgl, md_sgl;
+ int rc;
+
+ _dif_sgl_init(&data_sgl, iovs, iovcnt);
+ _dif_sgl_init(&md_sgl, md_iov, 1);
+
+ if (!_dif_sgl_is_valid(&data_sgl, ctx->block_size * num_blocks) ||
+ !_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) {
+ SPDK_ERRLOG("Size of iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (inject_flags & SPDK_DIF_REFTAG_ERROR) {
+ rc = dif_inject_error(&md_sgl, ctx->md_size, num_blocks,
+ ctx->guard_interval + offsetof(struct spdk_dif, ref_tag),
+ _member_size(struct spdk_dif, ref_tag),
+ inject_offset);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to inject error to Reference Tag.\n");
+ return rc;
+ }
+ }
+
+ if (inject_flags & SPDK_DIF_APPTAG_ERROR) {
+ rc = dif_inject_error(&md_sgl, ctx->md_size, num_blocks,
+ ctx->guard_interval + offsetof(struct spdk_dif, app_tag),
+ _member_size(struct spdk_dif, app_tag),
+ inject_offset);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to inject error to Application Tag.\n");
+ return rc;
+ }
+ }
+
+ if (inject_flags & SPDK_DIF_GUARD_ERROR) {
+ rc = dif_inject_error(&md_sgl, ctx->md_size, num_blocks,
+ ctx->guard_interval,
+ _member_size(struct spdk_dif, guard),
+ inject_offset);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to inject error to Guard.\n");
+ return rc;
+ }
+ }
+
+ if (inject_flags & SPDK_DIF_DATA_ERROR) {
+ /* Note: Error injection to data block is expected to be detected
+ * as guard error.
+ */
+ rc = dif_inject_error(&data_sgl, ctx->block_size, num_blocks,
+ 0,
+ ctx->block_size,
+ inject_offset);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to inject error to data block.\n");
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static uint32_t
+_to_next_boundary(uint32_t offset, uint32_t boundary)
+{
+ return boundary - (offset % boundary);
+}
+
+static uint32_t
+_to_size_with_md(uint32_t size, uint32_t data_block_size, uint32_t block_size)
+{
+ return (size / data_block_size) * block_size + (size % data_block_size);
+}
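
To make the arithmetic in these two static helpers concrete, here is a quick sanity check that restates the same formulas (values are illustrative: 4096-byte data blocks extended to 4104 bytes by 8 bytes of interleaved metadata):

    #include <assert.h>
    #include <stdint.h>

    static void
    md_size_arithmetic_example(void)
    {
        /* Two full 4096B data blocks expand to two 4104B extended blocks: 8208 bytes. */
        assert((8192u / 4096u) * 4104u + (8192u % 4096u) == 8208u);

        /* From data offset 5000, the next 4096B block boundary is 3192 bytes away. */
        assert(4096u - (5000u % 4096u) == 3192u);
    }
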
+
+int
+spdk_dif_set_md_interleave_iovs(struct iovec *iovs, int iovcnt,
+ struct iovec *buf_iovs, int buf_iovcnt,
+ uint32_t data_offset, uint32_t data_len,
+ uint32_t *_mapped_len,
+ const struct spdk_dif_ctx *ctx)
+{
+ uint32_t data_block_size, data_unalign, buf_len, buf_offset, len;
+ struct _dif_sgl dif_sgl;
+ struct _dif_sgl buf_sgl;
+
+ if (iovs == NULL || iovcnt == 0 || buf_iovs == NULL || buf_iovcnt == 0) {
+ return -EINVAL;
+ }
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ data_unalign = ctx->data_offset % data_block_size;
+
+ buf_len = _to_size_with_md(data_unalign + data_offset + data_len, data_block_size,
+ ctx->block_size);
+ buf_len -= data_unalign;
+
+ _dif_sgl_init(&dif_sgl, iovs, iovcnt);
+ _dif_sgl_init(&buf_sgl, buf_iovs, buf_iovcnt);
+
+ if (!_dif_sgl_is_valid(&buf_sgl, buf_len)) {
+ SPDK_ERRLOG("Buffer overflow will occur.\n");
+ return -ERANGE;
+ }
+
+ buf_offset = _to_size_with_md(data_unalign + data_offset, data_block_size, ctx->block_size);
+ buf_offset -= data_unalign;
+
+ _dif_sgl_advance(&buf_sgl, buf_offset);
+
+ while (data_len != 0) {
+ len = spdk_min(data_len, _to_next_boundary(ctx->data_offset + data_offset, data_block_size));
+ if (!_dif_sgl_append_split(&dif_sgl, &buf_sgl, len)) {
+ break;
+ }
+ _dif_sgl_advance(&buf_sgl, ctx->md_size);
+ data_offset += len;
+ data_len -= len;
+ }
+
+ if (_mapped_len != NULL) {
+ *_mapped_len = dif_sgl.total_size;
+ }
+
+ return iovcnt - dif_sgl.iovcnt;
+}
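
A usage sketch, assuming ctx describes interleaved metadata (4096-byte data blocks plus 8 bytes of metadata) and ctx->data_offset is 0: build an iovec view that exposes only the data regions of an extended buffer, so plain payload bytes can be copied in without overwriting the metadata gaps.

    #include "spdk/dif.h"

    static int
    map_data_view(const struct spdk_dif_ctx *ctx, void *ext_buf)
    {
        /* One contiguous extended buffer holding 4 blocks of 4096B data + 8B metadata. */
        struct iovec buf_iov = { .iov_base = ext_buf, .iov_len = 4 * (4096 + 8) };
        struct iovec data_iovs[4];
        uint32_t mapped_len = 0;
        int iovcnt;

        iovcnt = spdk_dif_set_md_interleave_iovs(data_iovs, 4, &buf_iov, 1,
                                                 0, 4 * 4096, &mapped_len, ctx);
        if (iovcnt < 0) {
            return iovcnt;
        }

        /* data_iovs[0..iovcnt-1] now point at the data regions only;
         * mapped_len reports how many data bytes were mapped (ideally 4 * 4096).
         */
        return 0;
    }
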
+
+static int
+_dif_sgl_setup_stream(struct _dif_sgl *sgl, uint32_t *_buf_offset, uint32_t *_buf_len,
+ uint32_t data_offset, uint32_t data_len,
+ const struct spdk_dif_ctx *ctx)
+{
+ uint32_t data_block_size, data_unalign, buf_len, buf_offset;
+
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ data_unalign = ctx->data_offset % data_block_size;
+
+ /* If the last data block is complete, DIF of the data block is
+ * inserted or verified in this turn.
+ */
+ buf_len = _to_size_with_md(data_unalign + data_offset + data_len, data_block_size,
+ ctx->block_size);
+ buf_len -= data_unalign;
+
+ if (!_dif_sgl_is_valid(sgl, buf_len)) {
+ return -ERANGE;
+ }
+
+ buf_offset = _to_size_with_md(data_unalign + data_offset, data_block_size, ctx->block_size);
+ buf_offset -= data_unalign;
+
+ _dif_sgl_advance(sgl, buf_offset);
+ buf_len -= buf_offset;
+
+ buf_offset += data_unalign;
+
+ *_buf_offset = buf_offset;
+ *_buf_len = buf_len;
+
+ return 0;
+}
+
+int
+spdk_dif_generate_stream(struct iovec *iovs, int iovcnt,
+ uint32_t data_offset, uint32_t data_len,
+ struct spdk_dif_ctx *ctx)
+{
+ uint32_t buf_len = 0, buf_offset = 0;
+ uint32_t len, offset_in_block, offset_blocks;
+ uint16_t guard = 0;
+ struct _dif_sgl sgl;
+ int rc;
+
+ if (iovs == NULL || iovcnt == 0) {
+ return -EINVAL;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->last_guard;
+ }
+
+ _dif_sgl_init(&sgl, iovs, iovcnt);
+
+ rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx);
+ if (rc != 0) {
+ return rc;
+ }
+
+ while (buf_len != 0) {
+ len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size));
+ offset_in_block = buf_offset % ctx->block_size;
+ offset_blocks = buf_offset / ctx->block_size;
+
+ guard = _dif_generate_split(&sgl, offset_in_block, len, guard, offset_blocks, ctx);
+
+ buf_len -= len;
+ buf_offset += len;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ ctx->last_guard = guard;
+ }
+
+ return 0;
+}
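
A minimal streaming sketch, assuming ctx was set up for interleaved metadata (4096-byte data blocks), ctx->data_offset starts at 0, and iovs describes the full extended buffer (data plus interleaved metadata). The running guard is carried in ctx->last_guard between calls, so the two partial writes below protect the block exactly as one full-sized call would.

    #include "spdk/dif.h"

    static int
    stream_in_two_chunks(struct iovec *iovs, int iovcnt, struct spdk_dif_ctx *ctx)
    {
        int rc;

        /* First half of one 4096-byte data block... */
        rc = spdk_dif_generate_stream(iovs, iovcnt, 0, 2048, ctx);
        if (rc != 0) {
            return rc;
        }

        /* ...then the second half; the DIF field is written once the block completes. */
        return spdk_dif_generate_stream(iovs, iovcnt, 2048, 2048, ctx);
    }
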
+
+int
+spdk_dif_verify_stream(struct iovec *iovs, int iovcnt,
+ uint32_t data_offset, uint32_t data_len,
+ struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ uint32_t buf_len = 0, buf_offset = 0;
+ uint32_t len, offset_in_block, offset_blocks;
+ uint16_t guard = 0;
+ struct _dif_sgl sgl;
+ int rc = 0;
+
+ if (iovs == NULL || iovcnt == 0) {
+ return -EINVAL;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ guard = ctx->last_guard;
+ }
+
+ _dif_sgl_init(&sgl, iovs, iovcnt);
+
+ rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx);
+ if (rc != 0) {
+ return rc;
+ }
+
+ while (buf_len != 0) {
+ len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size));
+ offset_in_block = buf_offset % ctx->block_size;
+ offset_blocks = buf_offset / ctx->block_size;
+
+ rc = _dif_verify_split(&sgl, offset_in_block, len, &guard, offset_blocks,
+ ctx, err_blk);
+ if (rc != 0) {
+ goto error;
+ }
+
+ buf_len -= len;
+ buf_offset += len;
+ }
+
+ if (ctx->dif_flags & SPDK_DIF_FLAGS_GUARD_CHECK) {
+ ctx->last_guard = guard;
+ }
+error:
+ return rc;
+}
+
+int
+spdk_dif_update_crc32c_stream(struct iovec *iovs, int iovcnt,
+ uint32_t data_offset, uint32_t data_len,
+ uint32_t *_crc32c, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t buf_len = 0, buf_offset = 0, len, offset_in_block;
+ uint32_t crc32c;
+ struct _dif_sgl sgl;
+ int rc;
+
+ if (iovs == NULL || iovcnt == 0) {
+ return -EINVAL;
+ }
+
+ crc32c = *_crc32c;
+ _dif_sgl_init(&sgl, iovs, iovcnt);
+
+ rc = _dif_sgl_setup_stream(&sgl, &buf_offset, &buf_len, data_offset, data_len, ctx);
+ if (rc != 0) {
+ return rc;
+ }
+
+ while (buf_len != 0) {
+ len = spdk_min(buf_len, _to_next_boundary(buf_offset, ctx->block_size));
+ offset_in_block = buf_offset % ctx->block_size;
+
+ crc32c = _dif_update_crc32c_split(&sgl, offset_in_block, len, crc32c, ctx);
+
+ buf_len -= len;
+ buf_offset += len;
+ }
+
+ *_crc32c = crc32c;
+
+ return 0;
+}
+
+void
+spdk_dif_get_range_with_md(uint32_t data_offset, uint32_t data_len,
+ uint32_t *_buf_offset, uint32_t *_buf_len,
+ const struct spdk_dif_ctx *ctx)
+{
+ uint32_t data_block_size, data_unalign, buf_offset, buf_len;
+
+ if (!ctx->md_interleave) {
+ buf_offset = data_offset;
+ buf_len = data_len;
+ } else {
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ data_unalign = data_offset % data_block_size;
+
+ buf_offset = _to_size_with_md(data_offset, data_block_size, ctx->block_size);
+ buf_len = _to_size_with_md(data_unalign + data_len, data_block_size, ctx->block_size) -
+ data_unalign;
+ }
+
+ if (_buf_offset != NULL) {
+ *_buf_offset = buf_offset;
+ }
+
+ if (_buf_len != NULL) {
+ *_buf_len = buf_len;
+ }
+}
+
+uint32_t
+spdk_dif_get_length_with_md(uint32_t data_len, const struct spdk_dif_ctx *ctx)
+{
+ uint32_t data_block_size;
+
+ if (!ctx->md_interleave) {
+ return data_len;
+ } else {
+ data_block_size = ctx->block_size - ctx->md_size;
+
+ return _to_size_with_md(data_len, data_block_size, ctx->block_size);
+ }
+}
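
A sizing sketch (block sizes illustrative): these two helpers translate data-only offsets and lengths into their extended-buffer equivalents before buffers are allocated or sliced.

    #include "spdk/dif.h"

    /* For a ctx with 4096B data blocks interleaved with 8B metadata,
     * 8192 bytes of payload need an 8208-byte extended buffer.
     */
    static uint32_t
    extended_len(const struct spdk_dif_ctx *ctx)
    {
        return spdk_dif_get_length_with_md(8192, ctx);
    }
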
+
+static int
+_dif_remap_ref_tag(struct _dif_sgl *sgl, uint32_t offset_blocks,
+ const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk)
+{
+ uint32_t offset, buf_len, expected = 0, _actual, remapped;
+ void *buf;
+ struct _dif_sgl tmp_sgl;
+ struct spdk_dif dif;
+
+ /* Fast forward to DIF field. */
+ _dif_sgl_advance(sgl, ctx->guard_interval);
+ _dif_sgl_copy(&tmp_sgl, sgl);
+
+ /* Copy the split DIF field to the temporary DIF buffer */
+ offset = 0;
+ while (offset < sizeof(struct spdk_dif)) {
+ _dif_sgl_get_buf(sgl, &buf, &buf_len);
+ buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset);
+
+ memcpy((uint8_t *)&dif + offset, buf, buf_len);
+
+ _dif_sgl_advance(sgl, buf_len);
+ offset += buf_len;
+ }
+
+ switch (ctx->dif_type) {
+ case SPDK_DIF_TYPE1:
+ case SPDK_DIF_TYPE2:
+ /* If Type 1 or 2 is used, then all DIF checks are disabled when
+ * the Application Tag is 0xFFFF.
+ */
+ if (dif.app_tag == 0xFFFF) {
+ goto end;
+ }
+ break;
+ case SPDK_DIF_TYPE3:
+ /* If Type 3 is used, then all DIF checks are disabled when the
+ * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF.
+ */
+ if (dif.app_tag == 0xFFFF && dif.ref_tag == 0xFFFFFFFF) {
+ goto end;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* For type 1 and 2, the Reference Tag is incremented for each
+ * subsequent logical block. For type 3, the Reference Tag
+ * remains the same as the initial Reference Tag.
+ */
+ if (ctx->dif_type != SPDK_DIF_TYPE3) {
+ expected = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks;
+ remapped = ctx->remapped_init_ref_tag + ctx->ref_tag_offset + offset_blocks;
+ } else {
+ remapped = ctx->remapped_init_ref_tag;
+ }
+
+ /* Verify the stored Reference Tag. */
+ switch (ctx->dif_type) {
+ case SPDK_DIF_TYPE1:
+ case SPDK_DIF_TYPE2:
+ /* Compare the DIF Reference Tag field to the computed Reference Tag.
+ * The computed Reference Tag will be the least significant 4 bytes
+ * of the LBA when Type 1 is used, and application specific value
+ * if Type 2 is used.
+ */
+ _actual = from_be32(&dif.ref_tag);
+ if (_actual != expected) {
+ _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, expected,
+ _actual, offset_blocks);
+ SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \
+ " Expected=%x, Actual=%x\n",
+ expected, expected, _actual);
+ return -1;
+ }
+ break;
+ case SPDK_DIF_TYPE3:
+ /* For type 3, the computed Reference Tag remains unchanged.
+ * Hence ignore the Reference Tag field.
+ */
+ break;
+ default:
+ break;
+ }
+
+ /* Update the stored Reference Tag to the remapped one. */
+ to_be32(&dif.ref_tag, remapped);
+
+ offset = 0;
+ while (offset < sizeof(struct spdk_dif)) {
+ _dif_sgl_get_buf(&tmp_sgl, &buf, &buf_len);
+ buf_len = spdk_min(buf_len, sizeof(struct spdk_dif) - offset);
+
+ memcpy(buf, (uint8_t *)&dif + offset, buf_len);
+
+ _dif_sgl_advance(&tmp_sgl, buf_len);
+ offset += buf_len;
+ }
+
+end:
+ _dif_sgl_advance(sgl, ctx->block_size - ctx->guard_interval - sizeof(struct spdk_dif));
+
+ return 0;
+}
+
+int
+spdk_dif_remap_ref_tag(struct iovec *iovs, int iovcnt, uint32_t num_blocks,
+ const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk)
+{
+ struct _dif_sgl sgl;
+ uint32_t offset_blocks;
+ int rc;
+
+ _dif_sgl_init(&sgl, iovs, iovcnt);
+
+ if (!_dif_sgl_is_valid(&sgl, ctx->block_size * num_blocks)) {
+ SPDK_ERRLOG("Size of iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_is_disabled(ctx->dif_type)) {
+ return 0;
+ }
+
+ if (!(ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
+ return 0;
+ }
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ rc = _dif_remap_ref_tag(&sgl, offset_blocks, ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int
+_dix_remap_ref_tag(struct _dif_sgl *md_sgl, uint32_t offset_blocks,
+ const struct spdk_dif_ctx *ctx, struct spdk_dif_error *err_blk)
+{
+ uint32_t expected = 0, _actual, remapped;
+ uint8_t *md_buf;
+ struct spdk_dif *dif;
+
+ _dif_sgl_get_buf(md_sgl, (void *)&md_buf, NULL);
+
+ dif = (struct spdk_dif *)(md_buf + ctx->guard_interval);
+
+ switch (ctx->dif_type) {
+ case SPDK_DIF_TYPE1:
+ case SPDK_DIF_TYPE2:
+ /* If Type 1 or 2 is used, then all DIF checks are disabled when
+ * the Application Tag is 0xFFFF.
+ */
+ if (dif->app_tag == 0xFFFF) {
+ goto end;
+ }
+ break;
+ case SPDK_DIF_TYPE3:
+ /* If Type 3 is used, then all DIF checks are disabled when the
+ * Application Tag is 0xFFFF and the Reference Tag is 0xFFFFFFFF.
+ */
+ if (dif->app_tag == 0xFFFF && dif->ref_tag == 0xFFFFFFFF) {
+ goto end;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* For type 1 and 2, the Reference Tag is incremented for each
+ * subsequent logical block. For type 3, the Reference Tag
+ * remains the same as the initial Reference Tag.
+ */
+ if (ctx->dif_type != SPDK_DIF_TYPE3) {
+ expected = ctx->init_ref_tag + ctx->ref_tag_offset + offset_blocks;
+ remapped = ctx->remapped_init_ref_tag + ctx->ref_tag_offset + offset_blocks;
+ } else {
+ remapped = ctx->remapped_init_ref_tag;
+ }
+
+ /* Verify the stored Reference Tag. */
+ switch (ctx->dif_type) {
+ case SPDK_DIF_TYPE1:
+ case SPDK_DIF_TYPE2:
+ /* Compare the DIF Reference Tag field to the computed Reference Tag.
+ * The computed Reference Tag will be the least significant 4 bytes
+ * of the LBA when Type 1 is used, and application specific value
+ * if Type 2 is used.
+ */
+ _actual = from_be32(&dif->ref_tag);
+ if (_actual != expected) {
+ _dif_error_set(err_blk, SPDK_DIF_REFTAG_ERROR, expected,
+ _actual, offset_blocks);
+ SPDK_ERRLOG("Failed to compare Ref Tag: LBA=%" PRIu32 "," \
+ " Expected=%x, Actual=%x\n",
+ expected, expected, _actual);
+ return -1;
+ }
+ break;
+ case SPDK_DIF_TYPE3:
+ /* For type 3, the computed Reference Tag remains unchanged.
+ * Hence ignore the Reference Tag field.
+ */
+ break;
+ default:
+ break;
+ }
+
+ /* Update the stored Reference Tag to the remapped one. */
+ to_be32(&dif->ref_tag, remapped);
+
+end:
+ _dif_sgl_advance(md_sgl, ctx->md_size);
+
+ return 0;
+}
+
+int
+spdk_dix_remap_ref_tag(struct iovec *md_iov, uint32_t num_blocks,
+ const struct spdk_dif_ctx *ctx,
+ struct spdk_dif_error *err_blk)
+{
+ struct _dif_sgl md_sgl;
+ uint32_t offset_blocks;
+ int rc;
+
+ _dif_sgl_init(&md_sgl, md_iov, 1);
+
+ if (!_dif_sgl_is_valid(&md_sgl, ctx->md_size * num_blocks)) {
+ SPDK_ERRLOG("Size of metadata iovec array is not valid.\n");
+ return -EINVAL;
+ }
+
+ if (_dif_is_disabled(ctx->dif_type)) {
+ return 0;
+ }
+
+ if (!(ctx->dif_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
+ return 0;
+ }
+
+ for (offset_blocks = 0; offset_blocks < num_blocks; offset_blocks++) {
+ rc = _dix_remap_ref_tag(&md_sgl, offset_blocks, ctx, err_blk);
+ if (rc != 0) {
+ return rc;
+ }
+ }
+
+ return 0;
+}
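
A remapping sketch: spdk_dif_ctx_set_remapped_init_ref_tag() is listed in this library's map file, and its use here (taking the ctx and the new initial Reference Tag) is an assumption based on how remapped_init_ref_tag is consumed above. The idea is that when a protected buffer is forwarded to a range starting at a different LBA, the stored Reference Tags can be rewritten in place instead of regenerating all protection information.

    #include "spdk/dif.h"

    static int
    remap_for_new_start_lba(struct iovec *iovs, int iovcnt, uint32_t num_blocks,
                            struct spdk_dif_ctx *ctx, uint32_t new_start_lba)
    {
        struct spdk_dif_error err = { 0 };

        /* Assumed helper from dif.h: record the Reference Tag base of the target range. */
        spdk_dif_ctx_set_remapped_init_ref_tag(ctx, new_start_lba);

        /* Verify each stored Reference Tag against the old expectation, then rewrite it. */
        return spdk_dif_remap_ref_tag(iovs, iovcnt, num_blocks, ctx, &err);
    }
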
diff --git a/src/spdk/lib/util/fd.c b/src/spdk/lib/util/fd.c
new file mode 100644
index 000000000..6b0d0d554
--- /dev/null
+++ b/src/spdk/lib/util/fd.c
@@ -0,0 +1,103 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/fd.h"
+
+#ifdef __linux__
+#include <linux/fs.h>
+#endif
+
+static uint64_t
+dev_get_size(int fd)
+{
+#if defined(DIOCGMEDIASIZE) /* FreeBSD */
+ off_t size;
+
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) == 0) {
+ return size;
+ }
+#elif defined(__linux__) && defined(BLKGETSIZE64)
+ uint64_t size;
+
+ if (ioctl(fd, BLKGETSIZE64, &size) == 0) {
+ return size;
+ }
+#endif
+
+ return 0;
+}
+
+uint32_t
+spdk_fd_get_blocklen(int fd)
+{
+#if defined(DKIOCGETBLOCKSIZE) /* FreeBSD */
+ uint32_t blocklen;
+
+ if (ioctl(fd, DKIOCGETBLOCKSIZE, &blocklen) == 0) {
+ return blocklen;
+ }
+#elif defined(__linux__) && defined(BLKSSZGET)
+ uint32_t blocklen;
+
+ if (ioctl(fd, BLKSSZGET, &blocklen) == 0) {
+ return blocklen;
+ }
+#endif
+
+ return 0;
+}
+
+uint64_t
+spdk_fd_get_size(int fd)
+{
+ struct stat st;
+
+ if (fstat(fd, &st) != 0) {
+ return 0;
+ }
+
+ if (S_ISLNK(st.st_mode)) {
+ return 0;
+ }
+
+ if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
+ return dev_get_size(fd);
+ } else if (S_ISREG(st.st_mode)) {
+ return st.st_size;
+ }
+
+ /* Not REG, CHR or BLK */
+ return 0;
+}
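
A usage sketch for the helpers above (the path is a caller-supplied placeholder); both return 0 when the value cannot be determined, so 0 should be treated as "unknown" rather than as a real size.

    #include "spdk/stdinc.h"
    #include "spdk/fd.h"

    static void
    print_target_geometry(const char *path)
    {
        int fd = open(path, O_RDONLY);

        if (fd < 0) {
            return;
        }

        /* Works for regular files, block devices, and character devices. */
        printf("size=%" PRIu64 " blocklen=%" PRIu32 "\n",
               spdk_fd_get_size(fd), spdk_fd_get_blocklen(fd));
        close(fd);
    }
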
diff --git a/src/spdk/lib/util/file.c b/src/spdk/lib/util/file.c
new file mode 100644
index 000000000..2ba08547b
--- /dev/null
+++ b/src/spdk/lib/util/file.c
@@ -0,0 +1,71 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/file.h"
+
+void *
+spdk_posix_file_load(FILE *file, size_t *size)
+{
+ void *newbuf, *buf = NULL;
+ size_t rc, buf_size, cur_size = 0;
+
+ *size = 0;
+ buf_size = 128 * 1024;
+
+ while (buf_size <= 1024 * 1024 * 1024) {
+ newbuf = realloc(buf, buf_size);
+ if (newbuf == NULL) {
+ free(buf);
+ return NULL;
+ }
+ buf = newbuf;
+
+ rc = fread(buf + cur_size, 1, buf_size - cur_size, file);
+ cur_size += rc;
+
+ if (feof(file)) {
+ *size = cur_size;
+ return buf;
+ }
+
+ if (ferror(file)) {
+ free(buf);
+ return NULL;
+ }
+
+ buf_size *= 2;
+ }
+
+ free(buf);
+ return NULL;
+}
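
A usage sketch: the loader grows its buffer geometrically from 128 KiB up to a 1 GiB cap and returns NULL on read errors or when the cap is exceeded; the caller owns the returned buffer and must free() it.

    #include "spdk/stdinc.h"
    #include "spdk/file.h"

    static void *
    load_whole_file(const char *path, size_t *size)
    {
        FILE *f = fopen(path, "r");
        void *buf;

        if (f == NULL) {
            return NULL;
        }

        buf = spdk_posix_file_load(f, size);   /* caller must free(buf) */
        fclose(f);
        return buf;
    }
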
diff --git a/src/spdk/lib/util/iov.c b/src/spdk/lib/util/iov.c
new file mode 100644
index 000000000..e89ef9d21
--- /dev/null
+++ b/src/spdk/lib/util/iov.c
@@ -0,0 +1,111 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/util.h"
+
+size_t
+spdk_iovcpy(struct iovec *siov, size_t siovcnt, struct iovec *diov, size_t diovcnt)
+{
+ size_t total_sz;
+ size_t sidx;
+ size_t didx;
+ int siov_len;
+ uint8_t *siov_base;
+ int diov_len;
+ uint8_t *diov_base;
+
+ /* d prefix = destination. s prefix = source. */
+
+ assert(diovcnt > 0);
+ assert(siovcnt > 0);
+
+ total_sz = 0;
+ sidx = 0;
+ didx = 0;
+ siov_len = siov[0].iov_len;
+ siov_base = siov[0].iov_base;
+ diov_len = diov[0].iov_len;
+ diov_base = diov[0].iov_base;
+ while (siov_len > 0 && diov_len > 0) {
+ if (siov_len == diov_len) {
+ memcpy(diov_base, siov_base, siov_len);
+ total_sz += siov_len;
+
+ /* Advance both iovs to the next element */
+ sidx++;
+ if (sidx == siovcnt) {
+ break;
+ }
+
+ didx++;
+ if (didx == diovcnt) {
+ break;
+ }
+
+ siov_len = siov[sidx].iov_len;
+ siov_base = siov[sidx].iov_base;
+ diov_len = diov[didx].iov_len;
+ diov_base = diov[didx].iov_base;
+ } else if (siov_len < diov_len) {
+ memcpy(diov_base, siov_base, siov_len);
+ total_sz += siov_len;
+
+ /* Advance only the source to the next element */
+ sidx++;
+ if (sidx == siovcnt) {
+ break;
+ }
+
+ diov_base += siov_len;
+ diov_len -= siov_len;
+ siov_len = siov[sidx].iov_len;
+ siov_base = siov[sidx].iov_base;
+ } else {
+ memcpy(diov_base, siov_base, diov_len);
+ total_sz += diov_len;
+
+ /* Advance only the destination to the next element */
+ didx++;
+ if (didx == diovcnt) {
+ break;
+ }
+
+ siov_base += diov_len;
+ siov_len -= diov_len;
+ diov_len = diov[didx].iov_len;
+ diov_base = diov[didx].iov_base;
+ }
+ }
+
+ return total_sz;
+}
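
A usage sketch: copying stops as soon as either list is exhausted, and the return value is the number of bytes actually copied, so a short count signals that the destination was smaller than the source.

    #include "spdk/stdinc.h"
    #include "spdk/util.h"

    static void
    iovcpy_example(void)
    {
        char a[8] = "abcdefg";
        char b[4] = { 'h', 'i', 'j', 'k' };
        char dst[16] = { 0 };
        struct iovec src[2] = {
            { .iov_base = a, .iov_len = 8 },
            { .iov_base = b, .iov_len = 4 },
        };
        struct iovec d[1] = { { .iov_base = dst, .iov_len = sizeof(dst) } };
        size_t copied;

        /* All 12 source bytes fit into the single 16-byte destination element. */
        copied = spdk_iovcpy(src, 2, d, 1);
        assert(copied == 12);
    }
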
diff --git a/src/spdk/lib/util/math.c b/src/spdk/lib/util/math.c
new file mode 100644
index 000000000..7d1852421
--- /dev/null
+++ b/src/spdk/lib/util/math.c
@@ -0,0 +1,69 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/util.h"
+
+/* The following will automatically generate several versions of
+ * this function, targeted at different architectures. This
+ * is only supported by GCC 6 or newer. */
+#if defined(__GNUC__) && __GNUC__ >= 6 && !defined(__clang__) \
+ && (defined(__i386__) || defined(__x86_64__))
+__attribute__((target_clones("bmi", "arch=core2", "arch=atom", "default")))
+#endif
+uint32_t
+spdk_u32log2(uint32_t x)
+{
+ if (x == 0) {
+ /* log(0) is undefined */
+ return 0;
+ }
+ return 31u - __builtin_clz(x);
+}
+
+/* The following will automatically generate several versions of
+ * this function, targeted at different architectures. This
+ * is only supported by GCC 6 or newer. */
+#if defined(__GNUC__) && __GNUC__ >= 6 && !defined(__clang__) \
+ && (defined(__i386__) || defined(__x86_64__))
+__attribute__((target_clones("bmi", "arch=core2", "arch=atom", "default")))
+#endif
+uint64_t
+spdk_u64log2(uint64_t x)
+{
+ if (x == 0) {
+ /* log(0) is undefined */
+ return 0;
+ }
+ return 63u - __builtin_clzl(x);
+}
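
A few concrete values for reference; the target_clones attribute above only selects a machine-specific clone at load time and does not change the results.

    #include "spdk/stdinc.h"
    #include "spdk/util.h"

    static void
    log2_examples(void)
    {
        assert(spdk_u32log2(1) == 0);
        assert(spdk_u32log2(4096) == 12);   /* exact power of two */
        assert(spdk_u64log2(5000) == 12);   /* rounds down to floor(log2(x)) */
        assert(spdk_u32log2(0) == 0);       /* log2(0) is undefined; helper returns 0 */
    }
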
diff --git a/src/spdk/lib/util/pipe.c b/src/spdk/lib/util/pipe.c
new file mode 100644
index 000000000..1c640dd2e
--- /dev/null
+++ b/src/spdk/lib/util/pipe.c
@@ -0,0 +1,246 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/pipe.h"
+#include "spdk/util.h"
+
+struct spdk_pipe {
+ uint8_t *buf;
+ uint32_t sz;
+
+ uint32_t write;
+ uint32_t read;
+};
+
+struct spdk_pipe *
+spdk_pipe_create(void *buf, uint32_t sz)
+{
+ struct spdk_pipe *pipe;
+
+ pipe = calloc(1, sizeof(*pipe));
+ if (pipe == NULL) {
+ return NULL;
+ }
+
+ pipe->buf = buf;
+ pipe->sz = sz;
+
+ return pipe;
+}
+
+void
+spdk_pipe_destroy(struct spdk_pipe *pipe)
+{
+ free(pipe);
+}
+
+int
+spdk_pipe_writer_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struct iovec *iovs)
+{
+ uint32_t sz;
+ uint32_t read;
+ uint32_t write;
+
+ read = pipe->read;
+ write = pipe->write;
+
+ if (read <= write) {
+ requested_sz = spdk_min(requested_sz, ((read + pipe->sz) - write - 1));
+
+ sz = spdk_min(requested_sz, pipe->sz - write);
+
+ iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + write);
+ iovs[0].iov_len = sz;
+
+ requested_sz -= sz;
+
+ if (requested_sz > 0) {
+ sz = spdk_min(requested_sz, read);
+
+ iovs[1].iov_base = (sz == 0) ? NULL : pipe->buf;
+ iovs[1].iov_len = sz;
+ } else {
+ iovs[1].iov_base = NULL;
+ iovs[1].iov_len = 0;
+ }
+ } else {
+ sz = spdk_min(requested_sz, read - write - 1);
+
+ iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + write);
+ iovs[0].iov_len = sz;
+ iovs[1].iov_base = NULL;
+ iovs[1].iov_len = 0;
+ }
+
+ return iovs[0].iov_len + iovs[1].iov_len;
+}
+
+int
+spdk_pipe_writer_advance(struct spdk_pipe *pipe, uint32_t requested_sz)
+{
+ uint32_t sz;
+ uint32_t read;
+ uint32_t write;
+
+ read = pipe->read;
+ write = pipe->write;
+
+ if (requested_sz > pipe->sz - 1) {
+ return -EINVAL;
+ }
+
+ if (read <= write) {
+ if (requested_sz > (read + pipe->sz) - write) {
+ return -EINVAL;
+ }
+
+ sz = spdk_min(requested_sz, pipe->sz - write);
+
+ write += sz;
+ if (write > pipe->sz - 1) {
+ write = 0;
+ }
+ requested_sz -= sz;
+
+ if (requested_sz > 0) {
+ if (requested_sz >= read) {
+ return -EINVAL;
+ }
+
+ write = requested_sz;
+ }
+ } else {
+ if (requested_sz > (read - write - 1)) {
+ return -EINVAL;
+ }
+
+ write += requested_sz;
+ }
+
+ pipe->write = write;
+
+ return 0;
+}
+
+uint32_t
+spdk_pipe_reader_bytes_available(struct spdk_pipe *pipe)
+{
+ uint32_t read;
+ uint32_t write;
+
+ read = pipe->read;
+ write = pipe->write;
+
+ if (read <= write) {
+ return write - read;
+ }
+
+ return (write + pipe->sz) - read;
+}
+
+int
+spdk_pipe_reader_get_buffer(struct spdk_pipe *pipe, uint32_t requested_sz, struct iovec *iovs)
+{
+ uint32_t sz;
+ uint32_t read;
+ uint32_t write;
+
+ read = pipe->read;
+ write = pipe->write;
+
+ if (read <= write) {
+ sz = spdk_min(requested_sz, write - read);
+
+ iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + read);
+ iovs[0].iov_len = sz;
+ iovs[1].iov_base = NULL;
+ iovs[1].iov_len = 0;
+ } else {
+ sz = spdk_min(requested_sz, pipe->sz - read);
+
+ iovs[0].iov_base = (sz == 0) ? NULL : (pipe->buf + read);
+ iovs[0].iov_len = sz;
+
+ requested_sz -= sz;
+
+ if (requested_sz > 0) {
+ sz = spdk_min(requested_sz, write);
+ iovs[1].iov_base = (sz == 0) ? NULL : pipe->buf;
+ iovs[1].iov_len = sz;
+ } else {
+ iovs[1].iov_base = NULL;
+ iovs[1].iov_len = 0;
+ }
+ }
+
+ return iovs[0].iov_len + iovs[1].iov_len;
+}
+
+int
+spdk_pipe_reader_advance(struct spdk_pipe *pipe, uint32_t requested_sz)
+{
+ uint32_t sz;
+ uint32_t read;
+ uint32_t write;
+
+ read = pipe->read;
+ write = pipe->write;
+
+ if (read <= write) {
+ if (requested_sz > (write - read)) {
+ return -EINVAL;
+ }
+
+ read += requested_sz;
+ } else {
+ sz = spdk_min(requested_sz, pipe->sz - read);
+
+ read += sz;
+ if (read > pipe->sz - 1) {
+ read = 0;
+ }
+ requested_sz -= sz;
+
+ if (requested_sz > 0) {
+ if (requested_sz > write) {
+ return -EINVAL;
+ }
+
+ read = requested_sz;
+ }
+ }
+
+ pipe->read = read;
+
+ return 0;
+}
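
A single-threaded producer/consumer sketch over a caller-supplied ring buffer: the get_buffer calls may return two iovec elements because a request can wrap around the end of the buffer, and one byte is always kept free to distinguish a full pipe from an empty one.

    #include "spdk/stdinc.h"
    #include "spdk/pipe.h"

    static void
    pipe_example(void)
    {
        uint8_t ring[64];
        struct spdk_pipe *pipe = spdk_pipe_create(ring, sizeof(ring));
        struct iovec iovs[2];
        int sz;

        if (pipe == NULL) {
            return;
        }

        /* Producer: reserve space, fill it, then publish it. */
        sz = spdk_pipe_writer_get_buffer(pipe, 10, iovs);
        if (sz == 10) {
            memset(iovs[0].iov_base, 'a', iovs[0].iov_len);
            if (iovs[1].iov_len > 0) {
                memset(iovs[1].iov_base, 'a', iovs[1].iov_len);
            }
            spdk_pipe_writer_advance(pipe, 10);
        }

        /* Consumer: at most 63 of the 64 bytes are ever readable at once. */
        assert(spdk_pipe_reader_bytes_available(pipe) == 10);
        sz = spdk_pipe_reader_get_buffer(pipe, 10, iovs);
        if (sz == 10) {
            spdk_pipe_reader_advance(pipe, 10);
        }

        spdk_pipe_destroy(pipe);
    }
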
diff --git a/src/spdk/lib/util/spdk_util.map b/src/spdk/lib/util/spdk_util.map
new file mode 100644
index 000000000..07e067faa
--- /dev/null
+++ b/src/spdk/lib/util/spdk_util.map
@@ -0,0 +1,128 @@
+{
+ global:
+
+ # public functions in base64.h
+ spdk_base64_encode;
+ spdk_base64_urlsafe_encode;
+ spdk_base64_decode;
+ spdk_base64_urlsafe_decode;
+
+ # public functions in bit_array.h
+ spdk_bit_array_capacity;
+ spdk_bit_array_create;
+ spdk_bit_array_free;
+ spdk_bit_array_resize;
+ spdk_bit_array_get;
+ spdk_bit_array_set;
+ spdk_bit_array_clear;
+ spdk_bit_array_find_first_set;
+ spdk_bit_array_find_first_clear;
+ spdk_bit_array_count_set;
+ spdk_bit_array_count_clear;
+ spdk_bit_array_store_mask;
+ spdk_bit_array_load_mask;
+ spdk_bit_array_clear_mask;
+
+ # public functions in cpuset.h
+ spdk_cpuset_alloc;
+ spdk_cpuset_free;
+ spdk_cpuset_equal;
+ spdk_cpuset_copy;
+ spdk_cpuset_and;
+ spdk_cpuset_or;
+ spdk_cpuset_xor;
+ spdk_cpuset_negate;
+ spdk_cpuset_zero;
+ spdk_cpuset_set_cpu;
+ spdk_cpuset_get_cpu;
+ spdk_cpuset_count;
+ spdk_cpuset_fmt;
+ spdk_cpuset_parse;
+
+ # public functions in crc16.h
+ spdk_crc16_t10dif;
+ spdk_crc16_t10dif_copy;
+
+ # public functions in crc32.h
+ spdk_crc32_ieee_update;
+ spdk_crc32c_update;
+
+ # public functions in dif.h
+ spdk_dif_ctx_init;
+ spdk_dif_ctx_set_data_offset;
+ spdk_dif_ctx_set_remapped_init_ref_tag;
+ spdk_dif_generate;
+ spdk_dif_verify;
+ spdk_dif_update_crc32c;
+ spdk_dif_generate_copy;
+ spdk_dif_verify_copy;
+ spdk_dif_inject_error;
+ spdk_dix_generate;
+ spdk_dix_verify;
+ spdk_dix_inject_error;
+ spdk_dif_set_md_interleave_iovs;
+ spdk_dif_generate_stream;
+ spdk_dif_verify_stream;
+ spdk_dif_update_crc32c_stream;
+ spdk_dif_get_range_with_md;
+ spdk_dif_get_length_with_md;
+ spdk_dif_remap_ref_tag;
+ spdk_dix_remap_ref_tag;
+
+ # public functions in fd.h
+ spdk_fd_get_size;
+ spdk_fd_get_blocklen;
+
+ # public functions in file.h
+ spdk_posix_file_load;
+
+ # public functions in pipe.h
+ spdk_pipe_create;
+ spdk_pipe_destroy;
+ spdk_pipe_writer_get_buffer;
+ spdk_pipe_writer_advance;
+ spdk_pipe_reader_bytes_available;
+ spdk_pipe_reader_get_buffer;
+ spdk_pipe_reader_advance;
+
+ # public functions in string.h
+ spdk_sprintf_alloc;
+ spdk_vsprintf_alloc;
+ spdk_sprintf_append_realloc;
+ spdk_vsprintf_append_realloc;
+ spdk_strlwr;
+ spdk_strsepq;
+ spdk_str_trim;
+ spdk_strerror_r;
+ spdk_strerror;
+ spdk_str_chomp;
+ spdk_strcpy_pad;
+ spdk_strlen_pad;
+ spdk_parse_ip_addr;
+ spdk_parse_capacity;
+ spdk_mem_all_zero;
+ spdk_strtol;
+ spdk_strtoll;
+
+ # public functions in util.h
+ spdk_u32log2;
+ spdk_u64log2;
+ spdk_iovcpy;
+
+ # resolvers for functions in util.h
+ spdk_u32log2.resolver;
+ spdk_u64log2.resolver;
+
+ # public functions in uuid.h
+ spdk_uuid_parse;
+ spdk_uuid_fmt_lower;
+ spdk_uuid_compare;
+ spdk_uuid_generate;
+ spdk_uuid_copy;
+
+
+
+
+
+ local: *;
+};
diff --git a/src/spdk/lib/util/strerror_tls.c b/src/spdk/lib/util/strerror_tls.c
new file mode 100644
index 000000000..c9dc8f13f
--- /dev/null
+++ b/src/spdk/lib/util/strerror_tls.c
@@ -0,0 +1,43 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/string.h"
+
+static __thread char strerror_message[64];
+
+const char *
+spdk_strerror(int errnum)
+{
+ spdk_strerror_r(errnum, strerror_message, sizeof(strerror_message));
+ return strerror_message;
+}
diff --git a/src/spdk/lib/util/string.c b/src/spdk/lib/util/string.c
new file mode 100644
index 000000000..30ac1628a
--- /dev/null
+++ b/src/spdk/lib/util/string.c
@@ -0,0 +1,476 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/string.h"
+
+char *
+spdk_vsprintf_append_realloc(char *buffer, const char *format, va_list args)
+{
+ va_list args_copy;
+ char *new_buffer;
+ int orig_size = 0, new_size;
+
+ /* Original buffer size */
+ if (buffer) {
+ orig_size = strlen(buffer);
+ }
+
+ /* Necessary buffer size */
+ va_copy(args_copy, args);
+ new_size = vsnprintf(NULL, 0, format, args_copy);
+ va_end(args_copy);
+
+ if (new_size < 0) {
+ return NULL;
+ }
+ new_size += orig_size + 1;
+
+ new_buffer = realloc(buffer, new_size);
+ if (new_buffer == NULL) {
+ return NULL;
+ }
+
+ vsnprintf(new_buffer + orig_size, new_size - orig_size, format, args);
+
+ return new_buffer;
+}
+
+char *
+spdk_sprintf_append_realloc(char *buffer, const char *format, ...)
+{
+ va_list args;
+ char *ret;
+
+ va_start(args, format);
+ ret = spdk_vsprintf_append_realloc(buffer, format, args);
+ va_end(args);
+
+ return ret;
+}
+
+char *
+spdk_vsprintf_alloc(const char *format, va_list args)
+{
+ return spdk_vsprintf_append_realloc(NULL, format, args);
+}
+
+char *
+spdk_sprintf_alloc(const char *format, ...)
+{
+ va_list args;
+ char *ret;
+
+ va_start(args, format);
+ ret = spdk_vsprintf_alloc(format, args);
+ va_end(args);
+
+ return ret;
+}
+
+char *
+spdk_strlwr(char *s)
+{
+ char *p;
+
+ if (s == NULL) {
+ return NULL;
+ }
+
+ p = s;
+ while (*p != '\0') {
+ *p = tolower(*p);
+ p++;
+ }
+
+ return s;
+}
+
+char *
+spdk_strsepq(char **stringp, const char *delim)
+{
+ char *p, *q, *r;
+ int quoted = 0, bslash = 0;
+
+ p = *stringp;
+ if (p == NULL) {
+ return NULL;
+ }
+
+ r = q = p;
+ while (*q != '\0' && *q != '\n') {
+ /* eat quoted characters */
+ if (bslash) {
+ bslash = 0;
+ *r++ = *q++;
+ continue;
+ } else if (quoted) {
+ if (quoted == '"' && *q == '\\') {
+ bslash = 1;
+ q++;
+ continue;
+ } else if (*q == quoted) {
+ quoted = 0;
+ q++;
+ continue;
+ }
+ *r++ = *q++;
+ continue;
+ } else if (*q == '\\') {
+ bslash = 1;
+ q++;
+ continue;
+ } else if (*q == '"' || *q == '\'') {
+ quoted = *q;
+ q++;
+ continue;
+ }
+
+ /* separator? */
+ if (strchr(delim, *q) == NULL) {
+ *r++ = *q++;
+ continue;
+ }
+
+ /* new string */
+ q++;
+ break;
+ }
+ *r = '\0';
+
+ /* skip trailing delimiters */
+ while (*q != '\0' && strchr(delim, *q) != NULL) {
+ q++;
+ }
+ if (*q != '\0') {
+ *stringp = q;
+ } else {
+ *stringp = NULL;
+ }
+
+ return p;
+}
+
+char *
+spdk_str_trim(char *s)
+{
+ char *p, *q;
+
+ if (s == NULL) {
+ return NULL;
+ }
+
+ /* skip leading whitespace */
+ p = s;
+ while (*p != '\0' && isspace(*p)) {
+ p++;
+ }
+
+ /* trim trailing whitespace */
+ q = p + strlen(p);
+ while (q - 1 >= p && isspace(*(q - 1))) {
+ q--;
+ *q = '\0';
+ }
+
+ /* if there was leading whitespace, shift the string left */
+ if (p != s) {
+ q = s;
+ while (*p != '\0') {
+ *q++ = *p++;
+ }
+ *q = '\0';
+ }
+
+ return s;
+}
+
+void
+spdk_strcpy_pad(void *dst, const char *src, size_t size, int pad)
+{
+ size_t len;
+
+ len = strlen(src);
+ if (len < size) {
+ memcpy(dst, src, len);
+ memset((char *)dst + len, pad, size - len);
+ } else {
+ memcpy(dst, src, size);
+ }
+}
+
+size_t
+spdk_strlen_pad(const void *str, size_t size, int pad)
+{
+ const uint8_t *start;
+ const uint8_t *iter;
+ uint8_t pad_byte;
+
+ pad_byte = (uint8_t)pad;
+ start = (const uint8_t *)str;
+
+ if (size == 0) {
+ return 0;
+ }
+
+ iter = start + size - 1;
+ while (1) {
+ if (*iter != pad_byte) {
+ return iter - start + 1;
+ }
+
+ if (iter == start) {
+ /* Hit the start of the string finding only pad_byte. */
+ return 0;
+ }
+ iter--;
+ }
+}
+
+int
+spdk_parse_ip_addr(char *ip, char **host, char **port)
+{
+ char *p;
+
+ if (ip == NULL) {
+ return -EINVAL;
+ }
+
+ *host = NULL;
+ *port = NULL;
+
+ if (ip[0] == '[') {
+ /* IPv6 */
+ p = strchr(ip, ']');
+ if (p == NULL) {
+ return -EINVAL;
+ }
+ *host = &ip[1];
+ *p = '\0';
+
+ p++;
+ if (*p == '\0') {
+ return 0;
+ } else if (*p != ':') {
+ return -EINVAL;
+ }
+
+ p++;
+ if (*p == '\0') {
+ return 0;
+ }
+
+ *port = p;
+ } else {
+ /* IPv4 */
+ p = strchr(ip, ':');
+ if (p == NULL) {
+ *host = ip;
+ return 0;
+ }
+
+ *host = ip;
+ *p = '\0';
+
+ p++;
+ if (*p == '\0') {
+ return 0;
+ }
+
+ *port = p;
+ }
+
+ return 0;
+}
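
A usage sketch: the parser writes NUL terminators into the input string, so it must be writable; *port stays NULL when no port is present.

    #include "spdk/stdinc.h"
    #include "spdk/string.h"

    static void
    parse_ip_examples(void)
    {
        char v4[] = "192.168.0.1:3260";
        char v6[] = "[2001:db8::1]:4420";
        char *host = NULL, *port = NULL;

        if (spdk_parse_ip_addr(v4, &host, &port) == 0) {
            /* host -> "192.168.0.1", port -> "3260" */
        }

        if (spdk_parse_ip_addr(v6, &host, &port) == 0) {
            /* host -> "2001:db8::1" (brackets stripped), port -> "4420" */
        }
    }
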
+
+size_t
+spdk_str_chomp(char *s)
+{
+ size_t len = strlen(s);
+ size_t removed = 0;
+
+ while (len > 0) {
+ if (s[len - 1] != '\r' && s[len - 1] != '\n') {
+ break;
+ }
+
+ s[len - 1] = '\0';
+ len--;
+ removed++;
+ }
+
+ return removed;
+}
+
+void
+spdk_strerror_r(int errnum, char *buf, size_t buflen)
+{
+ int rc;
+
+#if defined(__USE_GNU)
+ char *new_buffer;
+ new_buffer = strerror_r(errnum, buf, buflen);
+ if (new_buffer == buf) {
+ rc = 0;
+ } else if (new_buffer != NULL) {
+ snprintf(buf, buflen, "%s", new_buffer);
+ rc = 0;
+ } else {
+ rc = 1;
+ }
+#else
+ rc = strerror_r(errnum, buf, buflen);
+#endif
+
+ if (rc != 0) {
+ snprintf(buf, buflen, "Unknown error %d", errnum);
+ }
+}
+
+int
+spdk_parse_capacity(const char *cap_str, uint64_t *cap, bool *has_prefix)
+{
+ int rc;
+ char bin_prefix;
+
+ rc = sscanf(cap_str, "%"SCNu64"%c", cap, &bin_prefix);
+ if (rc == 1) {
+ *has_prefix = false;
+ return 0;
+ } else if (rc == 0) {
+ if (errno == 0) {
+ /* No scanf matches - the string does not start with a digit */
+ return -EINVAL;
+ } else {
+ /* Parsing error */
+ return -errno;
+ }
+ }
+
+ *has_prefix = true;
+ switch (bin_prefix) {
+ case 'k':
+ case 'K':
+ *cap *= 1024;
+ break;
+ case 'm':
+ case 'M':
+ *cap *= 1024 * 1024;
+ break;
+ case 'g':
+ case 'G':
+ *cap *= 1024 * 1024 * 1024;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
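
A usage sketch: the K/M/G suffixes are binary multipliers (powers of 1024), and has_prefix reports whether a suffix was seen.

    #include "spdk/stdinc.h"
    #include "spdk/string.h"

    static void
    parse_capacity_example(void)
    {
        uint64_t cap = 0;
        bool has_prefix = false;

        if (spdk_parse_capacity("128M", &cap, &has_prefix) == 0) {
            assert(cap == 128ULL * 1024 * 1024 && has_prefix);
        }
    }
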
+
+bool
+spdk_mem_all_zero(const void *data, size_t size)
+{
+ const uint8_t *buf = data;
+
+ while (size--) {
+ if (*buf++ != 0) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+long int
+spdk_strtol(const char *nptr, int base)
+{
+ long val;
+ char *endptr;
+
+ /* Since strtol() can legitimately return 0, LONG_MAX, or LONG_MIN
+ * on both success and failure, the calling program should set errno
+ * to 0 before the call.
+ */
+ errno = 0;
+
+ val = strtol(nptr, &endptr, base);
+
+ if (!errno && *endptr != '\0') {
+ /* Non integer character was found. */
+ return -EINVAL;
+ } else if (errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) {
+ /* Overflow occurred. */
+ return -ERANGE;
+ } else if (errno != 0 && val == 0) {
+ /* Other error occurred. */
+ return -errno;
+ } else if (val < 0) {
+ /* Input string was negative number. */
+ return -ERANGE;
+ }
+
+ return val;
+}
+
+long long int
+spdk_strtoll(const char *nptr, int base)
+{
+ long long val;
+ char *endptr;
+
+ /* Since strtoll() can legitimately return 0, LLONG_MAX, or LLONG_MIN
+ * on both success and failure, the calling program should set errno
+ * to 0 before the call.
+ */
+ errno = 0;
+
+ val = strtoll(nptr, &endptr, base);
+
+ if (!errno && *endptr != '\0') {
+ /* Non integer character was found. */
+ return -EINVAL;
+ } else if (errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) {
+ /* Overflow occurred. */
+ return -ERANGE;
+ } else if (errno != 0 && val == 0) {
+ /* Other error occurred. */
+ return -errno;
+ } else if (val < 0) {
+ /* Input string was negative number. */
+ return -ERANGE;
+ }
+
+ return val;
+}
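
A usage sketch: both helpers fold errors into negative return values, which also means they reject inputs that are themselves negative numbers.

    #include "spdk/stdinc.h"
    #include "spdk/string.h"

    static void
    strtol_examples(void)
    {
        assert(spdk_strtol("42", 10) == 42);
        assert(spdk_strtol("4x", 10) == -EINVAL);   /* trailing garbage */
        assert(spdk_strtol("-5", 10) == -ERANGE);   /* negative input is rejected */
        assert(spdk_strtoll("1099511627776", 10) == 1099511627776LL);
    }
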
diff --git a/src/spdk/lib/util/util_internal.h b/src/spdk/lib/util/util_internal.h
new file mode 100644
index 000000000..655ef513d
--- /dev/null
+++ b/src/spdk/lib/util/util_internal.h
@@ -0,0 +1,77 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama <aoyama@peach.ne.jp>.
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_UTIL_INTERNAL_H
+#define SPDK_UTIL_INTERNAL_H
+
+#include "spdk/stdinc.h"
+
+/**
+ * IEEE CRC-32 polynomial (bit reflected)
+ */
+#define SPDK_CRC32_POLYNOMIAL_REFLECT 0xedb88320UL
+
+/**
+ * CRC-32C (Castagnoli) polynomial (bit reflected)
+ */
+#define SPDK_CRC32C_POLYNOMIAL_REFLECT 0x82f63b78UL
+
+struct spdk_crc32_table {
+ uint32_t table[256];
+};
+
+/**
+ * Initialize a CRC32 lookup table for a given polynomial.
+ *
+ * \param table Table to fill with precalculated CRC-32 data.
+ * \param polynomial_reflect Bit-reflected CRC-32 polynomial.
+ */
+void crc32_table_init(struct spdk_crc32_table *table,
+ uint32_t polynomial_reflect);
+
+
+/**
+ * Calculate a partial CRC-32 checksum.
+ *
+ * \param table CRC-32 table initialized with crc32_table_init().
+ * \param buf Data buffer to checksum.
+ * \param len Length of buf in bytes.
+ * \param crc Previous CRC-32 value.
+ * \return Updated CRC-32 value.
+ */
+uint32_t crc32_update(const struct spdk_crc32_table *table,
+ const void *buf, size_t len,
+ uint32_t crc);
+
+#endif /* SPDK_UTIL_INTERNAL_H */
diff --git a/src/spdk/lib/util/uuid.c b/src/spdk/lib/util/uuid.c
new file mode 100644
index 000000000..176f65880
--- /dev/null
+++ b/src/spdk/lib/util/uuid.c
@@ -0,0 +1,73 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/uuid.h"
+
+#include <uuid/uuid.h>
+
+SPDK_STATIC_ASSERT(sizeof(struct spdk_uuid) == sizeof(uuid_t), "Size mismatch");
+
+int
+spdk_uuid_parse(struct spdk_uuid *uuid, const char *uuid_str)
+{
+ return uuid_parse(uuid_str, (void *)uuid) == 0 ? 0 : -EINVAL;
+}
+
+int
+spdk_uuid_fmt_lower(char *uuid_str, size_t uuid_str_size, const struct spdk_uuid *uuid)
+{
+ if (uuid_str_size < SPDK_UUID_STRING_LEN) {
+ return -EINVAL;
+ }
+
+ uuid_unparse_lower((void *)uuid, uuid_str);
+ return 0;
+}
+
+int
+spdk_uuid_compare(const struct spdk_uuid *u1, const struct spdk_uuid *u2)
+{
+ return uuid_compare((void *)u1, (void *)u2);
+}
+
+void
+spdk_uuid_generate(struct spdk_uuid *uuid)
+{
+ uuid_generate((void *)uuid);
+}
+
+void
+spdk_uuid_copy(struct spdk_uuid *dst, const struct spdk_uuid *src)
+{
+ uuid_copy((void *)dst, (void *)src);
+}
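
A usage sketch: SPDK_UUID_STRING_LEN (from uuid.h) already accounts for the terminating NUL, so a buffer of exactly that size is sufficient for formatting.

    #include "spdk/stdinc.h"
    #include "spdk/uuid.h"

    static void
    uuid_example(void)
    {
        struct spdk_uuid u, parsed;
        char str[SPDK_UUID_STRING_LEN];

        spdk_uuid_generate(&u);

        /* Format, re-parse, and confirm the round trip preserves the value. */
        if (spdk_uuid_fmt_lower(str, sizeof(str), &u) == 0 &&
            spdk_uuid_parse(&parsed, str) == 0) {
            assert(spdk_uuid_compare(&u, &parsed) == 0);
        }
    }
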
diff --git a/src/spdk/lib/vhost/Makefile b/src/spdk/lib/vhost/Makefile
new file mode 100644
index 000000000..1fe9b6e40
--- /dev/null
+++ b/src/spdk/lib/vhost/Makefile
@@ -0,0 +1,54 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 4
+SO_MINOR := 0
+
+CFLAGS += -I.
+CFLAGS += $(ENV_CFLAGS)
+
+C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c rte_vhost_compat.c
+
+ifeq ($(CONFIG_VHOST_INTERNAL_LIB),y)
+C_SRCS += vhost_nvme.c
+CFLAGS := -I../rte_vhost $(CFLAGS)
+endif
+
+LIBNAME = vhost
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vhost.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/vhost/rte_vhost_compat.c b/src/spdk/lib/vhost/rte_vhost_compat.c
new file mode 100644
index 000000000..53f31bfd7
--- /dev/null
+++ b/src/spdk/lib/vhost/rte_vhost_compat.c
@@ -0,0 +1,402 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * Set of workarounds for rte_vhost to make it work with device types
+ * other than vhost-net.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+#include "spdk/barrier.h"
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+
+#include "spdk_internal/vhost_user.h"
+
+static inline void
+vhost_session_mem_region_calc(uint64_t *previous_start, uint64_t *start, uint64_t *end,
+ uint64_t *len, struct rte_vhost_mem_region *region)
+{
+ *start = FLOOR_2MB(region->mmap_addr);
+ *end = CEIL_2MB(region->mmap_addr + region->mmap_size);
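+ /* spdk_mem_register() operates at 2MB hugepage granularity, hence the
+ * alignment. If this region starts in the same 2MB page that the previous
+ * region already covered, skip that page; this presumably avoids
+ * registering the same hugepage twice.
+ */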
+ if (*start == *previous_start) {
+ *start += (size_t) VALUE_2MB;
+ }
+ *previous_start = *start;
+ *len = *end - *start;
+}
+
+void
+vhost_session_mem_register(struct rte_vhost_memory *mem)
+{
+ uint64_t start, end, len;
+ uint32_t i;
+ uint64_t previous_start = UINT64_MAX;
+
+
+ for (i = 0; i < mem->nregions; i++) {
+ vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]);
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n",
+ start, len);
+
+ if (spdk_mem_register((void *)start, len) != 0) {
+ SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n",
+ i);
+ continue;
+ }
+ }
+}
+
+void
+vhost_session_mem_unregister(struct rte_vhost_memory *mem)
+{
+ uint64_t start, end, len;
+ uint32_t i;
+ uint64_t previous_start = UINT64_MAX;
+
+ for (i = 0; i < mem->nregions; i++) {
+ vhost_session_mem_region_calc(&previous_start, &start, &end, &len, &mem->regions[i]);
+ if (spdk_vtophys((void *) start, NULL) == SPDK_VTOPHYS_ERROR) {
+ continue; /* region has not been registered */
+ }
+
+ if (spdk_mem_unregister((void *)start, len) != 0) {
+ assert(false);
+ }
+ }
+}
+
+static int
+new_connection(int vid)
+{
+ char ifname[PATH_MAX];
+
+ if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) {
+ SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid);
+ return -1;
+ }
+
+ return vhost_new_connection_cb(vid, ifname);
+}
+
+static int
+start_device(int vid)
+{
+ return vhost_start_device_cb(vid);
+}
+
+static void
+stop_device(int vid)
+{
+ vhost_stop_device_cb(vid);
+}
+
+static void
+destroy_connection(int vid)
+{
+ vhost_destroy_connection_cb(vid);
+}
+
+static const struct vhost_device_ops g_spdk_vhost_ops = {
+ .new_device = start_device,
+ .destroy_device = stop_device,
+ .new_connection = new_connection,
+ .destroy_connection = destroy_connection,
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ .get_config = vhost_get_config_cb,
+ .set_config = vhost_set_config_cb,
+ .vhost_nvme_admin_passthrough = vhost_nvme_admin_passthrough,
+ .vhost_nvme_set_cq_call = vhost_nvme_set_cq_call,
+ .vhost_nvme_get_cap = vhost_nvme_get_cap,
+ .vhost_nvme_set_bar_mr = vhost_nvme_set_bar_mr,
+#endif
+};
+
+#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
+
+static enum rte_vhost_msg_result
+extern_vhost_pre_msg_handler(int vid, void *_msg)
+{
+ struct vhost_user_msg *msg = _msg;
+ struct spdk_vhost_session *vsession;
+
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid);
+ assert(false);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ switch (msg->request) {
+ case VHOST_USER_GET_VRING_BASE:
+ if (vsession->forced_polling && vsession->started) {
+ /* Our queue is stopped for whatever reason, but we may still
+ * need to poll it after it's initialized again.
+ */
+ g_spdk_vhost_ops.destroy_device(vid);
+ }
+ break;
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_ADDR:
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_KICK:
+ if (vsession->forced_polling && vsession->started) {
+ /* Additional queues are being initialized, so we either processed
+ * enough I/Os and are switching from SeaBIOS to the OS now, or
+ * we were never in SeaBIOS in the first place. Either way, we
+ * don't need our workaround anymore.
+ */
+ g_spdk_vhost_ops.destroy_device(vid);
+ vsession->forced_polling = false;
+ }
+ break;
+ case VHOST_USER_SET_VRING_CALL:
+ /* rte_vhost will close the previous callfd and won't notify
+ * us about any change. This will effectively make SPDK fail
+ * to deliver any subsequent interrupts until a session is
+ * restarted. We stop the session here before closing the previous
+ * fd (so that all interrupts must have been delivered by the
+ * time the descriptor is closed) and start right after (which
+ * will make SPDK retrieve the latest, up-to-date callfd from
+ * rte_vhost).
+ */
+ case VHOST_USER_SET_MEM_TABLE:
+ /* rte_vhost will unmap previous memory that SPDK may still
+ * have pending DMA operations on. We can't let that happen,
+ * so stop the device before letting rte_vhost unmap anything.
+ * This will block until all pending I/Os are finished.
+ * We will start the device again from the post-processing
+ * message handler.
+ */
+ if (vsession->started) {
+ g_spdk_vhost_ops.destroy_device(vid);
+ vsession->needs_restart = true;
+ }
+ break;
+ case VHOST_USER_GET_CONFIG: {
+ int rc = 0;
+
+ spdk_vhost_lock();
+ if (vsession->vdev->backend->vhost_get_config) {
+ rc = vsession->vdev->backend->vhost_get_config(vsession->vdev,
+ msg->payload.cfg.region, msg->payload.cfg.size);
+ if (rc != 0) {
+ msg->size = 0;
+ }
+ }
+ spdk_vhost_unlock();
+
+ return RTE_VHOST_MSG_RESULT_REPLY;
+ }
+ case VHOST_USER_SET_CONFIG: {
+ int rc = 0;
+
+ spdk_vhost_lock();
+ if (vsession->vdev->backend->vhost_set_config) {
+ rc = vsession->vdev->backend->vhost_set_config(vsession->vdev,
+ msg->payload.cfg.region, msg->payload.cfg.offset,
+ msg->payload.cfg.size, msg->payload.cfg.flags);
+ }
+ spdk_vhost_unlock();
+
+ return rc == 0 ? RTE_VHOST_MSG_RESULT_OK : RTE_VHOST_MSG_RESULT_ERR;
+ }
+ default:
+ break;
+ }
+
+ return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+}
+
+static enum rte_vhost_msg_result
+extern_vhost_post_msg_handler(int vid, void *_msg)
+{
+ struct vhost_user_msg *msg = _msg;
+ struct spdk_vhost_session *vsession;
+
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Received a message to unitialized session (vid %d).\n", vid);
+ assert(false);
+ return RTE_VHOST_MSG_RESULT_ERR;
+ }
+
+ if (vsession->needs_restart) {
+ g_spdk_vhost_ops.new_device(vid);
+ vsession->needs_restart = false;
+ return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+ }
+
+ switch (msg->request) {
+ case VHOST_USER_SET_FEATURES:
+ /* rte_vhost requires all queues to be fully initialized in order
+ * to start I/O processing. This behavior is not compliant with the
+ * vhost-user specification and doesn't work with QEMU 2.12+, which
+ * will only initialize 1 I/O queue for the SeaBIOS boot.
+ * Theoretically, we should start polling each virtqueue individually
+ * after receiving its SET_VRING_KICK message, but rte_vhost is not
+ * designed to poll individual queues. So here we use a workaround
+ * to detect when the vhost session could be potentially at that SeaBIOS
+ * stage and we mark it to start polling as soon as its first virtqueue
+ * gets initialized. This doesn't hurt any non-QEMU vhost slaves
+ * and allows QEMU 2.12+ to boot correctly. SET_FEATURES could be sent
+ * at any time, but QEMU will send it at least once on SeaBIOS
+ * initialization - whenever powered-up or rebooted.
+ */
+ vsession->forced_polling = true;
+ break;
+ case VHOST_USER_SET_VRING_KICK:
+ /* vhost-user spec tells us to start polling a queue after receiving
+ * its SET_VRING_KICK message. Let's do it!
+ */
+ if (vsession->forced_polling && !vsession->started) {
+ g_spdk_vhost_ops.new_device(vid);
+ }
+ break;
+ default:
+ break;
+ }
+
+ return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+}
+
+struct rte_vhost_user_extern_ops g_spdk_extern_vhost_ops = {
+ .pre_msg_handle = extern_vhost_pre_msg_handler,
+ .post_msg_handle = extern_vhost_post_msg_handler,
+};
+
+void
+vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession)
+{
+ int rc;
+
+ rc = rte_vhost_extern_callback_register(vsession->vid, &g_spdk_extern_vhost_ops, NULL);
+ if (rc != 0) {
+ SPDK_ERRLOG("rte_vhost_extern_callback_register() failed for vid = %d\n",
+ vsession->vid);
+ return;
+ }
+}
+
+#else /* SPDK_CONFIG_VHOST_INTERNAL_LIB */
+
+void
+vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession)
+{
+ /* Nothing to do. All the changes are already incorporated into rte_vhost. */
+}
+
+#endif
+
+int
+vhost_register_unix_socket(const char *path, const char *ctrl_name,
+ uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features)
+{
+ struct stat file_stat;
+#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ uint64_t features = 0;
+#endif
+
+ /* Register vhost driver to handle vhost messages. */
+ if (stat(path, &file_stat) != -1) {
+ if (!S_ISSOCK(file_stat.st_mode)) {
+ SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
+ "The file already exists and is not a socket.\n",
+ path);
+ return -EIO;
+ } else if (unlink(path) != 0) {
+ SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": "
+ "The socket already exists and failed to unlink.\n",
+ path);
+ return -EIO;
+ }
+ }
+
+ if (rte_vhost_driver_register(path, 0) != 0) {
+ SPDK_ERRLOG("Could not register controller %s with vhost library\n", ctrl_name);
+ SPDK_ERRLOG("Check if domain socket %s already exists\n", path);
+ return -EIO;
+ }
+ if (rte_vhost_driver_set_features(path, virtio_features) ||
+ rte_vhost_driver_disable_features(path, disabled_features)) {
+ SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", ctrl_name);
+
+ rte_vhost_driver_unregister(path);
+ return -EIO;
+ }
+
+ if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) {
+ rte_vhost_driver_unregister(path);
+ SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", ctrl_name);
+ return -EIO;
+ }
+
+#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
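+ /* Fetch the protocol features rte_vhost enables by default and OR in the
+ * features requested by the caller before applying them.
+ */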
+ rte_vhost_driver_get_protocol_features(path, &features);
+ features |= protocol_features;
+ rte_vhost_driver_set_protocol_features(path, features);
+#endif
+
+ if (rte_vhost_driver_start(path) != 0) {
+ SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n",
+ ctrl_name, errno, spdk_strerror(errno));
+ rte_vhost_driver_unregister(path);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int
+vhost_get_mem_table(int vid, struct rte_vhost_memory **mem)
+{
+ return rte_vhost_get_mem_table(vid, mem);
+}
+
+int
+vhost_driver_unregister(const char *path)
+{
+ return rte_vhost_driver_unregister(path);
+}
+
+int
+vhost_get_negotiated_features(int vid, uint64_t *negotiated_features)
+{
+ return rte_vhost_get_negotiated_features(vid, negotiated_features);
+}
diff --git a/src/spdk/lib/vhost/spdk_vhost.map b/src/spdk/lib/vhost/spdk_vhost.map
new file mode 100644
index 000000000..de38e5a5e
--- /dev/null
+++ b/src/spdk/lib/vhost/spdk_vhost.map
@@ -0,0 +1,27 @@
+{
+ global:
+
+ # public functions
+ spdk_vhost_set_socket_path;
+ spdk_vhost_init;
+ spdk_vhost_fini;
+ spdk_vhost_config_json;
+ spdk_vhost_shutdown_cb;
+ spdk_vhost_lock;
+ spdk_vhost_trylock;
+ spdk_vhost_unlock;
+ spdk_vhost_dev_find;
+ spdk_vhost_dev_next;
+ spdk_vhost_dev_get_name;
+ spdk_vhost_dev_get_cpumask;
+ spdk_vhost_set_coalescing;
+ spdk_vhost_get_coalescing;
+ spdk_vhost_scsi_dev_construct;
+ spdk_vhost_scsi_dev_add_tgt;
+ spdk_vhost_scsi_dev_get_tgt;
+ spdk_vhost_scsi_dev_remove_tgt;
+ spdk_vhost_blk_construct;
+ spdk_vhost_dev_remove;
+
+ local: *;
+};
diff --git a/src/spdk/lib/vhost/vhost.c b/src/spdk/lib/vhost/vhost.c
new file mode 100644
index 000000000..b904d8bf9
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost.c
@@ -0,0 +1,1634 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/memory.h"
+#include "spdk/barrier.h"
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+
+static struct spdk_cpuset g_vhost_core_mask;
+
+/* Path to the directory where vhost domain sockets will be created. Can be set by the user. */
+static char dev_dirname[PATH_MAX] = "";
+
+/* Thread performing all vhost management operations */
+static struct spdk_thread *g_vhost_init_thread;
+
+static spdk_vhost_fini_cb g_fini_cpl_cb;
+
+/**
+ * DPDK calls our callbacks synchronously but the work those callbacks
+ * perform needs to be async. Luckily, all DPDK callbacks are called on
+ * a DPDK-internal pthread, so we'll just wait on a semaphore in there.
+ */
+static sem_t g_dpdk_sem;
+
+/** Return code for the current DPDK callback */
+static int g_dpdk_response;
+
+struct vhost_session_fn_ctx {
+ /** Device pointer obtained before enqueuing the event */
+ struct spdk_vhost_dev *vdev;
+
+ /** ID of the session to send event to. */
+ uint32_t vsession_id;
+
+ /** User provided function to be executed on session's thread. */
+ spdk_vhost_session_fn cb_fn;
+
+ /**
+ * User provided function to be called on the init thread
+ * after iterating through all sessions.
+ */
+ spdk_vhost_dev_fn cpl_fn;
+
+ /** Custom user context */
+ void *user_ctx;
+};
+
+static TAILQ_HEAD(, spdk_vhost_dev) g_vhost_devices = TAILQ_HEAD_INITIALIZER(
+ g_vhost_devices);
+static pthread_mutex_t g_vhost_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len)
+{
+ void *vva;
+ uint64_t newlen;
+
+ newlen = len;
+ vva = (void *)rte_vhost_va_from_guest_pa(vsession->mem, addr, &newlen);
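+ /* rte_vhost_va_from_guest_pa() shrinks newlen if the requested range is not
+ * contiguously mapped; treat such a partial translation as a failure.
+ */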
+ if (newlen != len) {
+ return NULL;
+ }
+
+ return vva;
+}
+
+static void
+vhost_log_req_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t req_id)
+{
+ struct vring_desc *desc, *desc_table;
+ uint32_t desc_table_size;
+ int rc;
+
+ if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+ return;
+ }
+
+ rc = vhost_vq_get_desc(vsession, virtqueue, req_id, &desc, &desc_table, &desc_table_size);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("Can't log used ring descriptors!\n");
+ return;
+ }
+
+ do {
+ if (vhost_vring_desc_is_wr(desc)) {
+ /* Strictly speaking, only the pages that were actually touched should be
+ * logged, but doing so would require tracking those changes in each backend.
+ * The backend will most likely touch all or most of those pages anyway, so
+ * for now assume we touched every page passed to us as a writable buffer. */
+ rte_vhost_log_write(vsession->vid, desc->addr, desc->len);
+ }
+ vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+ } while (desc);
+}
+
+static void
+vhost_log_used_vring_elem(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t idx)
+{
+ uint64_t offset, len;
+
+ if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+ return;
+ }
+
+ if (spdk_unlikely(virtqueue->packed.packed_ring)) {
+ offset = idx * sizeof(struct vring_packed_desc);
+ len = sizeof(struct vring_packed_desc);
+ } else {
+ offset = offsetof(struct vring_used, ring[idx]);
+ len = sizeof(virtqueue->vring.used->ring[idx]);
+ }
+
+ rte_vhost_log_used_vring(vsession->vid, virtqueue->vring_idx, offset, len);
+}
+
+static void
+vhost_log_used_vring_idx(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue)
+{
+ uint64_t offset, len;
+ uint16_t vq_idx;
+
+ if (spdk_likely(!vhost_dev_has_feature(vsession, VHOST_F_LOG_ALL))) {
+ return;
+ }
+
+ offset = offsetof(struct vring_used, idx);
+ len = sizeof(virtqueue->vring.used->idx);
+ vq_idx = virtqueue - vsession->virtqueue;
+
+ rte_vhost_log_used_vring(vsession->vid, vq_idx, offset, len);
+}
+
+/*
+ * Get available requests from avail ring.
+ */
+uint16_t
+vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs,
+ uint16_t reqs_len)
+{
+ struct rte_vhost_vring *vring = &virtqueue->vring;
+ struct vring_avail *avail = vring->avail;
+ uint16_t size_mask = vring->size - 1;
+ uint16_t last_idx = virtqueue->last_avail_idx, avail_idx = avail->idx;
+ uint16_t count, i;
+
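+ /* The 16-bit ring indices are free-running and may wrap; unsigned
+ * subtraction still yields the number of new available entries.
+ */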
+ count = avail_idx - last_idx;
+ if (spdk_likely(count == 0)) {
+ return 0;
+ }
+
+ if (spdk_unlikely(count > vring->size)) {
+ /* TODO: the queue is unrecoverably broken and should be marked so.
+ * For now we will fail silently and report there are no new avail entries.
+ */
+ return 0;
+ }
+
+ count = spdk_min(count, reqs_len);
+ virtqueue->last_avail_idx += count;
+ for (i = 0; i < count; i++) {
+ reqs[i] = vring->avail->ring[(last_idx + i) & size_mask];
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n",
+ last_idx, avail_idx, count);
+
+ return count;
+}
+
+static bool
+vhost_vring_desc_is_indirect(struct vring_desc *cur_desc)
+{
+ return !!(cur_desc->flags & VRING_DESC_F_INDIRECT);
+}
+
+static bool
+vhost_vring_packed_desc_is_indirect(struct vring_packed_desc *cur_desc)
+{
+ return (cur_desc->flags & VRING_DESC_F_INDIRECT) != 0;
+}
+
+int
+vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
+ uint32_t *desc_table_size)
+{
+ if (spdk_unlikely(req_idx >= virtqueue->vring.size)) {
+ return -1;
+ }
+
+ *desc = &virtqueue->vring.desc[req_idx];
+
+ if (vhost_vring_desc_is_indirect(*desc)) {
+ *desc_table_size = (*desc)->len / sizeof(**desc);
+ *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
+ sizeof(**desc) * *desc_table_size);
+ *desc = *desc_table;
+ if (*desc == NULL) {
+ return -1;
+ }
+
+ return 0;
+ }
+
+ *desc_table = virtqueue->vring.desc;
+ *desc_table_size = virtqueue->vring.size;
+
+ return 0;
+}
+
+int
+vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t req_idx, struct vring_packed_desc **desc,
+ struct vring_packed_desc **desc_table, uint32_t *desc_table_size)
+{
+ *desc = &virtqueue->vring.desc_packed[req_idx];
+
+ /* In a packed ring, when the descriptor is non-indirect the next descriptor
+ * is found by checking (desc->flags & VRING_DESC_F_NEXT) != 0. When it is
+ * indirect, the next descriptor is found by index and desc_table_size.
+ * This differs from the split ring.
+ */
+ if (vhost_vring_packed_desc_is_indirect(*desc)) {
+ *desc_table_size = (*desc)->len / sizeof(struct vring_packed_desc);
+ *desc_table = vhost_gpa_to_vva(vsession, (*desc)->addr,
+ (*desc)->len);
+ *desc = *desc_table;
+ if (spdk_unlikely(*desc == NULL)) {
+ return -1;
+ }
+ } else {
+ *desc_table = NULL;
+ *desc_table_size = 0;
+ }
+
+ return 0;
+}
+
+int
+vhost_vq_used_signal(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue)
+{
+ if (virtqueue->used_req_cnt == 0) {
+ return 0;
+ }
+
+ virtqueue->req_cnt += virtqueue->used_req_cnt;
+ virtqueue->used_req_cnt = 0;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n",
+ virtqueue - vsession->virtqueue, virtqueue->last_used_idx);
+
+ if (rte_vhost_vring_call(vsession->vid, virtqueue->vring_idx) == 0) {
+ /* interrupt signalled */
+ return 1;
+ } else {
+ /* interrupt not signalled */
+ return 0;
+ }
+}
+
+
+static void
+check_session_io_stats(struct spdk_vhost_session *vsession, uint64_t now)
+{
+ struct spdk_vhost_virtqueue *virtqueue;
+ uint32_t irq_delay_base = vsession->coalescing_delay_time_base;
+ uint32_t io_threshold = vsession->coalescing_io_rate_threshold;
+ int32_t irq_delay;
+ uint32_t req_cnt;
+ uint16_t q_idx;
+
+ if (now < vsession->next_stats_check_time) {
+ return;
+ }
+
+ vsession->next_stats_check_time = now + vsession->stats_check_interval;
+ for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+ virtqueue = &vsession->virtqueue[q_idx];
+
+ req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt;
+ if (req_cnt <= io_threshold) {
+ continue;
+ }
+
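+ /* Scale the interrupt delay in proportion to how far the request count
+ * for this interval exceeds the coalescing threshold.
+ */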
+ irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold;
+ virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay);
+
+ virtqueue->req_cnt = 0;
+ virtqueue->next_event_time = now;
+ }
+}
+
+static inline bool
+vhost_vq_event_is_suppressed(struct spdk_vhost_virtqueue *vq)
+{
+ if (spdk_unlikely(vq->packed.packed_ring)) {
+ if (vq->vring.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE) {
+ return true;
+ }
+ } else {
+ if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+vhost_session_used_signal(struct spdk_vhost_session *vsession)
+{
+ struct spdk_vhost_virtqueue *virtqueue;
+ uint64_t now;
+ uint16_t q_idx;
+
+ if (vsession->coalescing_delay_time_base == 0) {
+ for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+ virtqueue = &vsession->virtqueue[q_idx];
+
+ if (virtqueue->vring.desc == NULL) {
+ continue;
+ }
+
+ if (vhost_vq_event_is_suppressed(virtqueue)) {
+ continue;
+ }
+
+ vhost_vq_used_signal(vsession, virtqueue);
+ }
+ } else {
+ now = spdk_get_ticks();
+ check_session_io_stats(vsession, now);
+
+ for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+ virtqueue = &vsession->virtqueue[q_idx];
+
+ /* No need for event right now */
+ if (now < virtqueue->next_event_time) {
+ continue;
+ }
+
+ if (vhost_vq_event_is_suppressed(virtqueue)) {
+ continue;
+ }
+
+ if (!vhost_vq_used_signal(vsession, virtqueue)) {
+ continue;
+ }
+
+ /* The syscall can take a while, so refresh the current time. */
+ now = spdk_get_ticks();
+ virtqueue->next_event_time = now + virtqueue->irq_delay_time;
+ }
+ }
+}
+
+static int
+vhost_session_set_coalescing(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *ctx)
+{
+ vsession->coalescing_delay_time_base =
+ vdev->coalescing_delay_us * spdk_get_ticks_hz() / 1000000ULL;
+ vsession->coalescing_io_rate_threshold =
+ vdev->coalescing_iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
+ return 0;
+}
+
+static int
+vhost_dev_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
+ uint32_t iops_threshold)
+{
+ uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL;
+ uint32_t io_rate = iops_threshold * SPDK_VHOST_STATS_CHECK_INTERVAL_MS / 1000U;
+
+ if (delay_time_base >= UINT32_MAX) {
+ SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", delay_base_us);
+ return -EINVAL;
+ } else if (io_rate == 0) {
+ SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate,
+ 1000U / SPDK_VHOST_STATS_CHECK_INTERVAL_MS);
+ return -EINVAL;
+ }
+
+ vdev->coalescing_delay_us = delay_base_us;
+ vdev->coalescing_iops_threshold = iops_threshold;
+ return 0;
+}
+
+int
+spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us,
+ uint32_t iops_threshold)
+{
+ int rc;
+
+ rc = vhost_dev_set_coalescing(vdev, delay_base_us, iops_threshold);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vhost_dev_foreach_session(vdev, vhost_session_set_coalescing, NULL, NULL);
+ return 0;
+}
+
+void
+spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us,
+ uint32_t *iops_threshold)
+{
+ if (delay_base_us) {
+ *delay_base_us = vdev->coalescing_delay_us;
+ }
+
+ if (iops_threshold) {
+ *iops_threshold = vdev->coalescing_iops_threshold;
+ }
+}
+
+/*
+ * Enqueue id and len to used ring.
+ */
+void
+vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t id, uint32_t len)
+{
+ struct rte_vhost_vring *vring = &virtqueue->vring;
+ struct vring_used *used = vring->used;
+ uint16_t last_idx = virtqueue->last_used_idx & (vring->size - 1);
+ uint16_t vq_idx = virtqueue->vring_idx;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n",
+ virtqueue - vsession->virtqueue, virtqueue->last_used_idx, id, len);
+
+ vhost_log_req_desc(vsession, virtqueue, id);
+
+ virtqueue->last_used_idx++;
+ used->ring[last_idx].id = id;
+ used->ring[last_idx].len = len;
+
+ /* Ensure the used ring is updated before we log it or increment used->idx. */
+ spdk_smp_wmb();
+
+ rte_vhost_set_last_inflight_io_split(vsession->vid, vq_idx, id);
+
+ vhost_log_used_vring_elem(vsession, virtqueue, last_idx);
+ * (volatile uint16_t *) &used->idx = virtqueue->last_used_idx;
+ vhost_log_used_vring_idx(vsession, virtqueue);
+
+ rte_vhost_clr_inflight_desc_split(vsession->vid, vq_idx, virtqueue->last_used_idx, id);
+
+ virtqueue->used_req_cnt++;
+}
+
+void
+vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t num_descs, uint16_t buffer_id,
+ uint32_t length)
+{
+ struct vring_packed_desc *desc = &virtqueue->vring.desc_packed[virtqueue->last_used_idx];
+ bool used, avail;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING,
+ "Queue %td - RING: buffer_id=%"PRIu16"\n",
+ virtqueue - vsession->virtqueue, buffer_id);
+
+ /* When a descriptor has been used, its avail flag and used flag are set
+ * to the same value, and that value equals the used wrap counter.
+ */
+ used = !!(desc->flags & VRING_DESC_F_USED);
+ avail = !!(desc->flags & VRING_DESC_F_AVAIL);
+ if (spdk_unlikely(used == virtqueue->packed.used_phase && used == avail)) {
+ SPDK_ERRLOG("descriptor has been used before\n");
+ return;
+ }
+
+ /* In a used descriptor, addr is unused and len specifies the number of
+ * bytes written to the buffer by the device.
+ */
+ desc->addr = 0;
+ desc->len = length;
+
+ /* This bit specifies whether any data has been written by the device */
+ if (length != 0) {
+ desc->flags |= VRING_DESC_F_WRITE;
+ }
+
+ /* Buffer ID is included in the last descriptor in the list.
+ * The driver needs to keep track of the size of the list corresponding
+ * to each buffer ID.
+ */
+ desc->id = buffer_id;
+
+ /* A device MUST NOT make the descriptor used before buffer_id is
+ * written to the descriptor.
+ */
+ spdk_smp_wmb();
+ /* To mark a desc as used, the device sets the F_USED bit in flags to match
+ * the internal Device ring wrap counter. It also sets the F_AVAIL bit to
+ * match the same value.
+ */
+ if (virtqueue->packed.used_phase) {
+ desc->flags |= VRING_DESC_F_AVAIL_USED;
+ } else {
+ desc->flags &= ~VRING_DESC_F_AVAIL_USED;
+ }
+
+ vhost_log_used_vring_elem(vsession, virtqueue, virtqueue->last_used_idx);
+ virtqueue->last_used_idx += num_descs;
+ if (virtqueue->last_used_idx >= virtqueue->vring.size) {
+ virtqueue->last_used_idx -= virtqueue->vring.size;
+ virtqueue->packed.used_phase = !virtqueue->packed.used_phase;
+ }
+
+ virtqueue->used_req_cnt++;
+}
+
+bool
+vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue)
+{
+ uint16_t flags = virtqueue->vring.desc_packed[virtqueue->last_avail_idx].flags;
+
+ /* To mark a desc as available, the driver sets the F_AVAIL bit in flags
+ * to match the internal avail wrap counter. It also sets the F_USED bit to
+ * match the inverse value but it's not mandatory.
+ */
+ return (!!(flags & VRING_DESC_F_AVAIL) == virtqueue->packed.avail_phase);
+}
+
+bool
+vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc)
+{
+ return (cur_desc->flags & VRING_DESC_F_WRITE) != 0;
+}
+
+int
+vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx,
+ struct spdk_vhost_virtqueue *vq,
+ struct vring_packed_desc *desc_table,
+ uint32_t desc_table_size)
+{
+ if (desc_table != NULL) {
+ /* When desc_table isn't NULL the chain is indirect, and the next
+ * descriptor is found via req_idx and desc_table_size. *desc is set to
+ * NULL once the last descriptor of this request has been reached.
+ */
+ (*req_idx)++;
+ if (*req_idx < desc_table_size) {
+ *desc = &desc_table[*req_idx];
+ } else {
+ *desc = NULL;
+ }
+ } else {
+ /* When desc_table is NULL the chain is non-indirect, and the next
+ * descriptor is found via req_idx and the F_NEXT flag. *desc is set to
+ * NULL once the last descriptor of this request has been reached;
+ * otherwise req_idx is updated along with the returned descriptor.
+ */
+ if (((*desc)->flags & VRING_DESC_F_NEXT) == 0) {
+ *desc = NULL;
+ return 0;
+ }
+
+ *req_idx = (*req_idx + 1) % vq->vring.size;
+ *desc = &vq->vring.desc_packed[*req_idx];
+ }
+
+ return 0;
+}
+
+static int
+vhost_vring_desc_payload_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+ uint16_t *iov_index, uintptr_t payload, uint64_t remaining)
+{
+ uintptr_t vva;
+ uint64_t len;
+
+ do {
+ if (*iov_index >= SPDK_VHOST_IOVS_MAX) {
+ SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX);
+ return -1;
+ }
+ len = remaining;
+ vva = (uintptr_t)rte_vhost_va_from_guest_pa(vsession->mem, payload, &len);
+ if (vva == 0 || len == 0) {
+ SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload);
+ return -1;
+ }
+ iov[*iov_index].iov_base = (void *)vva;
+ iov[*iov_index].iov_len = len;
+ remaining -= len;
+ payload += len;
+ (*iov_index)++;
+ } while (remaining);
+
+ return 0;
+}
+
+int
+vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+ uint16_t *iov_index, const struct vring_packed_desc *desc)
+{
+ return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index,
+ desc->addr, desc->len);
+}
+
+/* 1. Traverse the descriptor chain to get the buffer_id and return it as the task index.
+ * 2. Update vq->last_avail_idx to point to the next available descriptor chain.
+ * 3. Toggle the avail_wrap_counter if last_avail_idx wraps around.
+ */
+uint16_t
+vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
+ uint16_t *num_descs)
+{
+ struct vring_packed_desc *desc;
+ uint16_t desc_head = req_idx;
+
+ *num_descs = 1;
+
+ desc = &vq->vring.desc_packed[req_idx];
+ if (!vhost_vring_packed_desc_is_indirect(desc)) {
+ while ((desc->flags & VRING_DESC_F_NEXT) != 0) {
+ req_idx = (req_idx + 1) % vq->vring.size;
+ desc = &vq->vring.desc_packed[req_idx];
+ (*num_descs)++;
+ }
+ }
+
+ /* The queue size doesn't have to be a power of 2. The device maintains
+ * last_avail_idx, so make sure the value stays within the valid range
+ * (0 to vring.size - 1).
+ */
+ vq->last_avail_idx = (req_idx + 1) % vq->vring.size;
+ if (vq->last_avail_idx < desc_head) {
+ vq->packed.avail_phase = !vq->packed.avail_phase;
+ }
+
+ return desc->id;
+}
+
+int
+vhost_vring_desc_get_next(struct vring_desc **desc,
+ struct vring_desc *desc_table, uint32_t desc_table_size)
+{
+ struct vring_desc *old_desc = *desc;
+ uint16_t next_idx;
+
+ if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) {
+ *desc = NULL;
+ return 0;
+ }
+
+ next_idx = old_desc->next;
+ if (spdk_unlikely(next_idx >= desc_table_size)) {
+ *desc = NULL;
+ return -1;
+ }
+
+ *desc = &desc_table[next_idx];
+ return 0;
+}
+
+int
+vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+ uint16_t *iov_index, const struct vring_desc *desc)
+{
+ return vhost_vring_desc_payload_to_iov(vsession, iov, iov_index,
+ desc->addr, desc->len);
+}
+
+static struct spdk_vhost_session *
+vhost_session_find_by_id(struct spdk_vhost_dev *vdev, unsigned id)
+{
+ struct spdk_vhost_session *vsession;
+
+ TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
+ if (vsession->id == id) {
+ return vsession;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_vhost_session *
+vhost_session_find_by_vid(int vid)
+{
+ struct spdk_vhost_dev *vdev;
+ struct spdk_vhost_session *vsession;
+
+ TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
+ TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
+ if (vsession->vid == vid) {
+ return vsession;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_vhost_dev *
+spdk_vhost_dev_next(struct spdk_vhost_dev *vdev)
+{
+ if (vdev == NULL) {
+ return TAILQ_FIRST(&g_vhost_devices);
+ }
+
+ return TAILQ_NEXT(vdev, tailq);
+}
+
+struct spdk_vhost_dev *
+spdk_vhost_dev_find(const char *ctrlr_name)
+{
+ struct spdk_vhost_dev *vdev;
+ size_t dev_dirname_len = strlen(dev_dirname);
+
+ if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) {
+ ctrlr_name += dev_dirname_len;
+ }
+
+ TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
+ if (strcmp(vdev->name, ctrlr_name) == 0) {
+ return vdev;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask)
+{
+ int rc;
+
+ if (cpumask == NULL) {
+ return -1;
+ }
+
+ if (mask == NULL) {
+ spdk_cpuset_copy(cpumask, &g_vhost_core_mask);
+ return 0;
+ }
+
+ rc = spdk_cpuset_parse(cpumask, mask);
+ if (rc < 0) {
+ SPDK_ERRLOG("invalid cpumask %s\n", mask);
+ return -1;
+ }
+
+ spdk_cpuset_and(cpumask, &g_vhost_core_mask);
+
+ if (spdk_cpuset_count(cpumask) == 0) {
+ SPDK_ERRLOG("no cpu is selected among core mask(=%s)\n",
+ spdk_cpuset_fmt(&g_vhost_core_mask));
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+vhost_setup_core_mask(void *ctx)
+{
+ struct spdk_thread *thread = spdk_get_thread();
+ spdk_cpuset_or(&g_vhost_core_mask, spdk_thread_get_cpumask(thread));
+}
+
+static void
+vhost_setup_core_mask_done(void *ctx)
+{
+ spdk_vhost_init_cb init_cb = ctx;
+
+ if (spdk_cpuset_count(&g_vhost_core_mask) == 0) {
+ init_cb(-ECHILD);
+ return;
+ }
+
+ init_cb(0);
+}
+
+static void
+vhost_dev_thread_exit(void *arg1)
+{
+ spdk_thread_exit(spdk_get_thread());
+}
+
+int
+vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
+ const struct spdk_vhost_dev_backend *backend)
+{
+ char path[PATH_MAX];
+ struct spdk_cpuset cpumask = {};
+ int rc;
+
+ assert(vdev);
+ if (name == NULL) {
+ SPDK_ERRLOG("Can't register controller with no name\n");
+ return -EINVAL;
+ }
+
+ if (vhost_parse_core_mask(mask_str, &cpumask) != 0) {
+ SPDK_ERRLOG("cpumask %s is invalid (core mask is 0x%s)\n",
+ mask_str, spdk_cpuset_fmt(&g_vhost_core_mask));
+ return -EINVAL;
+ }
+
+ if (spdk_vhost_dev_find(name)) {
+ SPDK_ERRLOG("vhost controller %s already exists.\n", name);
+ return -EEXIST;
+ }
+
+ if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) {
+ SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname,
+ name);
+ return -EINVAL;
+ }
+
+ vdev->name = strdup(name);
+ vdev->path = strdup(path);
+ if (vdev->name == NULL || vdev->path == NULL) {
+ rc = -EIO;
+ goto out;
+ }
+
+ vdev->thread = spdk_thread_create(vdev->name, &cpumask);
+ if (vdev->thread == NULL) {
+ SPDK_ERRLOG("Failed to create thread for vhost controller %s.\n", name);
+ rc = -EIO;
+ goto out;
+ }
+
+ vdev->registered = true;
+ vdev->backend = backend;
+ TAILQ_INIT(&vdev->vsessions);
+
+ vhost_dev_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US,
+ SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD);
+
+ if (vhost_register_unix_socket(path, name, vdev->virtio_features, vdev->disabled_features,
+ vdev->protocol_features)) {
+ spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
+ rc = -EIO;
+ goto out;
+ }
+
+ TAILQ_INSERT_TAIL(&g_vhost_devices, vdev, tailq);
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name);
+ return 0;
+
+out:
+ free(vdev->name);
+ free(vdev->path);
+ return rc;
+}
+
+int
+vhost_dev_unregister(struct spdk_vhost_dev *vdev)
+{
+ if (!TAILQ_EMPTY(&vdev->vsessions)) {
+ SPDK_ERRLOG("Controller %s has still valid connection.\n", vdev->name);
+ return -EBUSY;
+ }
+
+ if (vdev->registered && vhost_driver_unregister(vdev->path) != 0) {
+ SPDK_ERRLOG("Could not unregister controller %s with vhost library\n"
+ "Check if domain socket %s still exists\n",
+ vdev->name, vdev->path);
+ return -EIO;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name);
+
+ spdk_thread_send_msg(vdev->thread, vhost_dev_thread_exit, NULL);
+
+ free(vdev->name);
+ free(vdev->path);
+ TAILQ_REMOVE(&g_vhost_devices, vdev, tailq);
+ return 0;
+}
+
+const char *
+spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev)
+{
+ assert(vdev != NULL);
+ return vdev->name;
+}
+
+const struct spdk_cpuset *
+spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev)
+{
+ assert(vdev != NULL);
+ return spdk_thread_get_cpumask(vdev->thread);
+}
+
+static void
+wait_for_semaphore(int timeout_sec, const char *errmsg)
+{
+ struct timespec timeout;
+ int rc;
+
+ clock_gettime(CLOCK_REALTIME, &timeout);
+ timeout.tv_sec += timeout_sec;
+ rc = sem_timedwait(&g_dpdk_sem, &timeout);
+ if (rc != 0) {
+ SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg);
+ sem_wait(&g_dpdk_sem);
+ }
+}
+
+static void
+vhost_session_cb_done(int rc)
+{
+ g_dpdk_response = rc;
+ sem_post(&g_dpdk_sem);
+}
+
+void
+vhost_session_start_done(struct spdk_vhost_session *vsession, int response)
+{
+ if (response == 0) {
+ vsession->started = true;
+
+ assert(vsession->vdev->active_session_num < UINT32_MAX);
+ vsession->vdev->active_session_num++;
+ }
+
+ vhost_session_cb_done(response);
+}
+
+void
+vhost_session_stop_done(struct spdk_vhost_session *vsession, int response)
+{
+ if (response == 0) {
+ vsession->started = false;
+
+ assert(vsession->vdev->active_session_num > 0);
+ vsession->vdev->active_session_num--;
+ }
+
+ vhost_session_cb_done(response);
+}
+
+static void
+vhost_event_cb(void *arg1)
+{
+ struct vhost_session_fn_ctx *ctx = arg1;
+ struct spdk_vhost_session *vsession;
+
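+ /* Don't block this SPDK thread on the global lock; if it is contended,
+ * re-queue the message and retry later.
+ */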
+ if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
+ spdk_thread_send_msg(spdk_get_thread(), vhost_event_cb, arg1);
+ return;
+ }
+
+ vsession = vhost_session_find_by_id(ctx->vdev, ctx->vsession_id);
+ ctx->cb_fn(ctx->vdev, vsession, NULL);
+ pthread_mutex_unlock(&g_vhost_mutex);
+}
+
+int
+vhost_session_send_event(struct spdk_vhost_session *vsession,
+ spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
+ const char *errmsg)
+{
+ struct vhost_session_fn_ctx ev_ctx = {0};
+ struct spdk_vhost_dev *vdev = vsession->vdev;
+
+ ev_ctx.vdev = vdev;
+ ev_ctx.vsession_id = vsession->id;
+ ev_ctx.cb_fn = cb_fn;
+
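+ /* The callback runs on the device's SPDK thread and is expected to post
+ * g_dpdk_sem (e.g. via vhost_session_start_done()/stop_done()). Drop the
+ * global lock while waiting so that the callback can acquire it.
+ */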
+ spdk_thread_send_msg(vdev->thread, vhost_event_cb, &ev_ctx);
+
+ pthread_mutex_unlock(&g_vhost_mutex);
+ wait_for_semaphore(timeout_sec, errmsg);
+ pthread_mutex_lock(&g_vhost_mutex);
+
+ return g_dpdk_response;
+}
+
+static void
+foreach_session_finish_cb(void *arg1)
+{
+ struct vhost_session_fn_ctx *ev_ctx = arg1;
+ struct spdk_vhost_dev *vdev = ev_ctx->vdev;
+
+ if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
+ spdk_thread_send_msg(spdk_get_thread(),
+ foreach_session_finish_cb, arg1);
+ return;
+ }
+
+ assert(vdev->pending_async_op_num > 0);
+ vdev->pending_async_op_num--;
+ if (ev_ctx->cpl_fn != NULL) {
+ ev_ctx->cpl_fn(vdev, ev_ctx->user_ctx);
+ }
+
+ pthread_mutex_unlock(&g_vhost_mutex);
+ free(ev_ctx);
+}
+
+static void
+foreach_session(void *arg1)
+{
+ struct vhost_session_fn_ctx *ev_ctx = arg1;
+ struct spdk_vhost_session *vsession;
+ struct spdk_vhost_dev *vdev = ev_ctx->vdev;
+ int rc;
+
+ if (pthread_mutex_trylock(&g_vhost_mutex) != 0) {
+ spdk_thread_send_msg(spdk_get_thread(), foreach_session, arg1);
+ return;
+ }
+
+ TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
+ if (vsession->initialized) {
+ rc = ev_ctx->cb_fn(vdev, vsession, ev_ctx->user_ctx);
+ if (rc < 0) {
+ goto out;
+ }
+ }
+ }
+
+out:
+ pthread_mutex_unlock(&g_vhost_mutex);
+
+ spdk_thread_send_msg(g_vhost_init_thread, foreach_session_finish_cb, arg1);
+}
+
+void
+vhost_dev_foreach_session(struct spdk_vhost_dev *vdev,
+ spdk_vhost_session_fn fn,
+ spdk_vhost_dev_fn cpl_fn,
+ void *arg)
+{
+ struct vhost_session_fn_ctx *ev_ctx;
+
+ ev_ctx = calloc(1, sizeof(*ev_ctx));
+ if (ev_ctx == NULL) {
+ SPDK_ERRLOG("Failed to alloc vhost event.\n");
+ assert(false);
+ return;
+ }
+
+ ev_ctx->vdev = vdev;
+ ev_ctx->cb_fn = fn;
+ ev_ctx->cpl_fn = cpl_fn;
+ ev_ctx->user_ctx = arg;
+
+ assert(vdev->pending_async_op_num < UINT32_MAX);
+ vdev->pending_async_op_num++;
+
+ spdk_thread_send_msg(vdev->thread, foreach_session, ev_ctx);
+}
+
+static int
+_stop_session(struct spdk_vhost_session *vsession)
+{
+ struct spdk_vhost_dev *vdev = vsession->vdev;
+ struct spdk_vhost_virtqueue *q;
+ int rc;
+ uint16_t i;
+
+ rc = vdev->backend->stop_session(vsession);
+ if (rc != 0) {
+ SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vsession->vid);
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return rc;
+ }
+
+ for (i = 0; i < vsession->max_queues; i++) {
+ q = &vsession->virtqueue[i];
+
+ /* vring.desc and vring.desc_packed are in a union struct
+ * so q->vring.desc can replace q->vring.desc_packed.
+ */
+ if (q->vring.desc == NULL) {
+ continue;
+ }
+
+ /* Packed virtqueues support up to 2^15 entries each, so the top bit
+ * can be used as the wrap counter.
+ */
+ if (q->packed.packed_ring) {
+ q->last_avail_idx = q->last_avail_idx |
+ ((uint16_t)q->packed.avail_phase << 15);
+ q->last_used_idx = q->last_used_idx |
+ ((uint16_t)q->packed.used_phase << 15);
+ }
+
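+ /* Hand the final ring indices (with the wrap counters folded into bit 15
+ * for packed rings) back to rte_vhost, e.g. so they can be reported to the
+ * vhost-user master on GET_VRING_BASE.
+ */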
+ rte_vhost_set_vring_base(vsession->vid, i, q->last_avail_idx, q->last_used_idx);
+ }
+
+ vhost_session_mem_unregister(vsession->mem);
+ free(vsession->mem);
+
+ return 0;
+}
+
+int
+vhost_stop_device_cb(int vid)
+{
+ struct spdk_vhost_session *vsession;
+ int rc;
+
+ pthread_mutex_lock(&g_vhost_mutex);
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return -EINVAL;
+ }
+
+ if (!vsession->started) {
+ /* already stopped, nothing to do */
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return -EALREADY;
+ }
+
+ rc = _stop_session(vsession);
+ pthread_mutex_unlock(&g_vhost_mutex);
+
+ return rc;
+}
+
+int
+vhost_start_device_cb(int vid)
+{
+ struct spdk_vhost_dev *vdev;
+ struct spdk_vhost_session *vsession;
+ int rc = -1;
+ uint16_t i;
+ bool packed_ring;
+
+ pthread_mutex_lock(&g_vhost_mutex);
+
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+ goto out;
+ }
+
+ vdev = vsession->vdev;
+ if (vsession->started) {
+ /* already started, nothing to do */
+ rc = 0;
+ goto out;
+ }
+
+ if (vhost_get_negotiated_features(vid, &vsession->negotiated_features) != 0) {
+ SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid);
+ goto out;
+ }
+
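+ /* VIRTIO_F_RING_PACKED selects the packed ring layout; otherwise the
+ * legacy split ring layout is used.
+ */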
+ packed_ring = ((vsession->negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0);
+
+ vsession->max_queues = 0;
+ memset(vsession->virtqueue, 0, sizeof(vsession->virtqueue));
+ for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) {
+ struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
+
+ q->vring_idx = -1;
+ if (rte_vhost_get_vhost_vring(vid, i, &q->vring)) {
+ continue;
+ }
+ q->vring_idx = i;
+ rte_vhost_get_vhost_ring_inflight(vid, i, &q->vring_inflight);
+
+ /* vring.desc and vring.desc_packed are in a union struct
+ * so q->vring.desc can replace q->vring.desc_packed.
+ */
+ if (q->vring.desc == NULL || q->vring.size == 0) {
+ continue;
+ }
+
+ if (rte_vhost_get_vring_base(vsession->vid, i, &q->last_avail_idx, &q->last_used_idx)) {
+ q->vring.desc = NULL;
+ continue;
+ }
+
+ if (packed_ring) {
+ /* Packed virtqueues support up to 2^15 entries each, so the top bit
+ * can be used as the wrap counter.
+ */
+ q->packed.avail_phase = q->last_avail_idx >> 15;
+ q->last_avail_idx = q->last_avail_idx & 0x7FFF;
+ q->packed.used_phase = q->last_used_idx >> 15;
+ q->last_used_idx = q->last_used_idx & 0x7FFF;
+
+ /* Disable I/O submission notifications, we'll be polling. */
+ q->vring.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE;
+ } else {
+ /* Disable I/O submission notifications, we'll be polling. */
+ q->vring.used->flags = VRING_USED_F_NO_NOTIFY;
+ }
+
+ q->packed.packed_ring = packed_ring;
+ vsession->max_queues = i + 1;
+ }
+
+ if (vhost_get_mem_table(vid, &vsession->mem) != 0) {
+ SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid);
+ goto out;
+ }
+
+ /*
+ * We are not sure, but this looks like some kind of QEMU bug: guest I/O may
+ * freeze after live migration unless all queues are kicked here. It appears
+ * the previous vhost instance failed to deliver all interrupts before the
+ * GET_VRING_BASE message. This shouldn't harm the guest, since spurious
+ * interrupts should be ignored by the guest virtio driver.
+ *
+ * Tested on QEMU 2.10.91 and 2.11.50.
+ */
+ for (i = 0; i < vsession->max_queues; i++) {
+ struct spdk_vhost_virtqueue *q = &vsession->virtqueue[i];
+
+ /* vring.desc and vring.desc_packed are in a union struct
+ * so q->vring.desc can replace q->vring.desc_packed.
+ */
+ if (q->vring.desc != NULL && q->vring.size > 0) {
+ rte_vhost_vring_call(vsession->vid, q->vring_idx);
+ }
+ }
+
+ vhost_session_set_coalescing(vdev, vsession, NULL);
+ vhost_session_mem_register(vsession->mem);
+ vsession->initialized = true;
+ rc = vdev->backend->start_session(vsession);
+ if (rc != 0) {
+ vhost_session_mem_unregister(vsession->mem);
+ free(vsession->mem);
+ goto out;
+ }
+
+out:
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return rc;
+}
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+int
+vhost_get_config_cb(int vid, uint8_t *config, uint32_t len)
+{
+ struct spdk_vhost_session *vsession;
+ struct spdk_vhost_dev *vdev;
+ int rc = -1;
+
+ pthread_mutex_lock(&g_vhost_mutex);
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+ goto out;
+ }
+
+ vdev = vsession->vdev;
+ if (vdev->backend->vhost_get_config) {
+ rc = vdev->backend->vhost_get_config(vdev, config, len);
+ }
+
+out:
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return rc;
+}
+
+int
+vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags)
+{
+ struct spdk_vhost_session *vsession;
+ struct spdk_vhost_dev *vdev;
+ int rc = -1;
+
+ pthread_mutex_lock(&g_vhost_mutex);
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+ goto out;
+ }
+
+ vdev = vsession->vdev;
+ if (vdev->backend->vhost_set_config) {
+ rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags);
+ }
+
+out:
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return rc;
+}
+#endif
+
+int
+spdk_vhost_set_socket_path(const char *basename)
+{
+ int ret;
+
+ if (basename && strlen(basename) > 0) {
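+ /* Reserve two bytes so that a trailing '/' and the terminating NUL can
+ * be appended below if the path doesn't already end with one.
+ */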
+ ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename);
+ if (ret <= 0) {
+ return -EINVAL;
+ }
+ if ((size_t)ret >= sizeof(dev_dirname) - 2) {
+ SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret);
+ return -EINVAL;
+ }
+
+ if (dev_dirname[ret - 1] != '/') {
+ dev_dirname[ret] = '/';
+ dev_dirname[ret + 1] = '\0';
+ }
+ }
+
+ return 0;
+}
+
+void
+vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ assert(vdev->backend->dump_info_json != NULL);
+ vdev->backend->dump_info_json(vdev, w);
+}
+
+int
+spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev)
+{
+ if (vdev->pending_async_op_num) {
+ return -EBUSY;
+ }
+
+ return vdev->backend->remove_device(vdev);
+}
+
+int
+vhost_new_connection_cb(int vid, const char *ifname)
+{
+ struct spdk_vhost_dev *vdev;
+ struct spdk_vhost_session *vsession;
+
+ pthread_mutex_lock(&g_vhost_mutex);
+
+ vdev = spdk_vhost_dev_find(ifname);
+ if (vdev == NULL) {
+ SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid);
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return -1;
+ }
+
+ /* We expect sessions inside vdev->vsessions to be sorted in ascending
+ * order in regard of vsession->id. For now we always set id = vsessions_cnt++
+ * and append each session to the very end of the vsessions list.
+ * This is required for spdk_vhost_dev_foreach_session() to work.
+ */
+ if (vdev->vsessions_num == UINT_MAX) {
+ assert(false);
+ return -EINVAL;
+ }
+
+ if (posix_memalign((void **)&vsession, SPDK_CACHE_LINE_SIZE, sizeof(*vsession) +
+ vdev->backend->session_ctx_size)) {
+ SPDK_ERRLOG("vsession alloc failed\n");
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return -1;
+ }
+ memset(vsession, 0, sizeof(*vsession) + vdev->backend->session_ctx_size);
+
+ vsession->vdev = vdev;
+ vsession->vid = vid;
+ vsession->id = vdev->vsessions_num++;
+ vsession->name = spdk_sprintf_alloc("%ss%u", vdev->name, vsession->vid);
+ if (vsession->name == NULL) {
+ SPDK_ERRLOG("vsession alloc failed\n");
+ pthread_mutex_unlock(&g_vhost_mutex);
+ free(vsession);
+ return -1;
+ }
+ vsession->started = false;
+ vsession->initialized = false;
+ vsession->next_stats_check_time = 0;
+ vsession->stats_check_interval = SPDK_VHOST_STATS_CHECK_INTERVAL_MS *
+ spdk_get_ticks_hz() / 1000UL;
+ TAILQ_INSERT_TAIL(&vdev->vsessions, vsession, tailq);
+
+ vhost_session_install_rte_compat_hooks(vsession);
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return 0;
+}
+
+int
+vhost_destroy_connection_cb(int vid)
+{
+ struct spdk_vhost_session *vsession;
+ int rc = 0;
+
+ pthread_mutex_lock(&g_vhost_mutex);
+ vsession = vhost_session_find_by_vid(vid);
+ if (vsession == NULL) {
+ SPDK_ERRLOG("Couldn't find session with vid %d.\n", vid);
+ pthread_mutex_unlock(&g_vhost_mutex);
+ return -EINVAL;
+ }
+
+ if (vsession->started) {
+ rc = _stop_session(vsession);
+ }
+
+ TAILQ_REMOVE(&vsession->vdev->vsessions, vsession, tailq);
+ free(vsession->name);
+ free(vsession);
+ pthread_mutex_unlock(&g_vhost_mutex);
+
+ return rc;
+}
+
+void
+spdk_vhost_lock(void)
+{
+ pthread_mutex_lock(&g_vhost_mutex);
+}
+
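+/* Returns 0 on success or a negative errno value (e.g. -EBUSY) if the lock
+ * is already held.
+ */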
+int
+spdk_vhost_trylock(void)
+{
+ return -pthread_mutex_trylock(&g_vhost_mutex);
+}
+
+void
+spdk_vhost_unlock(void)
+{
+ pthread_mutex_unlock(&g_vhost_mutex);
+}
+
+void
+spdk_vhost_init(spdk_vhost_init_cb init_cb)
+{
+ size_t len;
+ int ret;
+
+ g_vhost_init_thread = spdk_get_thread();
+ assert(g_vhost_init_thread != NULL);
+
+ if (dev_dirname[0] == '\0') {
+ if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) {
+ SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno));
+ ret = -1;
+ goto out;
+ }
+
+ len = strlen(dev_dirname);
+ if (dev_dirname[len - 1] != '/') {
+ dev_dirname[len] = '/';
+ dev_dirname[len + 1] = '\0';
+ }
+ }
+
+ ret = sem_init(&g_dpdk_sem, 0, 0);
+ if (ret != 0) {
+ SPDK_ERRLOG("Failed to initialize semaphore for rte_vhost pthread.\n");
+ ret = -1;
+ goto out;
+ }
+
+ ret = vhost_scsi_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost controllers\n");
+ goto out;
+ }
+
+ ret = vhost_blk_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost block controllers\n");
+ goto out;
+ }
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ ret = vhost_nvme_controller_construct();
+ if (ret != 0) {
+ SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n");
+ goto out;
+ }
+#endif
+
+ spdk_cpuset_zero(&g_vhost_core_mask);
+
+ /* Iterate over the SPDK threads instead of using SPDK_ENV_FOREACH_CORE to
+ * ensure that the threads have really been created.
+ */
+ spdk_for_each_thread(vhost_setup_core_mask, init_cb, vhost_setup_core_mask_done);
+ return;
+out:
+ init_cb(ret);
+}
+
+static void
+vhost_fini(void *arg1)
+{
+ struct spdk_vhost_dev *vdev, *tmp;
+
+ spdk_vhost_lock();
+ vdev = spdk_vhost_dev_next(NULL);
+ while (vdev != NULL) {
+ tmp = spdk_vhost_dev_next(vdev);
+ spdk_vhost_dev_remove(vdev);
+ /* don't care if it fails, there's nothing we can do for now */
+ vdev = tmp;
+ }
+ spdk_vhost_unlock();
+
+ spdk_cpuset_zero(&g_vhost_core_mask);
+
+ /* All devices are removed now. */
+ sem_destroy(&g_dpdk_sem);
+
+ g_fini_cpl_cb();
+}
+
+static void *
+session_shutdown(void *arg)
+{
+ struct spdk_vhost_dev *vdev = NULL;
+
+ TAILQ_FOREACH(vdev, &g_vhost_devices, tailq) {
+ vhost_driver_unregister(vdev->path);
+ vdev->registered = false;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n");
+ spdk_thread_send_msg(g_vhost_init_thread, vhost_fini, NULL);
+ return NULL;
+}
+
+void
+spdk_vhost_fini(spdk_vhost_fini_cb fini_cb)
+{
+ pthread_t tid;
+ int rc;
+
+ assert(spdk_get_thread() == g_vhost_init_thread);
+ g_fini_cpl_cb = fini_cb;
+
+ /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK
+ * ops for stopping a device or removing a connection, we need to call it from
+ * a separate thread to avoid deadlock.
+ */
+ rc = pthread_create(&tid, NULL, &session_shutdown, NULL);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc));
+ abort();
+ }
+ pthread_detach(tid);
+}
+
+void
+spdk_vhost_config_json(struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_dev *vdev;
+ uint32_t delay_base_us;
+ uint32_t iops_threshold;
+
+ spdk_json_write_array_begin(w);
+
+ spdk_vhost_lock();
+ vdev = spdk_vhost_dev_next(NULL);
+ while (vdev != NULL) {
+ vdev->backend->write_config_json(vdev, w);
+
+ spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
+ if (delay_base_us) {
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "vhost_controller_set_coalescing");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", vdev->name);
+ spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
+ spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+ vdev = spdk_vhost_dev_next(vdev);
+ }
+ spdk_vhost_unlock();
+
+ spdk_json_write_array_end(w);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST)
+SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING)
diff --git a/src/spdk/lib/vhost/vhost_blk.c b/src/spdk/lib/vhost/vhost_blk.c
new file mode 100644
index 000000000..d387cb27d
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_blk.c
@@ -0,0 +1,1354 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/virtio_blk.h>
+
+#include "spdk/env.h"
+#include "spdk/bdev.h"
+#include "spdk/bdev_module.h"
+#include "spdk/conf.h"
+#include "spdk/thread.h"
+#include "spdk/likely.h"
+#include "spdk/string.h"
+#include "spdk/util.h"
+#include "spdk/vhost.h"
+
+#include "vhost_internal.h"
+#include <rte_version.h>
+
+/* Minimal set of features supported by every SPDK VHOST-BLK device */
+#define SPDK_VHOST_BLK_FEATURES_BASE (SPDK_VHOST_FEATURES | \
+ (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \
+ (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
+ (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \
+ (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
+ (1ULL << VIRTIO_BLK_F_MQ))
+
+/* Not supported features */
+#define SPDK_VHOST_BLK_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
+ (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \
+ (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI))
+
+/* Vhost-blk support protocol features */
+#ifndef SPDK_CONFIG_VHOST_INTERNAL_LIB
+#define SPDK_VHOST_BLK_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_CONFIG) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))
+#else
+#define SPDK_VHOST_BLK_PROTOCOL_FEATURES (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)
+#endif
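+
+/*
+ * VHOST_USER_PROTOCOL_F_CONFIG lets the master query the virtio-blk config space
+ * (served by vhost_blk_get_config() below), while VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD
+ * enables the shared inflight region replayed in submit_inflight_desc() so outstanding
+ * requests can be resubmitted after a reconnect.
+ */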
+
+struct spdk_vhost_blk_task {
+ struct spdk_bdev_io *bdev_io;
+ struct spdk_vhost_blk_session *bvsession;
+ struct spdk_vhost_virtqueue *vq;
+
+ volatile uint8_t *status;
+
+ uint16_t req_idx;
+ uint16_t num_descs;
+ uint16_t buffer_id;
+
+ /* for io wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+
+ /* If set, the task is currently used for I/O processing. */
+ bool used;
+
+ /** Number of bytes that were written. */
+ uint32_t used_len;
+ uint16_t iovcnt;
+ struct iovec iovs[SPDK_VHOST_IOVS_MAX];
+};
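+
+/*
+ * One task is pre-allocated per virtqueue slot (see alloc_task_pool()); blk_task_init()
+ * marks a task as in flight and blk_task_finish() releases it, so no allocations are
+ * needed on the I/O path.
+ */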
+
+struct spdk_vhost_blk_dev {
+ struct spdk_vhost_dev vdev;
+ struct spdk_bdev *bdev;
+ struct spdk_bdev_desc *bdev_desc;
+ /* dummy_io_channel is used to hold a bdev reference */
+ struct spdk_io_channel *dummy_io_channel;
+ bool readonly;
+};
+
+struct spdk_vhost_blk_session {
+ /* The parent session must be the very first field in this struct */
+ struct spdk_vhost_session vsession;
+ struct spdk_vhost_blk_dev *bvdev;
+ struct spdk_poller *requestq_poller;
+ struct spdk_io_channel *io_channel;
+ struct spdk_poller *stop_poller;
+};
+
+/* forward declaration */
+static const struct spdk_vhost_dev_backend vhost_blk_device_backend;
+
+static int
+process_blk_request(struct spdk_vhost_blk_task *task,
+ struct spdk_vhost_blk_session *bvsession,
+ struct spdk_vhost_virtqueue *vq);
+
+static void
+blk_task_finish(struct spdk_vhost_blk_task *task)
+{
+ assert(task->bvsession->vsession.task_cnt > 0);
+ task->bvsession->vsession.task_cnt--;
+ task->used = false;
+}
+
+static void
+blk_task_init(struct spdk_vhost_blk_task *task)
+{
+ task->used = true;
+ task->iovcnt = SPDK_COUNTOF(task->iovs);
+ task->status = NULL;
+ task->used_len = 0;
+}
+
+static void
+blk_task_enqueue(struct spdk_vhost_blk_task *task)
+{
+ if (task->vq->packed.packed_ring) {
+ vhost_vq_packed_ring_enqueue(&task->bvsession->vsession, task->vq,
+ task->num_descs,
+ task->buffer_id, task->used_len);
+ } else {
+ vhost_vq_used_ring_enqueue(&task->bvsession->vsession, task->vq,
+ task->req_idx, task->used_len);
+ }
+}
+
+static void
+invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status)
+{
+ if (task->status) {
+ *task->status = status;
+ }
+
+ blk_task_enqueue(task);
+ blk_task_finish(task);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status);
+}
+
+/*
+ * Process the task's descriptor chain and set up the data-related fields.
+ * The total size of the supplied buffers is returned through *length.
+ *
+ * FIXME: Make this function return rd_cnt and wr_cnt.
+ */
+static int
+blk_iovs_split_queue_setup(struct spdk_vhost_blk_session *bvsession,
+ struct spdk_vhost_virtqueue *vq,
+ uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
+{
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ struct spdk_vhost_dev *vdev = vsession->vdev;
+ struct vring_desc *desc, *desc_table;
+ uint16_t out_cnt = 0, cnt = 0;
+ uint32_t desc_table_size, len = 0;
+ uint32_t desc_handled_cnt;
+ int rc;
+
+ rc = vhost_vq_get_desc(vsession, vq, req_idx, &desc, &desc_table, &desc_table_size);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
+ return -1;
+ }
+
+ desc_handled_cnt = 0;
+ while (1) {
+ /*
+ * Maximum cnt reached?
+ * Should not happen if request is well formatted, otherwise this is a BUG.
+ */
+ if (spdk_unlikely(cnt == *iovs_cnt)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
+ vsession->name, req_idx);
+ return -1;
+ }
+
+ if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &cnt, desc))) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
+ vsession->name, req_idx, cnt);
+ return -1;
+ }
+
+ len += desc->len;
+
+ out_cnt += vhost_vring_desc_is_wr(desc);
+
+ rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: descriptor chain at index %"PRIu16" terminated unexpectedly.\n",
+ vsession->name, req_idx);
+ return -1;
+ } else if (desc == NULL) {
+ break;
+ }
+
+ desc_handled_cnt++;
+ if (spdk_unlikely(desc_handled_cnt > desc_table_size)) {
+ /* Break a cycle and report an error, if any. */
+ SPDK_ERRLOG("%s: found a cycle in the descriptor chain: desc_table_size = %d, desc_handled_cnt = %d.\n",
+ vsession->name, desc_table_size, desc_handled_cnt);
+ return -1;
+ }
+ }
+
+	/*
+	 * There must be at least two descriptors: the first contains the request,
+	 * so it must be readable, and the last contains the buffer for the
+	 * response, so it must be writable.
+	 */
+ if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
+ return -1;
+ }
+
+ *length = len;
+ *iovs_cnt = cnt;
+ return 0;
+}
+
+static int
+blk_iovs_packed_queue_setup(struct spdk_vhost_blk_session *bvsession,
+ struct spdk_vhost_virtqueue *vq,
+ uint16_t req_idx, struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length)
+{
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ struct spdk_vhost_dev *vdev = vsession->vdev;
+ struct vring_packed_desc *desc = NULL, *desc_table;
+ uint16_t out_cnt = 0, cnt = 0;
+ uint32_t desc_table_size, len = 0;
+ int rc = 0;
+
+ rc = vhost_vq_get_desc_packed(vsession, vq, req_idx, &desc,
+ &desc_table, &desc_table_size);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx);
+ return rc;
+ }
+
+ if (desc_table != NULL) {
+ req_idx = 0;
+ }
+
+ while (1) {
+ /*
+ * Maximum cnt reached?
+ * Should not happen if request is well formatted, otherwise this is a BUG.
+ */
+ if (spdk_unlikely(cnt == *iovs_cnt)) {
+ SPDK_ERRLOG("%s: max IOVs in request reached (req_idx = %"PRIu16").\n",
+ vsession->name, req_idx);
+ return -EINVAL;
+ }
+
+ if (spdk_unlikely(vhost_vring_packed_desc_to_iov(vsession, iovs, &cnt, desc))) {
+ SPDK_ERRLOG("%s: invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n",
+ vsession->name, req_idx, cnt);
+ return -EINVAL;
+ }
+
+ len += desc->len;
+ out_cnt += vhost_vring_packed_desc_is_wr(desc);
+
+		/* A NULL desc means we have reached the last descriptor of this request. */
+ vhost_vring_packed_desc_get_next(&desc, &req_idx, vq, desc_table, desc_table_size);
+ if (desc == NULL) {
+ break;
+ }
+ }
+
+	/*
+	 * There must be at least two descriptors: the first contains the request,
+	 * so it must be readable, and the last contains the buffer for the
+	 * response, so it must be writable.
+	 */
+ if (spdk_unlikely(out_cnt == 0 || cnt < 2)) {
+ return -EINVAL;
+ }
+
+ *length = len;
+ *iovs_cnt = cnt;
+
+ return 0;
+}
+
+static void
+blk_request_finish(bool success, struct spdk_vhost_blk_task *task)
+{
+ *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
+
+ blk_task_enqueue(task);
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task,
+ task->req_idx, success ? "OK" : "FAIL");
+ blk_task_finish(task);
+}
+
+static void
+blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_vhost_blk_task *task = cb_arg;
+
+ spdk_bdev_free_io(bdev_io);
+ blk_request_finish(success, task);
+}
+
+static void
+blk_request_resubmit(void *arg)
+{
+ struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg;
+ int rc = 0;
+
+ blk_task_init(task);
+
+ rc = process_blk_request(task, task->bvsession, task->vq);
+ if (rc == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task);
+ }
+}
+
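+/*
+ * -ENOMEM handling: blk_request_queue_io() parks the task on the bdev's I/O wait queue
+ * via spdk_bdev_queue_io_wait(), and blk_request_resubmit() re-runs the request once
+ * the bdev layer has capacity again.
+ */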
+static inline void
+blk_request_queue_io(struct spdk_vhost_blk_task *task)
+{
+ int rc;
+ struct spdk_vhost_blk_session *bvsession = task->bvsession;
+ struct spdk_bdev *bdev = bvsession->bvdev->bdev;
+
+ task->bdev_io_wait.bdev = bdev;
+ task->bdev_io_wait.cb_fn = blk_request_resubmit;
+ task->bdev_io_wait.cb_arg = task;
+
+ rc = spdk_bdev_queue_io_wait(bdev, bvsession->io_channel, &task->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bvsession->vsession.name, rc);
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ }
+}
+
+static int
+process_blk_request(struct spdk_vhost_blk_task *task,
+ struct spdk_vhost_blk_session *bvsession,
+ struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_blk_dev *bvdev = bvsession->bvdev;
+ const struct virtio_blk_outhdr *req;
+ struct virtio_blk_discard_write_zeroes *desc;
+ struct iovec *iov;
+ uint32_t type;
+ uint32_t payload_len;
+ uint64_t flush_bytes;
+ int rc;
+
+ if (vq->packed.packed_ring) {
+ rc = blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
+ &payload_len);
+ } else {
+ rc = blk_iovs_split_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
+ &payload_len);
+ }
+
+ if (rc) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx);
+ /* Only READ and WRITE are supported for now. */
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ iov = &task->iovs[0];
+ if (spdk_unlikely(iov->iov_len != sizeof(*req))) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
+ "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n",
+ iov->iov_len, sizeof(*req), task->req_idx);
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ req = iov->iov_base;
+
+ iov = &task->iovs[task->iovcnt - 1];
+ if (spdk_unlikely(iov->iov_len != 1)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK,
+ "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n",
+ iov->iov_len, 1, task->req_idx);
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ task->status = iov->iov_base;
+ payload_len -= sizeof(*req) + sizeof(*task->status);
+ task->iovcnt -= 2;
+
+ type = req->type;
+#ifdef VIRTIO_BLK_T_BARRIER
+	/* Don't care about the barrier for now (as QEMU's virtio-blk does). */
+ type &= ~VIRTIO_BLK_T_BARRIER;
+#endif
+
+ switch (type) {
+ case VIRTIO_BLK_T_IN:
+ case VIRTIO_BLK_T_OUT:
+ if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) {
+ SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n",
+ type ? "WRITE" : "READ", task->req_idx);
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ if (type == VIRTIO_BLK_T_IN) {
+ task->used_len = payload_len + sizeof(*task->status);
+ rc = spdk_bdev_readv(bvdev->bdev_desc, bvsession->io_channel,
+ &task->iovs[1], task->iovcnt, req->sector * 512,
+ payload_len, blk_request_complete_cb, task);
+ } else if (!bvdev->readonly) {
+ task->used_len = sizeof(*task->status);
+ rc = spdk_bdev_writev(bvdev->bdev_desc, bvsession->io_channel,
+ &task->iovs[1], task->iovcnt, req->sector * 512,
+ payload_len, blk_request_complete_cb, task);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n");
+ rc = -1;
+ }
+
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+ blk_request_queue_io(task);
+ } else {
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ return -1;
+ }
+ }
+ break;
+ case VIRTIO_BLK_T_DISCARD:
+ desc = task->iovs[1].iov_base;
+ if (payload_len != sizeof(*desc)) {
+ SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len);
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ return -1;
+ }
+
+ rc = spdk_bdev_unmap(bvdev->bdev_desc, bvsession->io_channel,
+ desc->sector * 512, desc->num_sectors * 512,
+ blk_request_complete_cb, task);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+ blk_request_queue_io(task);
+ } else {
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ return -1;
+ }
+ }
+ break;
+ case VIRTIO_BLK_T_WRITE_ZEROES:
+ desc = task->iovs[1].iov_base;
+ if (payload_len != sizeof(*desc)) {
+ SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len);
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ return -1;
+ }
+
+		/* Write Zeroes with the Unmap flag (zero and deallocate the range) is not supported by SPDK. */
+ if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+ SPDK_NOTICELOG("Can't support Write Zeroes with Unmap flag\n");
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ rc = spdk_bdev_write_zeroes(bvdev->bdev_desc, bvsession->io_channel,
+ desc->sector * 512, desc->num_sectors * 512,
+ blk_request_complete_cb, task);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+ blk_request_queue_io(task);
+ } else {
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ return -1;
+ }
+ }
+ break;
+ case VIRTIO_BLK_T_FLUSH:
+ flush_bytes = spdk_bdev_get_num_blocks(bvdev->bdev) * spdk_bdev_get_block_size(bvdev->bdev);
+ if (req->sector != 0) {
+ SPDK_NOTICELOG("sector must be zero for flush command\n");
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ return -1;
+ }
+ rc = spdk_bdev_flush(bvdev->bdev_desc, bvsession->io_channel,
+ 0, flush_bytes,
+ blk_request_complete_cb, task);
+ if (rc) {
+ if (rc == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n");
+ blk_request_queue_io(task);
+ } else {
+ invalid_blk_request(task, VIRTIO_BLK_S_IOERR);
+ return -1;
+ }
+ }
+ break;
+ case VIRTIO_BLK_T_GET_ID:
+ if (!task->iovcnt || !payload_len) {
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+ task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len);
+ spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev),
+ task->used_len, ' ');
+ blk_request_finish(true, task);
+ break;
+ default:
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type);
+ invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+process_blk_task(struct spdk_vhost_virtqueue *vq, uint16_t req_idx)
+{
+ struct spdk_vhost_blk_task *task;
+ uint16_t task_idx = req_idx, num_descs;
+
+ if (vq->packed.packed_ring) {
+		/* The packed ring uses the buffer_id as the task_idx to look up the task struct.
+		 * The kernel driver derives the buffer_id from vq->free_head, so the value is
+		 * always in the range 0 ~ vring.size and is unique among outstanding requests.
+		 * We can't use req_idx as the task_idx because a descriptor slot can be reused
+		 * in the next phase even if the request from the previous phase has not completed.
+		 * For example, at phase 0 last_used_idx was 2 and desc0 was still outstanding;
+		 * after moving to phase 1, last_avail_idx is updated to 1, so req_idx would point
+		 * at task[0], which is still marked as used.
+		 * The split ring is different: a descriptor is returned to the free list only when
+		 * the device completes the request, and the driver takes descriptors from that
+		 * free list, which guarantees that req_idx is unique among outstanding requests.
+		 */
+ task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
+ }
+
+ task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
+ if (spdk_unlikely(task->used)) {
+ SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
+ task->bvsession->vsession.name, task_idx);
+ task->used_len = 0;
+ blk_task_enqueue(task);
+ return;
+ }
+
+ if (vq->packed.packed_ring) {
+ task->req_idx = req_idx;
+ task->num_descs = num_descs;
+ task->buffer_id = task_idx;
+ }
+
+ task->bvsession->vsession.task_cnt++;
+
+ blk_task_init(task);
+
+ if (process_blk_request(task, task->bvsession, vq) == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task,
+ task_idx);
+ } else {
+ SPDK_ERRLOG("====== Task %p req_idx %d failed ======\n", task, task_idx);
+ }
+}
+
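+/*
+ * process_vq() records each split-ring request as inflight via
+ * rte_vhost_set_inflight_desc_split() before handling it; when a session is
+ * re-established, rte_vhost exposes the still-inflight descriptors through
+ * vq->vring_inflight and they are resubmitted here.
+ */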
+static void
+submit_inflight_desc(struct spdk_vhost_blk_session *bvsession,
+ struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ spdk_vhost_resubmit_info *resubmit = vq->vring_inflight.resubmit_inflight;
+ spdk_vhost_resubmit_desc *resubmit_list;
+ uint16_t req_idx;
+
+ if (spdk_likely(resubmit == NULL || resubmit->resubmit_list == NULL)) {
+ return;
+ }
+
+ resubmit_list = resubmit->resubmit_list;
+ while (resubmit->resubmit_num-- > 0) {
+ req_idx = resubmit_list[resubmit->resubmit_num].index;
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Start processing request idx %"PRIu16"======\n",
+ req_idx);
+
+ if (spdk_unlikely(req_idx >= vq->vring.size)) {
+ SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
+ vsession->name, req_idx, vq->vring.size);
+ vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
+ continue;
+ }
+
+ process_blk_task(vq, req_idx);
+ }
+
+ free(resubmit_list);
+ resubmit->resubmit_list = NULL;
+}
+
+static void
+process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ uint16_t reqs[SPDK_VHOST_VQ_MAX_SUBMISSIONS];
+ uint16_t reqs_cnt, i;
+
+ submit_inflight_desc(bvsession, vq);
+
+ reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
+ if (!reqs_cnt) {
+ return;
+ }
+
+ for (i = 0; i < reqs_cnt; i++) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
+ reqs[i]);
+
+ if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
+ SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
+ vsession->name, reqs[i], vq->vring.size);
+ vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
+ continue;
+ }
+
+ rte_vhost_set_inflight_desc_split(vsession->vid, vq->vring_idx, reqs[i]);
+
+ process_blk_task(vq, reqs[i]);
+ }
+}
+
+static void
+process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
+{
+ uint16_t i = 0;
+
+ while (i++ < SPDK_VHOST_VQ_MAX_SUBMISSIONS &&
+ vhost_vq_packed_ring_is_avail(vq)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n",
+ vq->last_avail_idx);
+
+ process_blk_task(vq, vq->last_avail_idx);
+ }
+}
+
+static int
+vdev_worker(void *arg)
+{
+ struct spdk_vhost_blk_session *bvsession = arg;
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+
+ uint16_t q_idx;
+ bool packed_ring;
+
+ /* In a session, every vq supports the same format */
+ packed_ring = vsession->virtqueue[0].packed.packed_ring;
+ for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+ if (packed_ring) {
+ process_packed_vq(bvsession, &vsession->virtqueue[q_idx]);
+ } else {
+ process_vq(bvsession, &vsession->virtqueue[q_idx]);
+ }
+ }
+
+ vhost_session_used_signal(vsession);
+
+ return SPDK_POLLER_BUSY;
+}
+
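+/*
+ * The no_bdev_* pollers below take over after the backing bdev is hot-removed: each
+ * incoming request is completed immediately with VIRTIO_BLK_S_IOERR so the guest is
+ * not left with outstanding descriptors.
+ */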
+static void
+no_bdev_process_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ struct iovec iovs[SPDK_VHOST_IOVS_MAX];
+ uint32_t length;
+ uint16_t iovcnt, req_idx;
+
+ if (vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) {
+ return;
+ }
+
+ iovcnt = SPDK_COUNTOF(iovs);
+ if (blk_iovs_split_queue_setup(bvsession, vq, req_idx, iovs, &iovcnt, &length) == 0) {
+ *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR;
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
+ }
+
+ vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
+}
+
+static void
+no_bdev_process_packed_vq(struct spdk_vhost_blk_session *bvsession, struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ struct spdk_vhost_blk_task *task;
+ uint32_t length;
+ uint16_t req_idx = vq->last_avail_idx;
+ uint16_t task_idx, num_descs;
+
+ if (!vhost_vq_packed_ring_is_avail(vq)) {
+ return;
+ }
+
+ task_idx = vhost_vring_packed_desc_get_buffer_id(vq, req_idx, &num_descs);
+ task = &((struct spdk_vhost_blk_task *)vq->tasks)[task_idx];
+ if (spdk_unlikely(task->used)) {
+ SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
+ vsession->name, req_idx);
+ vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
+ task->buffer_id, task->used_len);
+ return;
+ }
+
+ task->req_idx = req_idx;
+ task->num_descs = num_descs;
+ task->buffer_id = task_idx;
+ blk_task_init(task);
+
+ if (blk_iovs_packed_queue_setup(bvsession, vq, task->req_idx, task->iovs, &task->iovcnt,
+ &length)) {
+ *(volatile uint8_t *)(task->iovs[task->iovcnt - 1].iov_base) = VIRTIO_BLK_S_IOERR;
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx);
+ }
+
+ task->used = false;
+ vhost_vq_packed_ring_enqueue(vsession, vq, num_descs,
+ task->buffer_id, task->used_len);
+}
+
+static int
+no_bdev_vdev_worker(void *arg)
+{
+ struct spdk_vhost_blk_session *bvsession = arg;
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ uint16_t q_idx;
+ bool packed_ring;
+
+ /* In a session, every vq supports the same format */
+ packed_ring = vsession->virtqueue[0].packed.packed_ring;
+ for (q_idx = 0; q_idx < vsession->max_queues; q_idx++) {
+ if (packed_ring) {
+ no_bdev_process_packed_vq(bvsession, &vsession->virtqueue[q_idx]);
+ } else {
+ no_bdev_process_vq(bvsession, &vsession->virtqueue[q_idx]);
+ }
+ }
+
+ vhost_session_used_signal(vsession);
+
+ if (vsession->task_cnt == 0 && bvsession->io_channel) {
+ spdk_put_io_channel(bvsession->io_channel);
+ bvsession->io_channel = NULL;
+ }
+
+ return SPDK_POLLER_BUSY;
+}
+
+static struct spdk_vhost_blk_session *
+to_blk_session(struct spdk_vhost_session *vsession)
+{
+ assert(vsession->vdev->backend == &vhost_blk_device_backend);
+ return (struct spdk_vhost_blk_session *)vsession;
+}
+
+static struct spdk_vhost_blk_dev *
+to_blk_dev(struct spdk_vhost_dev *vdev)
+{
+ if (vdev == NULL) {
+ return NULL;
+ }
+
+ if (vdev->backend != &vhost_blk_device_backend) {
+ SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name);
+ return NULL;
+ }
+
+ return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev);
+}
+
+static int
+vhost_session_bdev_resize_cb(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession,
+ void *ctx)
+{
+#if RTE_VERSION >= RTE_VERSION_NUM(20, 02, 0, 0)
+ SPDK_NOTICELOG("bdev send slave msg to vid(%d)\n", vsession->vid);
+ rte_vhost_slave_config_change(vsession->vid, false);
+#else
+ SPDK_NOTICELOG("bdev does not support resize until DPDK submodule version >= 20.02\n");
+#endif
+
+ return 0;
+}
+
+static void
+blk_resize_cb(void *resize_ctx)
+{
+ struct spdk_vhost_blk_dev *bvdev = resize_ctx;
+
+ spdk_vhost_lock();
+ vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_resize_cb,
+ NULL, NULL);
+ spdk_vhost_unlock();
+}
+
+static void
+vhost_dev_bdev_remove_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
+{
+
+ /* All sessions have been notified, time to close the bdev */
+ struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
+
+ assert(bvdev != NULL);
+ spdk_put_io_channel(bvdev->dummy_io_channel);
+ spdk_bdev_close(bvdev->bdev_desc);
+ bvdev->bdev_desc = NULL;
+ bvdev->bdev = NULL;
+}
+
+static int
+vhost_session_bdev_remove_cb(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession,
+ void *ctx)
+{
+ struct spdk_vhost_blk_session *bvsession;
+
+ bvsession = (struct spdk_vhost_blk_session *)vsession;
+ if (bvsession->requestq_poller) {
+ spdk_poller_unregister(&bvsession->requestq_poller);
+ bvsession->requestq_poller = SPDK_POLLER_REGISTER(no_bdev_vdev_worker, bvsession, 0);
+ }
+
+ return 0;
+}
+
+static void
+bdev_remove_cb(void *remove_ctx)
+{
+ struct spdk_vhost_blk_dev *bvdev = remove_ctx;
+
+ SPDK_WARNLOG("%s: hot-removing bdev - all further requests will fail.\n",
+ bvdev->vdev.name);
+
+ spdk_vhost_lock();
+ vhost_dev_foreach_session(&bvdev->vdev, vhost_session_bdev_remove_cb,
+ vhost_dev_bdev_remove_cpl_cb, NULL);
+ spdk_vhost_unlock();
+}
+
+static void
+bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
+ void *event_ctx)
+{
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Bdev event: type %d, name %s\n",
+ type,
+ bdev->name);
+
+ switch (type) {
+ case SPDK_BDEV_EVENT_REMOVE:
+ SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name);
+ bdev_remove_cb(event_ctx);
+ break;
+ case SPDK_BDEV_EVENT_RESIZE:
+ SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name);
+ blk_resize_cb(event_ctx);
+ break;
+ default:
+ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
+ break;
+ }
+}
+
+static void
+free_task_pool(struct spdk_vhost_blk_session *bvsession)
+{
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ struct spdk_vhost_virtqueue *vq;
+ uint16_t i;
+
+ for (i = 0; i < vsession->max_queues; i++) {
+ vq = &vsession->virtqueue[i];
+ if (vq->tasks == NULL) {
+ continue;
+ }
+
+ spdk_free(vq->tasks);
+ vq->tasks = NULL;
+ }
+}
+
+static int
+alloc_task_pool(struct spdk_vhost_blk_session *bvsession)
+{
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ struct spdk_vhost_virtqueue *vq;
+ struct spdk_vhost_blk_task *task;
+ uint32_t task_cnt;
+ uint16_t i;
+ uint32_t j;
+
+ for (i = 0; i < vsession->max_queues; i++) {
+ vq = &vsession->virtqueue[i];
+ if (vq->vring.desc == NULL) {
+ continue;
+ }
+
+ task_cnt = vq->vring.size;
+ if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
+ /* sanity check */
+ SPDK_ERRLOG("%s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
+ vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
+ free_task_pool(bvsession);
+ return -1;
+ }
+ vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt,
+ SPDK_CACHE_LINE_SIZE, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (vq->tasks == NULL) {
+ SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
+ vsession->name, task_cnt, i);
+ free_task_pool(bvsession);
+ return -1;
+ }
+
+ for (j = 0; j < task_cnt; j++) {
+ task = &((struct spdk_vhost_blk_task *)vq->tasks)[j];
+ task->bvsession = bvsession;
+ task->req_idx = j;
+ task->vq = vq;
+ }
+ }
+
+ return 0;
+}
+
+static int
+vhost_blk_start_cb(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *unused)
+{
+ struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
+ struct spdk_vhost_blk_dev *bvdev;
+ int i, rc = 0;
+
+ bvdev = to_blk_dev(vdev);
+ assert(bvdev != NULL);
+ bvsession->bvdev = bvdev;
+
+ /* validate all I/O queues are in a contiguous index range */
+ for (i = 0; i < vsession->max_queues; i++) {
+		/* vring.desc and vring.desc_packed are members of the same union,
+		 * so checking vring.desc also covers vring.desc_packed.
+		 */
+ if (vsession->virtqueue[i].vring.desc == NULL) {
+ SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
+ rc = -1;
+ goto out;
+ }
+ }
+
+ rc = alloc_task_pool(bvsession);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
+ goto out;
+ }
+
+ if (bvdev->bdev) {
+ bvsession->io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
+ if (!bvsession->io_channel) {
+ free_task_pool(bvsession);
+ SPDK_ERRLOG("%s: I/O channel allocation failed\n", vsession->name);
+ rc = -1;
+ goto out;
+ }
+ }
+
+ bvsession->requestq_poller = SPDK_POLLER_REGISTER(bvdev->bdev ? vdev_worker : no_bdev_vdev_worker,
+ bvsession, 0);
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
+ vsession->name, spdk_env_get_current_core());
+out:
+ vhost_session_start_done(vsession, rc);
+ return rc;
+}
+
+static int
+vhost_blk_start(struct spdk_vhost_session *vsession)
+{
+ return vhost_session_send_event(vsession, vhost_blk_start_cb,
+ 3, "start session");
+}
+
+static int
+destroy_session_poller_cb(void *arg)
+{
+ struct spdk_vhost_blk_session *bvsession = arg;
+ struct spdk_vhost_session *vsession = &bvsession->vsession;
+ int i;
+
+ if (vsession->task_cnt > 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ if (spdk_vhost_trylock() != 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ for (i = 0; i < vsession->max_queues; i++) {
+ vsession->virtqueue[i].next_event_time = 0;
+ vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
+ vsession->name, spdk_env_get_current_core());
+
+ if (bvsession->io_channel) {
+ spdk_put_io_channel(bvsession->io_channel);
+ bvsession->io_channel = NULL;
+ }
+
+ free_task_pool(bvsession);
+ spdk_poller_unregister(&bvsession->stop_poller);
+ vhost_session_stop_done(vsession, 0);
+
+ spdk_vhost_unlock();
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+vhost_blk_stop_cb(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *unused)
+{
+ struct spdk_vhost_blk_session *bvsession = to_blk_session(vsession);
+
+ spdk_poller_unregister(&bvsession->requestq_poller);
+ bvsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
+ bvsession, 1000);
+ return 0;
+}
+
+static int
+vhost_blk_stop(struct spdk_vhost_session *vsession)
+{
+ return vhost_session_send_event(vsession, vhost_blk_stop_cb,
+ 3, "stop session");
+}
+
+static void
+vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_blk_dev *bvdev;
+
+ bvdev = to_blk_dev(vdev);
+ assert(bvdev != NULL);
+
+ spdk_json_write_named_object_begin(w, "block");
+
+ spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
+
+ spdk_json_write_name(w, "bdev");
+ if (bvdev->bdev) {
+ spdk_json_write_string(w, spdk_bdev_get_name(bvdev->bdev));
+ } else {
+ spdk_json_write_null(w);
+ }
+
+ spdk_json_write_object_end(w);
+}
+
+static void
+vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_blk_dev *bvdev;
+
+ bvdev = to_blk_dev(vdev);
+ assert(bvdev != NULL);
+
+ if (!bvdev->bdev) {
+ return;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "vhost_create_blk_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", vdev->name);
+ spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev));
+ spdk_json_write_named_string(w, "cpumask",
+ spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
+ spdk_json_write_named_bool(w, "readonly", bvdev->readonly);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static int vhost_blk_destroy(struct spdk_vhost_dev *dev);
+
+static int
+vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config,
+ uint32_t len)
+{
+ struct virtio_blk_config blkcfg;
+ struct spdk_vhost_blk_dev *bvdev;
+ struct spdk_bdev *bdev;
+ uint32_t blk_size;
+ uint64_t blkcnt;
+
+ memset(&blkcfg, 0, sizeof(blkcfg));
+ bvdev = to_blk_dev(vdev);
+ assert(bvdev != NULL);
+ bdev = bvdev->bdev;
+ if (bdev == NULL) {
+		/* We can't just return -1 here as this GET_CONFIG message might
+		 * be caused by a QEMU VM reboot. Returning -1 will indicate an
+		 * error to QEMU, which might then decide to terminate itself.
+		 * We don't want that. A simple reboot shouldn't break the system.
+		 *
+		 * Presenting a block device with block size 0 and block count 0
+		 * doesn't cause any problems on the QEMU side and the virtio-pci
+		 * device is even still available inside the VM, but there will
+		 * be no block device created for it - the kernel drivers will
+		 * silently reject it.
+		 */
+ blk_size = 0;
+ blkcnt = 0;
+ } else {
+ blk_size = spdk_bdev_get_block_size(bdev);
+ blkcnt = spdk_bdev_get_num_blocks(bdev);
+ if (spdk_bdev_get_buf_align(bdev) > 1) {
+ blkcfg.size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE;
+ blkcfg.seg_max = spdk_min(SPDK_VHOST_IOVS_MAX - 2 - 1, BDEV_IO_NUM_CHILD_IOV - 2 - 1);
+ } else {
+ blkcfg.size_max = 131072;
+ /* -2 for REQ and RESP and -1 for region boundary splitting */
+ blkcfg.seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1;
+ }
+ }
+
+ blkcfg.blk_size = blk_size;
+ /* minimum I/O size in blocks */
+ blkcfg.min_io_size = 1;
+ /* expressed in 512 Bytes sectors */
+ blkcfg.capacity = (blkcnt * blk_size) / 512;
+ /* QEMU can overwrite this value when started */
+ blkcfg.num_queues = SPDK_VHOST_MAX_VQUEUES;
+
+ if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+ /* 16MiB, expressed in 512 Bytes */
+ blkcfg.max_discard_sectors = 32768;
+ blkcfg.max_discard_seg = 1;
+ blkcfg.discard_sector_alignment = blk_size / 512;
+ }
+ if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+ blkcfg.max_write_zeroes_sectors = 32768;
+ blkcfg.max_write_zeroes_seg = 1;
+ }
+
+ memcpy(config, &blkcfg, spdk_min(len, sizeof(blkcfg)));
+
+ return 0;
+}
+
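+/*
+ * session_ctx_size covers only the blk-specific tail of struct spdk_vhost_blk_session;
+ * the generic vhost layer is expected to allocate sizeof(struct spdk_vhost_session)
+ * plus this value for every new connection, hence the subtraction below.
+ */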
+static const struct spdk_vhost_dev_backend vhost_blk_device_backend = {
+ .session_ctx_size = sizeof(struct spdk_vhost_blk_session) - sizeof(struct spdk_vhost_session),
+ .start_session = vhost_blk_start,
+ .stop_session = vhost_blk_stop,
+ .vhost_get_config = vhost_blk_get_config,
+ .dump_info_json = vhost_blk_dump_info_json,
+ .write_config_json = vhost_blk_write_config_json,
+ .remove_device = vhost_blk_destroy,
+};
+
+int
+vhost_blk_controller_construct(void)
+{
+ struct spdk_conf_section *sp;
+ unsigned ctrlr_num;
+ char *bdev_name;
+ char *cpumask;
+ char *name;
+ bool readonly;
+ bool packed_ring;
+
+ for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
+ if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) {
+ continue;
+ }
+
+ if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) {
+ SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+ spdk_conf_section_get_name(sp));
+ return -1;
+ }
+
+ name = spdk_conf_section_get_val(sp, "Name");
+ if (name == NULL) {
+ SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num);
+ return -1;
+ }
+
+ cpumask = spdk_conf_section_get_val(sp, "Cpumask");
+ readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false);
+ packed_ring = spdk_conf_section_get_boolval(sp, "PackedRing", false);
+
+ bdev_name = spdk_conf_section_get_val(sp, "Dev");
+ if (bdev_name == NULL) {
+ continue;
+ }
+
+ if (spdk_vhost_blk_construct(name, cpumask, bdev_name,
+ readonly, packed_ring) < 0) {
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+int
+spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name,
+ bool readonly, bool packed_ring)
+{
+ struct spdk_vhost_blk_dev *bvdev = NULL;
+ struct spdk_vhost_dev *vdev;
+ struct spdk_bdev *bdev;
+ int ret = 0;
+
+ spdk_vhost_lock();
+ bdev = spdk_bdev_get_by_name(dev_name);
+ if (bdev == NULL) {
+ SPDK_ERRLOG("%s: bdev '%s' not found\n",
+ name, dev_name);
+ ret = -ENODEV;
+ goto out;
+ }
+
+ bvdev = calloc(1, sizeof(*bvdev));
+ if (bvdev == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ vdev = &bvdev->vdev;
+ vdev->virtio_features = SPDK_VHOST_BLK_FEATURES_BASE;
+ vdev->disabled_features = SPDK_VHOST_BLK_DISABLED_FEATURES;
+ vdev->protocol_features = SPDK_VHOST_BLK_PROTOCOL_FEATURES;
+
+ vdev->virtio_features |= (uint64_t)packed_ring << VIRTIO_F_RING_PACKED;
+
+ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+ vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_DISCARD);
+ }
+ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
+ vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES);
+ }
+ if (readonly) {
+ vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_RO);
+ }
+ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
+ vdev->virtio_features |= (1ULL << VIRTIO_BLK_F_FLUSH);
+ }
+
+ ret = spdk_bdev_open_ext(dev_name, true, bdev_event_cb, bvdev, &bvdev->bdev_desc);
+ if (ret != 0) {
+ SPDK_ERRLOG("%s: could not open bdev '%s', error=%d\n",
+ name, dev_name, ret);
+ goto out;
+ }
+
+	/*
+	 * When QEMU is started with vhost-user-blk multiqueue, the vhost device will
+	 * be started and stopped many times (related to the number of queues), because
+	 * the vhost-user backend doesn't know the exact number of queues used for this
+	 * device. The target has to stop and start the device each time it gets a
+	 * valid I/O queue.
+	 * While the vhost device is being stopped and started, the backing bdev I/O
+	 * device would be deleted and created repeatedly.
+	 * Hold an extra bdev reference in struct spdk_vhost_blk_dev so that the I/O
+	 * device is not deleted.
+	 */
+ bvdev->dummy_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc);
+
+ bvdev->bdev = bdev;
+ bvdev->readonly = readonly;
+ ret = vhost_dev_register(vdev, name, cpumask, &vhost_blk_device_backend);
+ if (ret != 0) {
+ spdk_put_io_channel(bvdev->dummy_io_channel);
+ spdk_bdev_close(bvdev->bdev_desc);
+ goto out;
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: using bdev '%s'\n", name, dev_name);
+out:
+ if (ret != 0 && bvdev) {
+ free(bvdev);
+ }
+ spdk_vhost_unlock();
+ return ret;
+}
+
+static int
+vhost_blk_destroy(struct spdk_vhost_dev *vdev)
+{
+ struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev);
+ int rc;
+
+ assert(bvdev != NULL);
+
+ rc = vhost_dev_unregister(&bvdev->vdev);
+ if (rc != 0) {
+ return rc;
+ }
+
+	/* If the bdev has already been removed, there is no need to call spdk_put_io_channel(). */
+ if (bvdev->bdev) {
+ spdk_put_io_channel(bvdev->dummy_io_channel);
+ }
+
+ if (bvdev->bdev_desc) {
+ spdk_bdev_close(bvdev->bdev_desc);
+ bvdev->bdev_desc = NULL;
+ }
+ bvdev->bdev = NULL;
+
+ free(bvdev);
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK)
+SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA)
diff --git a/src/spdk/lib/vhost/vhost_internal.h b/src/spdk/lib/vhost/vhost_internal.h
new file mode 100644
index 000000000..3aa89768a
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_internal.h
@@ -0,0 +1,496 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VHOST_INTERNAL_H
+#define SPDK_VHOST_INTERNAL_H
+#include <linux/virtio_config.h>
+
+#include "spdk/stdinc.h"
+
+#include <rte_vhost.h>
+
+#include "spdk_internal/vhost_user.h"
+#include "spdk_internal/log.h"
+#include "spdk/util.h"
+#include "spdk/rpc.h"
+#include "spdk/config.h"
+
+#define SPDK_VHOST_MAX_VQUEUES 256
+#define SPDK_VHOST_MAX_VQ_SIZE 1024
+
+#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8
+
+#define SPDK_VHOST_IOVS_MAX 129
+
+#define SPDK_VHOST_VQ_MAX_SUBMISSIONS 32
+
+/*
+ * Rate at which stats are checked for interrupt coalescing.
+ */
+#define SPDK_VHOST_STATS_CHECK_INTERVAL_MS 10
+/*
+ * Default threshold at which interrupts start to be coalesced.
+ */
+#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000
+
+/*
+ * Currently coalescing is not used by default.
+ * Setting this to a value > 0 here or via RPC will enable coalescing.
+ */
+#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0
+
+#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VIRTIO_F_VERSION_1) | \
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \
+ (1ULL << VIRTIO_RING_F_EVENT_IDX) | \
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
+ (1ULL << VIRTIO_F_RING_PACKED))
+
+#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY))
+
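+/*
+ * Packed-ring descriptor flag bits (virtio 1.1). They are compared against the
+ * avail_phase/used_phase wrap counters kept in struct spdk_vhost_virtqueue to
+ * decide whether a descriptor is available or used.
+ */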
+#define VRING_DESC_F_AVAIL (1ULL << VRING_PACKED_DESC_F_AVAIL)
+#define VRING_DESC_F_USED (1ULL << VRING_PACKED_DESC_F_USED)
+#define VRING_DESC_F_AVAIL_USED (VRING_DESC_F_AVAIL | VRING_DESC_F_USED)
+
+typedef struct rte_vhost_resubmit_desc spdk_vhost_resubmit_desc;
+typedef struct rte_vhost_resubmit_info spdk_vhost_resubmit_info;
+
+struct spdk_vhost_virtqueue {
+ struct rte_vhost_vring vring;
+ struct rte_vhost_ring_inflight vring_inflight;
+ uint16_t last_avail_idx;
+ uint16_t last_used_idx;
+
+ struct {
+ /* To mark a descriptor as available in packed ring
+ * Equal to avail_wrap_counter in spec.
+ */
+ uint8_t avail_phase : 1;
+ /* To mark a descriptor as used in packed ring
+ * Equal to used_wrap_counter in spec.
+ */
+ uint8_t used_phase : 1;
+ uint8_t padding : 5;
+ bool packed_ring : 1;
+ } packed;
+
+ void *tasks;
+
+ /* Request count from last stats check */
+ uint32_t req_cnt;
+
+ /* Request count from last event */
+ uint16_t used_req_cnt;
+
+ /* How long interrupt is delayed */
+ uint32_t irq_delay_time;
+
+ /* Next time when we need to send event */
+ uint64_t next_event_time;
+
+ /* Associated vhost_virtqueue in the virtio device's virtqueue list */
+ uint32_t vring_idx;
+} __attribute((aligned(SPDK_CACHE_LINE_SIZE)));
+
+struct spdk_vhost_session {
+ struct spdk_vhost_dev *vdev;
+
+ /* rte_vhost connection ID. */
+ int vid;
+
+ /* Unique session ID. */
+ uint64_t id;
+ /* Unique session name. */
+ char *name;
+
+ bool initialized;
+ bool started;
+ bool needs_restart;
+ bool forced_polling;
+
+ struct rte_vhost_memory *mem;
+
+ int task_cnt;
+
+ uint16_t max_queues;
+
+ uint64_t negotiated_features;
+
+ /* Local copy of device coalescing settings. */
+ uint32_t coalescing_delay_time_base;
+ uint32_t coalescing_io_rate_threshold;
+
+ /* Next time when stats for event coalescing will be checked. */
+ uint64_t next_stats_check_time;
+
+ /* Interval used for event coalescing checking. */
+ uint64_t stats_check_interval;
+
+ struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES];
+
+ TAILQ_ENTRY(spdk_vhost_session) tailq;
+};
+
+struct spdk_vhost_dev {
+ char *name;
+ char *path;
+
+ struct spdk_thread *thread;
+ bool registered;
+
+ uint64_t virtio_features;
+ uint64_t disabled_features;
+ uint64_t protocol_features;
+
+ const struct spdk_vhost_dev_backend *backend;
+
+	/* Saved original values used to set up coalescing, kept to avoid integer
+	 * rounding issues during save/load of the config.
+	 */
+ uint32_t coalescing_delay_us;
+ uint32_t coalescing_iops_threshold;
+
+ /* Current connections to the device */
+ TAILQ_HEAD(, spdk_vhost_session) vsessions;
+
+ /* Increment-only session counter */
+ uint64_t vsessions_num;
+
+ /* Number of started and actively polled sessions */
+ uint32_t active_session_num;
+
+ /* Number of pending asynchronous operations */
+ uint32_t pending_async_op_num;
+
+ TAILQ_ENTRY(spdk_vhost_dev) tailq;
+};
+
+/**
+ * \param vdev vhost device.
+ * \param vsession vhost session.
+ * \param arg user-provided parameter.
+ *
+ * \return negative values will break the foreach call, meaning
+ * the function won't be called again. Return codes zero and
+ * positive don't have any effect.
+ */
+typedef int (*spdk_vhost_session_fn)(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession,
+ void *arg);
+
+/**
+ * \param vdev vhost device.
+ * \param arg user-provided parameter.
+ */
+typedef void (*spdk_vhost_dev_fn)(struct spdk_vhost_dev *vdev, void *arg);
+
+struct spdk_vhost_dev_backend {
+ /**
+ * Size of additional per-session context data
+ * allocated whenever a new client connects.
+ */
+ size_t session_ctx_size;
+
+ int (*start_session)(struct spdk_vhost_session *vsession);
+ int (*stop_session)(struct spdk_vhost_session *vsession);
+
+ int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len);
+ int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config,
+ uint32_t offset, uint32_t size, uint32_t flags);
+
+ void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+ void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+ int (*remove_device)(struct spdk_vhost_dev *vdev);
+};
+
+void *vhost_gpa_to_vva(struct spdk_vhost_session *vsession, uint64_t addr, uint64_t len);
+
+uint16_t vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs,
+ uint16_t reqs_len);
+
+/**
+ * Get a virtio split descriptor at given index in given virtqueue.
+ * The descriptor will provide access to the entire descriptor
+ * chain. The subsequent descriptors are accessible via
+ * \c spdk_vhost_vring_desc_get_next.
+ * \param vsession vhost session
+ * \param vq virtqueue
+ * \param req_idx descriptor index
+ * \param desc pointer to be set to the descriptor
+ * \param desc_table descriptor table to be used with
+ * \c spdk_vhost_vring_desc_get_next. This might be either
+ * default virtqueue descriptor table or per-chain indirect
+ * table.
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid.
+ * If -1 is returned, the content of params is undefined.
+ */
+int vhost_vq_get_desc(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq,
+ uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table,
+ uint32_t *desc_table_size);
+
+/**
+ * Get a virtio packed descriptor at given index in given virtqueue.
+ * The descriptor will provide access to the entire descriptor
+ * chain. The subsequent descriptors are accessible via
+ * \c vhost_vring_packed_desc_get_next.
+ * \param vsession vhost session
+ * \param vq virtqueue
+ * \param req_idx descriptor index
+ * \param desc pointer to be set to the descriptor
+ * \param desc_table descriptor table to be used with
+ * \c spdk_vhost_vring_desc_get_next. This might be either
+ * \c NULL or per-chain indirect table.
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid.
+ * If -1 is returned, the content of params is undefined.
+ */
+int vhost_vq_get_desc_packed(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t req_idx, struct vring_packed_desc **desc,
+ struct vring_packed_desc **desc_table, uint32_t *desc_table_size);
+
+/**
+ * Send IRQ/call client (if pending) for \c vq.
+ * \param vsession vhost session
+ * \param vq virtqueue
+ * \return
+ * 0 - if no interrupt was signalled
+ * 1 - if interrupt was signalled
+ */
+int vhost_vq_used_signal(struct spdk_vhost_session *vsession, struct spdk_vhost_virtqueue *vq);
+
+
+/**
+ * Send IRQs for all queues that need to be signaled.
+ * \param vsession vhost session
+ */
+void vhost_session_used_signal(struct spdk_vhost_session *vsession);
+
+void vhost_vq_used_ring_enqueue(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *vq,
+ uint16_t id, uint32_t len);
+
+/**
+ * Enqueue an entry to the used ring when the device completes a request.
+ * \param vsession vhost session
+ * \param virtqueue virtqueue
+ * \param num_descs number of descriptors (buffers) in the completed descriptor chain
+ * \param buffer_id descriptor buffer ID
+ * \param length device write length, i.e. the length of the buffer that has been
+ * initialized (written to) by the device
+ */
+void vhost_vq_packed_ring_enqueue(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *virtqueue,
+ uint16_t num_descs, uint16_t buffer_id,
+ uint32_t length);
+
+/**
+ * Get subsequent descriptor from given table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if given index is invalid
+ * The *desc* param will be set regardless of the
+ * return value.
+ */
+int vhost_vring_desc_get_next(struct vring_desc **desc,
+ struct vring_desc *desc_table, uint32_t desc_table_size);
+static inline bool
+vhost_vring_desc_is_wr(struct vring_desc *cur_desc)
+{
+ return !!(cur_desc->flags & VRING_DESC_F_WRITE);
+}
+
+int vhost_vring_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+ uint16_t *iov_index, const struct vring_desc *desc);
+
+bool vhost_vq_packed_ring_is_avail(struct spdk_vhost_virtqueue *virtqueue);
+
+/**
+ * Get the subsequent descriptor from the vq or the descriptor table.
+ * \param desc current descriptor, will be set to the
+ * next descriptor (NULL in case this is the last
+ * descriptor in the chain or the next desc is invalid)
+ * \param req_idx index of the current descriptor, will be set to the next index.
+ * Depending on whether desc_table is NULL, this is either an index into the vring
+ * or an index into desc_table.
+ * \param vq virtqueue
+ * \param desc_table descriptor table
+ * \param desc_table_size size of the *desc_table*
+ * \return 0 on success, -1 if the given index is invalid.
+ * The *desc* param will be set regardless of the
+ * return value.
+ */
+int vhost_vring_packed_desc_get_next(struct vring_packed_desc **desc, uint16_t *req_idx,
+ struct spdk_vhost_virtqueue *vq,
+ struct vring_packed_desc *desc_table,
+ uint32_t desc_table_size);
+
+bool vhost_vring_packed_desc_is_wr(struct vring_packed_desc *cur_desc);
+
+int vhost_vring_packed_desc_to_iov(struct spdk_vhost_session *vsession, struct iovec *iov,
+ uint16_t *iov_index, const struct vring_packed_desc *desc);
+
+uint16_t vhost_vring_packed_desc_get_buffer_id(struct spdk_vhost_virtqueue *vq, uint16_t req_idx,
+ uint16_t *num_descs);
+
+static inline bool __attribute__((always_inline))
+vhost_dev_has_feature(struct spdk_vhost_session *vsession, unsigned feature_id)
+{
+ return vsession->negotiated_features & (1ULL << feature_id);
+}
+
+int vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str,
+ const struct spdk_vhost_dev_backend *backend);
+int vhost_dev_unregister(struct spdk_vhost_dev *vdev);
+
+int vhost_scsi_controller_construct(void);
+int vhost_blk_controller_construct(void);
+void vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w);
+
+/*
+ * Vhost callbacks for vhost_device_ops interface
+ */
+
+int vhost_new_connection_cb(int vid, const char *ifname);
+int vhost_start_device_cb(int vid);
+int vhost_stop_device_cb(int vid);
+int vhost_destroy_connection_cb(int vid);
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+int vhost_get_config_cb(int vid, uint8_t *config, uint32_t len);
+int vhost_set_config_cb(int vid, uint8_t *config, uint32_t offset,
+ uint32_t size, uint32_t flags);
+#endif
+
+/*
+ * Memory registration functions used in start/stop device callbacks
+ */
+void vhost_session_mem_register(struct rte_vhost_memory *mem);
+void vhost_session_mem_unregister(struct rte_vhost_memory *mem);
+
+/*
+ * Call a function for each session of the provided vhost device.
+ * The function will be called one-by-one on each session's thread.
+ *
+ * \param vdev vhost device
+ * \param fn function to call on each session's thread
+ * \param cpl_fn function to be called at the end of the iteration on
+ * the vhost management thread.
+ * Optional, can be NULL.
+ * \param arg additional argument to both callbacks
+ */
+void vhost_dev_foreach_session(struct spdk_vhost_dev *dev,
+ spdk_vhost_session_fn fn,
+ spdk_vhost_dev_fn cpl_fn,
+ void *arg);
+
+/**
+ * Call a function on the provided lcore and block until either
+ * spdk_vhost_session_start_done() or spdk_vhost_session_stop_done()
+ * is called.
+ *
+ * This must be called under the global vhost mutex, which this function
+ * will unlock for the time it's waiting. It's meant to be called only
+ * from start/stop session callbacks.
+ *
+ * \param vsession vhost session
+ * \param cb_fn the function to call. The void *arg parameter in cb_fn
+ * is always NULL.
+ * \param timeout_sec timeout in seconds. This function will still
+ * block after the timeout expires, but will print the provided errmsg.
+ * \param errmsg error message to print once the timeout expires
+ * \return the code passed to spdk_vhost_session_event_done().
+ */
+int vhost_session_send_event(struct spdk_vhost_session *vsession,
+ spdk_vhost_session_fn cb_fn, unsigned timeout_sec,
+ const char *errmsg);
+
+/**
+ * Finish a blocking spdk_vhost_session_send_event() call and finally
+ * start the session. This must be called on the target lcore, which
+ * will now receive all session-related messages (e.g. from
+ * spdk_vhost_dev_foreach_session()).
+ *
+ * Must be called under the global vhost lock.
+ *
+ * \param vsession vhost session
+ * \param response return code
+ */
+void vhost_session_start_done(struct spdk_vhost_session *vsession, int response);
+
+/**
+ * Finish a blocking spdk_vhost_session_send_event() call and finally
+ * stop the session. This must be called on the session's lcore which
+ * used to receive all session-related messages (e.g. from
+ * spdk_vhost_dev_foreach_session()). After this call, the session-
+ * related messages will be once again processed by any arbitrary thread.
+ *
+ * Must be called under the global vhost lock.
+ *
+ * \param vsession vhost session
+ * \param response return code
+ */
+void vhost_session_stop_done(struct spdk_vhost_session *vsession, int response);
+
+struct spdk_vhost_session *vhost_session_find_by_vid(int vid);
+void vhost_session_install_rte_compat_hooks(struct spdk_vhost_session *vsession);
+int vhost_register_unix_socket(const char *path, const char *ctrl_name,
+ uint64_t virtio_features, uint64_t disabled_features, uint64_t protocol_features);
+int vhost_driver_unregister(const char *path);
+int vhost_get_mem_table(int vid, struct rte_vhost_memory **mem);
+int vhost_get_negotiated_features(int vid, uint64_t *negotiated_features);
+
+int remove_vhost_controller(struct spdk_vhost_dev *vdev);
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+int vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf);
+int vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd);
+int vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size);
+int vhost_nvme_get_cap(int vid, uint64_t *cap);
+int vhost_nvme_controller_construct(void);
+int vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues);
+int vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev);
+int vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev,
+ const char *bdev_name);
+#endif
+
+#endif /* SPDK_VHOST_INTERNAL_H */
diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c
new file mode 100644
index 000000000..10f53baf9
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_nvme.c
@@ -0,0 +1,1500 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/nvme.h"
+#include "spdk/env.h"
+#include "spdk/conf.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/thread.h"
+#include "spdk/barrier.h"
+#include "spdk/vhost.h"
+#include "spdk/bdev.h"
+#include "spdk/version.h"
+#include "spdk/nvme_spec.h"
+#include "spdk/likely.h"
+
+#include "vhost_internal.h"
+
+#define MAX_IO_QUEUES 31
+#define MAX_IOVS 64
+#define MAX_NAMESPACE 8
+#define MAX_QUEUE_ENTRIES_SUPPORTED 256
+#define MAX_BATCH_IO 8
+
+struct spdk_vhost_nvme_sq {
+ uint16_t sqid;
+ uint16_t size;
+ uint16_t cqid;
+ bool valid;
+ struct spdk_nvme_cmd *sq_cmd;
+ uint16_t sq_head;
+ uint16_t sq_tail;
+};
+
+struct spdk_vhost_nvme_cq {
+ uint8_t phase;
+ uint16_t size;
+ uint16_t cqid;
+ bool valid;
+ volatile struct spdk_nvme_cpl *cq_cqe;
+ uint16_t cq_head;
+ uint16_t guest_signaled_cq_head;
+ uint32_t need_signaled_cnt;
+ STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks;
+ bool irq_enabled;
+ int virq;
+};
+
+struct spdk_vhost_nvme_ns {
+ struct spdk_bdev *bdev;
+ uint32_t block_size;
+ uint64_t capacity;
+ uint32_t nsid;
+ uint32_t active_ns;
+ struct spdk_bdev_desc *bdev_desc;
+ struct spdk_io_channel *bdev_io_channel;
+ struct spdk_nvme_ns_data nsdata;
+};
+
+struct spdk_vhost_nvme_task {
+ struct spdk_nvme_cmd cmd;
+ struct spdk_vhost_nvme_dev *nvme;
+ uint16_t sqid;
+ uint16_t cqid;
+
+ /** array of iovecs to transfer. */
+ struct iovec iovs[MAX_IOVS];
+
+ /** Number of iovecs in iovs array. */
+ int iovcnt;
+
+ /** Current iovec position. */
+ int iovpos;
+
+ /** Offset in current iovec. */
+ uint32_t iov_offset;
+
+ /* for bdev_io_wait */
+ struct spdk_bdev_io_wait_entry bdev_io_wait;
+ struct spdk_vhost_nvme_sq *sq;
+ struct spdk_vhost_nvme_ns *ns;
+
+ /* parent pointer. */
+ struct spdk_vhost_nvme_task *parent;
+ uint8_t dnr;
+ uint8_t sct;
+ uint8_t sc;
+ uint32_t num_children;
+ STAILQ_ENTRY(spdk_vhost_nvme_task) stailq;
+};
+
+struct spdk_vhost_nvme_dev {
+ struct spdk_vhost_dev vdev;
+
+ uint32_t num_io_queues;
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_cc_register cc;
+ union spdk_nvme_csts_register csts;
+ struct spdk_nvme_ctrlr_data cdata;
+
+ uint32_t num_sqs;
+ uint32_t num_cqs;
+
+ uint32_t num_ns;
+ struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE];
+
+ volatile uint32_t *bar;
+ volatile uint32_t *bar_db;
+ uint64_t bar_size;
+ bool dataplane_started;
+
+ volatile uint32_t *dbbuf_dbs;
+ volatile uint32_t *dbbuf_eis;
+ struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1];
+ struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1];
+
+ /* The one and only session associated with this device */
+ struct spdk_vhost_session *vsession;
+
+ TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq;
+ STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks;
+ struct spdk_poller *requestq_poller;
+ struct spdk_poller *stop_poller;
+};
+
+static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend;
+
+/*
+ * Report the SPDK version as the firmware revision.
+ * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts.
+ */
+#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING
+
+static int
+nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq,
+ struct spdk_vhost_nvme_task *task);
+
+static struct spdk_vhost_nvme_dev *
+to_nvme_dev(struct spdk_vhost_dev *vdev)
+{
+ if (vdev->backend != &spdk_vhost_nvme_device_backend) {
+ SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name);
+ return NULL;
+ }
+
+ return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev);
+}
+
+static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs);
+
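+/*
+ * Doorbell index helpers (explanatory note added for clarity): the SQ and CQ
+ * doorbells of a queue pair are interleaved. With a db_stride of 1, as used
+ * throughout this file, qid 1 maps to SQ doorbell index 2 and CQ doorbell
+ * index 3.
+ */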
+static inline unsigned int
+sq_offset(unsigned int qid, uint32_t db_stride)
+{
+ return qid * 2 * db_stride;
+}
+
+static inline unsigned int
+cq_offset(unsigned int qid, uint32_t db_stride)
+{
+ return (qid * 2 + 1) * db_stride;
+}
+
+static void
+nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq)
+{
+ cq->cq_head++;
+ if (cq->cq_head >= cq->size) {
+ cq->cq_head = 0;
+ cq->phase = !cq->phase;
+ }
+}
+
+static bool
+nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq)
+{
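+	/*
+	 * Explanatory note: the CQ is treated as full when advancing cq_head
+	 * would collide with the head the guest last signaled, e.g. with
+	 * size=8, cq_head=4 and guest_signaled_cq_head=5: (4 + 1) % 8 == 5.
+	 */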
+ return ((cq->cq_head + 1) % cq->size == cq->guest_signaled_cq_head);
+}
+
+static void
+nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq)
+{
+ sq->sq_head = (sq->sq_head + 1) % sq->size;
+}
+
+static struct spdk_vhost_nvme_sq *
+vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
+{
+ if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) {
+ return NULL;
+ }
+
+ return &dev->sq_queue[qid];
+}
+
+static struct spdk_vhost_nvme_cq *
+vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid)
+{
+ if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) {
+ return NULL;
+ }
+
+ return &dev->cq_queue[qid];
+}
+
+static inline uint32_t
+vhost_nvme_get_queue_head(struct spdk_vhost_nvme_dev *nvme, uint32_t offset)
+{
+ if (nvme->dataplane_started) {
+ return nvme->dbbuf_dbs[offset];
+
+ } else if (nvme->bar) {
+ return nvme->bar_db[offset];
+ }
+
+ assert(0);
+
+ return 0;
+}
+
+static void *
+vhost_nvme_gpa_to_vva(void *priv, uint64_t addr, uint64_t len)
+{
+ struct spdk_vhost_session *vsession = priv;
+
+ return vhost_gpa_to_vva(vsession, addr, len);
+}
+
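+/*
+ * Explanatory note: map the PRP list of a guest command into host iovecs
+ * using a 4 KiB page size. For example, an 8 KiB transfer whose first PRP
+ * points into the middle of a guest page spans three guest pages and may
+ * therefore be described by up to three iovecs.
+ */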
+static int
+vhost_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd,
+ struct spdk_vhost_nvme_task *task, uint32_t len)
+{
+ int err;
+
+ err = spdk_nvme_map_prps(nvme->vsession, cmd, task->iovs, len, 4096,
+ vhost_nvme_gpa_to_vva);
+ if (spdk_unlikely(err < 0)) {
+ return err;
+ }
+ task->iovcnt = err;
+ return 0;
+}
+
+static void
+nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme)
+{
+ struct spdk_vhost_nvme_cq *cq;
+ uint32_t qid, cq_head;
+
+ assert(nvme != NULL);
+
+ for (qid = 1; qid <= MAX_IO_QUEUES; qid++) {
+ cq = vhost_nvme_get_cq_from_qid(nvme, qid);
+ if (!cq || !cq->valid) {
+ continue;
+ }
+
+ cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(qid, 1));
+ if (cq->irq_enabled && cq->need_signaled_cnt && (cq->cq_head != cq_head)) {
+ eventfd_write(cq->virq, (eventfd_t)1);
+ cq->need_signaled_cnt = 0;
+ }
+ }
+}
+
+static void
+vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task)
+{
+ struct spdk_vhost_nvme_dev *nvme = task->nvme;
+ struct spdk_nvme_cpl cqe = {0};
+ struct spdk_vhost_nvme_cq *cq;
+ struct spdk_vhost_nvme_sq *sq;
+ struct spdk_nvme_cmd *cmd = &task->cmd;
+ uint16_t cqid = task->cqid;
+ uint16_t sqid = task->sqid;
+
+ cq = vhost_nvme_get_cq_from_qid(nvme, cqid);
+ sq = vhost_nvme_get_sq_from_qid(nvme, sqid);
+ if (spdk_unlikely(!cq || !sq)) {
+ return;
+ }
+
+ cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(cqid, 1));
+ if (spdk_unlikely(nvme_cq_is_full(cq))) {
+ STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq);
+ return;
+ }
+
+ cqe.sqid = sqid;
+ cqe.sqhd = sq->sq_head;
+ cqe.cid = cmd->cid;
+ cqe.status.dnr = task->dnr;
+ cqe.status.sct = task->sct;
+ cqe.status.sc = task->sc;
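+	/*
+	 * Publish the CQE in two steps: write the whole entry with the
+	 * inverted phase bit, then flip the phase bit after the write
+	 * barrier so the guest never sees a partially written completion.
+	 */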
+ cqe.status.p = !cq->phase;
+ cq->cq_cqe[cq->cq_head] = cqe;
+ spdk_smp_wmb();
+ cq->cq_cqe[cq->cq_head].status.p = cq->phase;
+
+ nvme_inc_cq_head(cq);
+ cq->need_signaled_cnt++;
+
+	/* MMIO Control */
+ if (nvme->dataplane_started) {
+ nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1);
+ }
+
+ STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+}
+
+static void
+blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_vhost_nvme_task *task = cb_arg;
+ struct spdk_nvme_cmd *cmd = &task->cmd;
+ int sc, sct;
+ uint32_t cdw0;
+
+ assert(bdev_io != NULL);
+
+ spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
+ spdk_bdev_free_io(bdev_io);
+
+ task->dnr = !success;
+ task->sct = sct;
+ task->sc = sc;
+
+ if (spdk_unlikely(!success)) {
+ SPDK_ERRLOG("I/O error, sector %u\n", cmd->cdw10);
+ }
+
+ vhost_nvme_task_complete(task);
+}
+
+static void
+blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
+{
+ struct spdk_vhost_nvme_task *child = cb_arg;
+ struct spdk_vhost_nvme_task *task = child->parent;
+ struct spdk_vhost_nvme_dev *nvme = task->nvme;
+ int sct, sc;
+ uint32_t cdw0;
+
+ assert(bdev_io != NULL);
+
+ task->num_children--;
+ if (!success) {
+ task->dnr = 1;
+ spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
+ task->sct = sct;
+ task->sc = sc;
+ }
+
+ spdk_bdev_free_io(bdev_io);
+
+ if (!task->num_children) {
+ vhost_nvme_task_complete(task);
+ }
+
+ STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq);
+}
+
+static struct spdk_vhost_nvme_ns *
+vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid)
+{
+ if (spdk_unlikely(!nsid || nsid > dev->num_ns)) {
+ return NULL;
+ }
+
+ return &dev->ns[nsid - 1];
+}
+
+static void
+vhost_nvme_resubmit_task(void *arg)
+{
+ struct spdk_vhost_nvme_task *task = (struct spdk_vhost_nvme_task *)arg;
+ int rc;
+
+ rc = nvme_process_sq(task->nvme, task->sq, task);
+ if (rc) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc);
+ }
+}
+
+static int
+vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task)
+{
+ int rc;
+
+ task->bdev_io_wait.bdev = task->ns->bdev;
+ task->bdev_io_wait.cb_fn = vhost_nvme_resubmit_task;
+ task->bdev_io_wait.cb_arg = task;
+
+ rc = spdk_bdev_queue_io_wait(task->ns->bdev, task->ns->bdev_io_channel, &task->bdev_io_wait);
+ if (rc != 0) {
+ SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc);
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ vhost_nvme_task_complete(task);
+ }
+
+ return rc;
+}
+
+static int
+nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq,
+ struct spdk_vhost_nvme_task *task)
+{
+ struct spdk_vhost_nvme_task *child;
+ struct spdk_nvme_cmd *cmd = &task->cmd;
+ struct spdk_vhost_nvme_ns *ns;
+ int ret = -1;
+ uint32_t len, nlba, block_size;
+ uint64_t slba;
+ struct spdk_nvme_dsm_range *range;
+ uint16_t i, num_ranges = 0;
+
+ task->nvme = nvme;
+ task->dnr = 0;
+ task->sct = 0;
+ task->sc = 0;
+
+ ns = vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid);
+ if (spdk_unlikely(!ns)) {
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ vhost_nvme_task_complete(task);
+ return -1;
+ }
+
+ block_size = ns->block_size;
+ task->num_children = 0;
+ task->cqid = sq->cqid;
+ task->sqid = sq->sqid;
+
+ task->ns = ns;
+
+ if (spdk_unlikely(!ns->active_ns)) {
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT;
+ vhost_nvme_task_complete(task);
+ return -1;
+ }
+
+ /* valid only for Read/Write commands */
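+	/*
+	 * NLB in CDW12 is zero-based and SLBA is split across CDW10 (low 32
+	 * bits) and CDW11 (high 32 bits). For example, cdw10=0x1000,
+	 * cdw11=0x1, cdw12=0x7 describe 8 blocks starting at LBA 0x100001000.
+	 */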
+ nlba = (cmd->cdw12 & 0xffff) + 1;
+ slba = cmd->cdw11;
+ slba = (slba << 32) | cmd->cdw10;
+
+ if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE ||
+ cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
+ if (cmd->psdt != SPDK_NVME_PSDT_PRP) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n",
+ cmd->psdt >> 1, cmd->psdt & 1u);
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INVALID_FIELD;
+ vhost_nvme_task_complete(task);
+ return -1;
+ }
+
+ if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) {
+ num_ranges = (cmd->cdw10 & 0xff) + 1;
+ len = num_ranges * sizeof(struct spdk_nvme_dsm_range);
+ } else {
+ len = nlba * block_size;
+ }
+
+ ret = vhost_nvme_map_prps(nvme, cmd, task, len);
+ if (spdk_unlikely(ret != 0)) {
+ SPDK_ERRLOG("nvme command map prps failed\n");
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INVALID_FIELD;
+ vhost_nvme_task_complete(task);
+ return -1;
+ }
+ }
+
+ switch (cmd->opc) {
+ case SPDK_NVME_OPC_READ:
+ ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel,
+ task->iovs, task->iovcnt, slba * block_size,
+ nlba * block_size, blk_request_complete_cb, task);
+ break;
+ case SPDK_NVME_OPC_WRITE:
+ ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel,
+ task->iovs, task->iovcnt, slba * block_size,
+ nlba * block_size, blk_request_complete_cb, task);
+ break;
+ case SPDK_NVME_OPC_FLUSH:
+ ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel,
+ 0, ns->capacity,
+ blk_request_complete_cb, task);
+ break;
+ case SPDK_NVME_OPC_DATASET_MANAGEMENT:
+ range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base;
+ for (i = 0; i < num_ranges; i++) {
+ if (!STAILQ_EMPTY(&nvme->free_tasks)) {
+ child = STAILQ_FIRST(&nvme->free_tasks);
+ STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+ } else {
+ SPDK_ERRLOG("No free task now\n");
+ ret = -1;
+ break;
+ }
+ task->num_children++;
+ child->parent = task;
+ ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel,
+ range[i].starting_lba * block_size,
+ range[i].length * block_size,
+ blk_unmap_complete_cb, child);
+ if (ret) {
+ STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq);
+ break;
+ }
+ }
+ break;
+ default:
+ ret = -1;
+ break;
+ }
+
+ if (spdk_unlikely(ret)) {
+ if (ret == -ENOMEM) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No memory, start to queue io.\n");
+ task->sq = sq;
+ ret = vhost_nvme_queue_task(task);
+ } else {
+ /* post error status to cqe */
+ SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret);
+ task->dnr = 1;
+ task->sct = SPDK_NVME_SCT_GENERIC;
+ task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ vhost_nvme_task_complete(task);
+ }
+ }
+
+ return ret;
+}
+
+static int
+nvme_worker(void *arg)
+{
+ struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg;
+ struct spdk_vhost_nvme_sq *sq;
+ struct spdk_vhost_nvme_cq *cq;
+ struct spdk_vhost_nvme_task *task;
+ uint32_t qid, dbbuf_sq;
+ int ret;
+ int count = -1;
+
+ if (spdk_unlikely(!nvme->num_sqs)) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ if (spdk_unlikely(!nvme->dataplane_started && !nvme->bar)) {
+ return SPDK_POLLER_IDLE;
+ }
+
+ for (qid = 1; qid <= MAX_IO_QUEUES; qid++) {
+
+ sq = vhost_nvme_get_sq_from_qid(nvme, qid);
+ if (!sq->valid) {
+ continue;
+ }
+ cq = vhost_nvme_get_cq_from_qid(nvme, sq->cqid);
+ if (spdk_unlikely(!cq)) {
+ return SPDK_POLLER_BUSY;
+ }
+ cq->guest_signaled_cq_head = vhost_nvme_get_queue_head(nvme, cq_offset(sq->cqid, 1));
+ if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) &&
+ !nvme_cq_is_full(cq))) {
+ task = STAILQ_FIRST(&cq->cq_full_waited_tasks);
+ STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq);
+ vhost_nvme_task_complete(task);
+ }
+
+ dbbuf_sq = vhost_nvme_get_queue_head(nvme, sq_offset(qid, 1));
+ sq->sq_tail = (uint16_t)dbbuf_sq;
+ count = 0;
+
+ while (sq->sq_head != sq->sq_tail) {
+ if (spdk_unlikely(!sq->sq_cmd)) {
+ break;
+ }
+ if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) {
+ task = STAILQ_FIRST(&nvme->free_tasks);
+ STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+ } else {
+ return SPDK_POLLER_BUSY;
+ }
+
+ task->cmd = sq->sq_cmd[sq->sq_head];
+ nvme_inc_sq_head(sq);
+
+ /* processing IO */
+ ret = nvme_process_sq(nvme, sq, task);
+ if (spdk_unlikely(ret)) {
+ SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, task->cmd.cid, sq->sq_head,
+ sq->sq_tail);
+ }
+
+ /* MMIO Control */
+ if (nvme->dataplane_started) {
+ nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1);
+ }
+
+ /* Maximum batch I/Os to pick up at once */
+ if (count++ == MAX_BATCH_IO) {
+ break;
+ }
+ }
+ }
+
+ /* Completion Queue */
+ nvme_cq_signal_fd(nvme);
+
+ return count;
+}
+
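+/*
+ * Handle the NVMe Doorbell Buffer Config admin command: PRP1 points to the
+ * shadow doorbell page the guest updates instead of issuing MMIO doorbell
+ * writes, and PRP2 points to the EventIdx page this target updates. Both
+ * pages must be 4 KiB aligned. (Explanatory note added for clarity.)
+ */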
+static int
+vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_vhost_session *vsession = nvme->vsession;
+ uint64_t dbs_dma_addr, eis_dma_addr;
+
+ dbs_dma_addr = cmd->dptr.prp.prp1;
+ eis_dma_addr = cmd->dptr.prp.prp2;
+
+ if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) {
+ return -1;
+ }
+ /* Guest Physical Address to Host Virtual Address */
+ nvme->dbbuf_dbs = vhost_gpa_to_vva(vsession, dbs_dma_addr, 4096);
+ nvme->dbbuf_eis = vhost_gpa_to_vva(vsession, eis_dma_addr, 4096);
+ if (!nvme->dbbuf_dbs || !nvme->dbbuf_eis) {
+ return -1;
+ }
+	/* Zero the doorbell buffer memory */
+ memset((void *)nvme->dbbuf_dbs, 0, 4096);
+ memset((void *)nvme->dbbuf_eis, 0, 4096);
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+
+ /* Data plane started */
+ nvme->dataplane_started = true;
+
+ return 0;
+}
+
+static int
+vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint16_t qid, qsize, cqid;
+ uint64_t dma_addr;
+ uint64_t requested_len;
+ struct spdk_vhost_nvme_cq *cq;
+ struct spdk_vhost_nvme_sq *sq;
+
+	/* physically contiguous */
+ if (!(cmd->cdw11 & 0x1)) {
+ return -1;
+ }
+
+ cqid = (cmd->cdw11 >> 16) & 0xffff;
+ qid = cmd->cdw10 & 0xffff;
+ qsize = (cmd->cdw10 >> 16) & 0xffff;
+ dma_addr = cmd->dptr.prp.prp1;
+ if (!dma_addr || dma_addr % 4096) {
+ return -1;
+ }
+
+ sq = vhost_nvme_get_sq_from_qid(nvme, qid);
+ cq = vhost_nvme_get_cq_from_qid(nvme, cqid);
+ if (!sq || !cq) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n",
+ qid, cqid);
+ cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+ return -1;
+ }
+
+ sq->sqid = qid;
+ sq->cqid = cqid;
+ sq->size = qsize + 1;
+ sq->sq_head = sq->sq_tail = 0;
+ requested_len = sizeof(struct spdk_nvme_cmd) * sq->size;
+ sq->sq_cmd = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len);
+ if (!sq->sq_cmd) {
+ return -1;
+ }
+ nvme->num_sqs++;
+ sq->valid = true;
+ if (nvme->bar) {
+ nvme->bar_db[sq_offset(qid, 1)] = 0;
+ }
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ return 0;
+}
+
+static int
+vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint16_t qid;
+ struct spdk_vhost_nvme_sq *sq;
+
+ qid = cmd->cdw10 & 0xffff;
+ sq = vhost_nvme_get_sq_from_qid(nvme, qid);
+ if (!sq) {
+ return -1;
+ }
+
+	/* We have not seen a case where a submission queue is
+	 * deleted while I/O is still running against it. If that
+	 * can happen, we must ensure the poller will not run
+	 * with this submission queue.
+	 */
+ nvme->num_sqs--;
+ sq->valid = false;
+
+ memset(sq, 0, sizeof(*sq));
+ sq->sq_cmd = NULL;
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+
+ return 0;
+}
+
+static int
+vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint16_t qsize, qid;
+ uint64_t dma_addr;
+ struct spdk_vhost_nvme_cq *cq;
+ uint64_t requested_len;
+
+	/* physically contiguous */
+ if (!(cmd->cdw11 & 0x1)) {
+ return -1;
+ }
+
+ qid = cmd->cdw10 & 0xffff;
+ qsize = (cmd->cdw10 >> 16) & 0xffff;
+ dma_addr = cmd->dptr.prp.prp1;
+ if (!dma_addr || dma_addr % 4096) {
+ return -1;
+ }
+
+ cq = vhost_nvme_get_cq_from_qid(nvme, qid);
+ if (!cq) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid);
+ cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER;
+ return -1;
+ }
+ cq->cqid = qid;
+ cq->size = qsize + 1;
+ cq->phase = 1;
+ cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1;
+ /* Setup virq through vhost messages */
+ cq->virq = -1;
+ cq->cq_head = 0;
+ cq->guest_signaled_cq_head = 0;
+ cq->need_signaled_cnt = 0;
+ requested_len = sizeof(struct spdk_nvme_cpl) * cq->size;
+ cq->cq_cqe = vhost_gpa_to_vva(nvme->vsession, dma_addr, requested_len);
+ if (!cq->cq_cqe) {
+ return -1;
+ }
+ nvme->num_cqs++;
+ cq->valid = true;
+ if (nvme->bar) {
+ nvme->bar_db[cq_offset(qid, 1)] = 0;
+ }
+ STAILQ_INIT(&cq->cq_full_waited_tasks);
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ return 0;
+}
+
+static int
+vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme,
+ struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl)
+{
+ uint16_t qid;
+ struct spdk_vhost_nvme_cq *cq;
+
+ qid = cmd->cdw10 & 0xffff;
+ cq = vhost_nvme_get_cq_from_qid(nvme, qid);
+ if (!cq) {
+ return -1;
+ }
+ nvme->num_cqs--;
+ cq->valid = false;
+
+ memset(cq, 0, sizeof(*cq));
+ cq->cq_cqe = NULL;
+
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ return 0;
+}
+
+static struct spdk_vhost_nvme_dev *
+vhost_nvme_get_by_name(int vid)
+{
+ struct spdk_vhost_nvme_dev *nvme;
+ struct spdk_vhost_dev *vdev;
+ struct spdk_vhost_session *vsession;
+
+ TAILQ_FOREACH(nvme, &g_nvme_ctrlrs, tailq) {
+ vdev = &nvme->vdev;
+ TAILQ_FOREACH(vsession, &vdev->vsessions, tailq) {
+ if (vsession->vid == vid) {
+ return nvme;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+int
+vhost_nvme_get_cap(int vid, uint64_t *cap)
+{
+ struct spdk_vhost_nvme_dev *nvme;
+
+ nvme = vhost_nvme_get_by_name(vid);
+ if (!nvme) {
+ return -1;
+ }
+
+ *cap = nvme->cap.raw;
+ return 0;
+}
+
+int
+vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf)
+{
+ struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd;
+ struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe;
+ struct spdk_vhost_nvme_ns *ns;
+ int ret = 0;
+ struct spdk_vhost_nvme_dev *nvme;
+
+ nvme = vhost_nvme_get_by_name(vid);
+ if (!nvme) {
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc);
+ switch (req->opc) {
+ case SPDK_NVME_OPC_IDENTIFY:
+ if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) {
+ memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data));
+
+ } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) {
+ ns = vhost_nvme_get_ns_from_nsid(nvme, req->nsid);
+ if (!ns) {
+ cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE;
+ cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC;
+ break;
+ }
+ memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data));
+ }
+		/* completed successfully */
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ break;
+ case SPDK_NVME_OPC_CREATE_IO_CQ:
+ ret = vhost_nvme_create_io_cq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_DELETE_IO_CQ:
+ ret = vhost_nvme_delete_io_cq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_CREATE_IO_SQ:
+ ret = vhost_nvme_create_io_sq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_DELETE_IO_SQ:
+ ret = vhost_nvme_delete_io_sq(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_GET_FEATURES:
+ case SPDK_NVME_OPC_SET_FEATURES:
+ if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) {
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16);
+ } else {
+ cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD;
+ cpl->status.sct = SPDK_NVME_SCT_GENERIC;
+ }
+ break;
+ case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG:
+ ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl);
+ break;
+ case SPDK_NVME_OPC_ABORT:
+		/* TODO: ABORT always fails for now */
+ cpl->cdw0 = 1;
+ cpl->status.sc = 0;
+ cpl->status.sct = 0;
+ break;
+ }
+
+ if (ret) {
+ SPDK_ERRLOG("Admin Passthrough Failed with %u\n", req->opc);
+ }
+
+ return 0;
+}
+
+int
+vhost_nvme_set_bar_mr(int vid, void *bar_addr, uint64_t bar_size)
+{
+ struct spdk_vhost_nvme_dev *nvme;
+
+ nvme = vhost_nvme_get_by_name(vid);
+ if (!nvme) {
+ return -1;
+ }
+
+ nvme->bar = (volatile uint32_t *)(uintptr_t)(bar_addr);
+ /* BAR0 SQ/CQ doorbell registers start from offset 0x1000 */
+ nvme->bar_db = (volatile uint32_t *)(uintptr_t)(bar_addr + 0x1000ull);
+ nvme->bar_size = bar_size;
+
+ return 0;
+}
+
+int
+vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd)
+{
+ struct spdk_vhost_nvme_dev *nvme;
+ struct spdk_vhost_nvme_cq *cq;
+
+ nvme = vhost_nvme_get_by_name(vid);
+ if (!nvme) {
+ return -1;
+ }
+
+ cq = vhost_nvme_get_cq_from_qid(nvme, qid);
+ if (!cq) {
+ return -1;
+ }
+ if (cq->irq_enabled) {
+ cq->virq = fd;
+ } else {
+ SPDK_ERRLOG("NVMe Qid %d Disabled IRQ\n", qid);
+ }
+
+ return 0;
+}
+
+static void
+free_task_pool(struct spdk_vhost_nvme_dev *nvme)
+{
+ struct spdk_vhost_nvme_task *task;
+
+ while (!STAILQ_EMPTY(&nvme->free_tasks)) {
+ task = STAILQ_FIRST(&nvme->free_tasks);
+ STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq);
+ spdk_free(task);
+ }
+}
+
+static int
+alloc_task_pool(struct spdk_vhost_nvme_dev *nvme)
+{
+ uint32_t entries, i;
+ struct spdk_vhost_nvme_task *task;
+
+ entries = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED;
+
+ for (i = 0; i < entries; i++) {
+ task = spdk_zmalloc(sizeof(struct spdk_vhost_nvme_task),
+ SPDK_CACHE_LINE_SIZE, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (task == NULL) {
+ SPDK_ERRLOG("Controller %s alloc task pool failed\n",
+ nvme->vdev.name);
+ free_task_pool(nvme);
+ return -1;
+ }
+ STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq);
+ }
+
+ return 0;
+}
+
+static int
+vhost_nvme_start_cb(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *unused)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns_dev;
+ uint32_t i;
+ int rc = 0;
+
+ if (nvme == NULL) {
+ rc = -1;
+ goto out;
+ }
+
+ rc = alloc_task_pool(nvme);
+ if (rc) {
+ goto out;
+ }
+
+ SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vsession->vid,
+ vdev->path, spdk_env_get_current_core());
+
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns_dev = &nvme->ns[i];
+ ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc);
+ if (!ns_dev->bdev_io_channel) {
+ rc = -1;
+ goto out;
+ }
+ }
+
+ nvme->vsession = vsession;
+ /* Start the NVMe Poller */
+ nvme->requestq_poller = SPDK_POLLER_REGISTER(nvme_worker, nvme, 0);
+
+out:
+ vhost_session_start_done(vsession, rc);
+ return rc;
+}
+
+static int
+vhost_nvme_start(struct spdk_vhost_session *vsession)
+{
+ if (vsession->vdev->active_session_num > 0) {
+ /* We're trying to start a second session */
+ SPDK_ERRLOG("Vhost-NVMe devices can support only one simultaneous connection.\n");
+ return -1;
+ }
+
+ return vhost_session_send_event(vsession, vhost_nvme_start_cb,
+ 3, "start session");
+}
+
+static void
+vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns)
+{
+ ns->active_ns = 0;
+ spdk_bdev_close(ns->bdev_desc);
+ ns->bdev_desc = NULL;
+ ns->bdev = NULL;
+}
+
+static void
+bdev_remove_cb(void *remove_ctx)
+{
+ struct spdk_vhost_nvme_ns *ns = remove_ctx;
+
+ SPDK_NOTICELOG("Removing NS %u, Block Device %s\n",
+ ns->nsid, spdk_bdev_get_name(ns->bdev));
+
+ vhost_nvme_deactive_ns(ns);
+}
+
+static int
+destroy_device_poller_cb(void *arg)
+{
+ struct spdk_vhost_nvme_dev *nvme = arg;
+ struct spdk_vhost_nvme_ns *ns_dev;
+ uint32_t i;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n");
+
+ /* FIXME wait for pending I/Os to complete */
+
+ if (spdk_vhost_trylock() != 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns_dev = &nvme->ns[i];
+ if (ns_dev->bdev_io_channel) {
+ spdk_put_io_channel(ns_dev->bdev_io_channel);
+ ns_dev->bdev_io_channel = NULL;
+ }
+ }
+ /* Clear BAR space */
+ if (nvme->bar) {
+ memset((void *)nvme->bar, 0, nvme->bar_size);
+ }
+ nvme->num_sqs = 0;
+ nvme->num_cqs = 0;
+ nvme->dbbuf_dbs = NULL;
+ nvme->dbbuf_eis = NULL;
+ nvme->dataplane_started = false;
+
+ spdk_poller_unregister(&nvme->stop_poller);
+ vhost_session_stop_done(nvme->vsession, 0);
+
+ spdk_vhost_unlock();
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+vhost_nvme_stop_cb(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *unused)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+
+ if (nvme == NULL) {
+ vhost_session_stop_done(vsession, -1);
+ return -1;
+ }
+
+ free_task_pool(nvme);
+ SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vsession->vid, vdev->path);
+
+ spdk_poller_unregister(&nvme->requestq_poller);
+ nvme->stop_poller = SPDK_POLLER_REGISTER(destroy_device_poller_cb, nvme, 1000);
+
+ return 0;
+}
+
+static int
+vhost_nvme_stop(struct spdk_vhost_session *vsession)
+{
+ return vhost_session_send_event(vsession, vhost_nvme_stop_cb,
+ 3, "start session");
+}
+
+static void
+vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns_dev;
+ uint32_t i;
+
+ if (nvme == NULL) {
+ return;
+ }
+
+ spdk_json_write_named_array_begin(w, "namespaces");
+
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns_dev = &nvme->ns[i];
+ if (!ns_dev->active_ns) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_uint32(w, "nsid", ns_dev->nsid);
+ spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns_dev->bdev));
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_array_end(w);
+}
+
+static void
+vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns_dev;
+ uint32_t i;
+
+ if (nvme == NULL) {
+ return;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "vhost_create_nvme_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name);
+ spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues);
+ spdk_json_write_named_string(w, "cpumask",
+ spdk_cpuset_fmt(spdk_thread_get_cpumask(nvme->vdev.thread)));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns_dev = &nvme->ns[i];
+ if (!ns_dev->active_ns) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "vhost_nvme_controller_add_ns");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name);
+ spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns_dev->bdev));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+}
+
+static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = {
+ .session_ctx_size = 0,
+ .start_session = vhost_nvme_start,
+ .stop_session = vhost_nvme_stop,
+ .dump_info_json = vhost_nvme_dump_info_json,
+ .write_config_json = vhost_nvme_write_config_json,
+ .remove_device = vhost_nvme_dev_remove,
+};
+
+static int
+vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev)
+{
+ struct spdk_nvme_ctrlr_data *cdata = &dev->cdata;
+ struct spdk_nvme_ns_data *nsdata;
+ uint64_t num_blocks;
+ uint32_t i;
+
+ /* Identify Namespace */
+ cdata->nn = dev->num_ns;
+ for (i = 0; i < dev->num_ns; i++) {
+ nsdata = &dev->ns[i].nsdata;
+ if (dev->ns[i].active_ns) {
+ num_blocks = spdk_bdev_get_num_blocks(dev->ns[i].bdev);
+ nsdata->nsze = num_blocks;
+ /* ncap must be non-zero for active Namespace */
+ nsdata->ncap = num_blocks;
+ nsdata->nuse = num_blocks;
+ nsdata->nlbaf = 0;
+ nsdata->flbas.format = 0;
+ nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(dev->ns[i].bdev));
+ nsdata->noiob = spdk_bdev_get_optimal_io_boundary(dev->ns[i].bdev);
+ dev->ns[i].block_size = spdk_bdev_get_block_size(dev->ns[i].bdev);
+ dev->ns[i].capacity = num_blocks * dev->ns[i].block_size;
+ } else {
+ memset(nsdata, 0, sizeof(*nsdata));
+ }
+ }
+ return 0;
+}
+
+static int
+vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev)
+{
+ struct spdk_nvme_ctrlr_data *cdata = &dev->cdata;
+ char sn[20];
+
+ /* Controller Capabilities */
+ dev->cap.bits.cqr = 1;
+ dev->cap.bits.to = 1;
+ dev->cap.bits.dstrd = 0;
+ dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM;
+ dev->cap.bits.mpsmin = 0;
+ dev->cap.bits.mpsmax = 0;
+ /* MQES is 0 based value */
+ dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1;
+
+ /* Controller Configuration */
+ dev->cc.bits.en = 0;
+
+ /* Controller Status */
+ dev->csts.bits.rdy = 0;
+
+ /* Identify Controller */
+ spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' ');
+ cdata->vid = 0x8086;
+ cdata->ssvid = 0x8086;
+ spdk_strcpy_pad(cdata->mn, "SPDK Virtual NVMe Controller", sizeof(cdata->mn), ' ');
+ snprintf(sn, sizeof(sn), "NVMe_%s", dev->vdev.name);
+ spdk_strcpy_pad(cdata->sn, sn, sizeof(cdata->sn), ' ');
+ cdata->ieee[0] = 0xe4;
+ cdata->ieee[1] = 0xd2;
+ cdata->ieee[2] = 0x5c;
+ cdata->ver.bits.mjr = 1;
+ cdata->ver.bits.mnr = 0;
+	cdata->mdts = 5; /* 2^5 * 4 KiB (CAP.MPSMIN page size) = 128 KiB */
+ cdata->rab = 6;
+ cdata->sqes.min = 6;
+ cdata->sqes.max = 6;
+ cdata->cqes.min = 4;
+ cdata->cqes.max = 4;
+ cdata->oncs.dsm = 1;
+ /* Emulated NVMe controller */
+ cdata->oacs.doorbell_buffer_config = 1;
+
+ vhost_nvme_ns_identify_update(dev);
+
+ return 0;
+}
+
+int
+vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues)
+{
+ struct spdk_vhost_nvme_dev *dev;
+ int rc;
+
+ if (posix_memalign((void **)&dev, SPDK_CACHE_LINE_SIZE, sizeof(*dev))) {
+ return -ENOMEM;
+ }
+ memset(dev, 0, sizeof(*dev));
+
+ if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) {
+ free(dev);
+ return -EINVAL;
+ }
+
+ spdk_vhost_lock();
+ rc = vhost_dev_register(&dev->vdev, name, cpumask,
+ &spdk_vhost_nvme_device_backend);
+
+ if (rc) {
+ free(dev);
+ spdk_vhost_unlock();
+ return rc;
+ }
+
+ dev->num_io_queues = num_io_queues;
+ STAILQ_INIT(&dev->free_tasks);
+ TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq);
+
+ vhost_nvme_ctrlr_identify_update(dev);
+
+ SPDK_NOTICELOG("Controller %s: Constructed\n", name);
+ spdk_vhost_unlock();
+ return rc;
+}
+
+int
+vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns;
+ int rc;
+ uint32_t i;
+
+ if (nvme == NULL) {
+ return -EINVAL;
+ }
+
+ TAILQ_REMOVE(&g_nvme_ctrlrs, nvme, tailq);
+ for (i = 0; i < nvme->num_ns; i++) {
+ ns = &nvme->ns[i];
+ if (ns->active_ns) {
+ vhost_nvme_deactive_ns(ns);
+ }
+ }
+
+ rc = vhost_dev_unregister(vdev);
+ if (rc != 0) {
+ return rc;
+ }
+
+ free(nvme);
+ return 0;
+}
+
+int
+vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name)
+{
+ struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev);
+ struct spdk_vhost_nvme_ns *ns;
+ struct spdk_bdev *bdev;
+ int rc = -1;
+
+ if (nvme == NULL) {
+ return -ENODEV;
+ }
+
+ if (nvme->num_ns == MAX_NAMESPACE) {
+ SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns);
+ return -ENOSPC;
+ }
+
+ bdev = spdk_bdev_get_by_name(bdev_name);
+ if (!bdev) {
+ SPDK_ERRLOG("could not find bdev %s\n", bdev_name);
+ return -ENODEV;
+ }
+
+ ns = &nvme->ns[nvme->num_ns];
+ rc = spdk_bdev_open(bdev, true, bdev_remove_cb, ns, &nvme->ns[nvme->num_ns].bdev_desc);
+ if (rc != 0) {
+ SPDK_ERRLOG("Could not open bdev '%s', error=%d\n",
+ bdev_name, rc);
+ return rc;
+ }
+
+ nvme->ns[nvme->num_ns].bdev = bdev;
+ nvme->ns[nvme->num_ns].active_ns = 1;
+ nvme->ns[nvme->num_ns].nsid = nvme->num_ns + 1;
+ nvme->num_ns++;
+
+ vhost_nvme_ns_identify_update(nvme);
+
+ return rc;
+}
+
+int
+vhost_nvme_controller_construct(void)
+{
+ struct spdk_conf_section *sp;
+ const char *name;
+ const char *bdev_name;
+ const char *cpumask;
+ int rc, i = 0;
+ struct spdk_vhost_dev *vdev;
+ uint32_t ctrlr_num, io_queues;
+
+ for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) {
+ if (!spdk_conf_section_match_prefix(sp, "VhostNvme")) {
+ continue;
+ }
+
+ if (sscanf(spdk_conf_section_get_name(sp), "VhostNvme%u", &ctrlr_num) != 1) {
+ SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+ spdk_conf_section_get_name(sp));
+ return -1;
+ }
+
+ name = spdk_conf_section_get_val(sp, "Name");
+ if (name == NULL) {
+ SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num);
+ return -1;
+ }
+
+ cpumask = spdk_conf_section_get_val(sp, "Cpumask");
+ rc = spdk_conf_section_get_intval(sp, "NumberOfQueues");
+ if (rc > 0) {
+ io_queues = rc;
+ } else {
+ io_queues = 1;
+ }
+
+ rc = vhost_nvme_dev_construct(name, cpumask, io_queues);
+ if (rc < 0) {
+ SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num);
+ return -1;
+ }
+
+ vdev = spdk_vhost_dev_find(name);
+ if (!vdev) {
+ return -1;
+ }
+
+ for (i = 0; spdk_conf_section_get_nval(sp, "Namespace", i) != NULL; i++) {
+ bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0);
+ if (!bdev_name) {
+ SPDK_ERRLOG("namespace configuration missing bdev name\n");
+ break;
+ }
+ rc = vhost_nvme_dev_add_ns(vdev, bdev_name);
+ if (rc < 0) {
+ SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n",
+ ctrlr_num, bdev_name);
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME)
diff --git a/src/spdk/lib/vhost/vhost_rpc.c b/src/spdk/lib/vhost/vhost_rpc.c
new file mode 100644
index 000000000..196d75918
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_rpc.c
@@ -0,0 +1,652 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/log.h"
+#include "spdk/rpc.h"
+#include "spdk/util.h"
+#include "spdk/string.h"
+#include "spdk/env.h"
+
+#include "spdk/scsi.h"
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+#include "spdk/bdev.h"
+
+struct rpc_vhost_scsi_ctrlr {
+ char *ctrlr;
+ char *cpumask;
+};
+
+static void
+free_rpc_vhost_scsi_ctrlr(struct rpc_vhost_scsi_ctrlr *req)
+{
+ free(req->ctrlr);
+ free(req->cpumask);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_create_scsi_ctrlr[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string },
+ {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true},
+};
+
+static void
+rpc_vhost_create_scsi_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_scsi_ctrlr req = {0};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_vhost_create_scsi_ctrlr,
+ SPDK_COUNTOF(rpc_vhost_create_scsi_ctrlr),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_scsi_dev_construct(req.ctrlr, req.cpumask);
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_vhost_scsi_ctrlr(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_vhost_scsi_ctrlr(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("vhost_create_scsi_controller", rpc_vhost_create_scsi_controller,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_scsi_controller, construct_vhost_scsi_controller)
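+
+/*
+ * Illustrative JSON-RPC request for the method above (editor's example,
+ * parameter values are hypothetical):
+ *   {"jsonrpc": "2.0", "id": 1, "method": "vhost_create_scsi_controller",
+ *    "params": {"ctrlr": "vhost.0", "cpumask": "0x1"}}
+ */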
+
+struct rpc_vhost_scsi_ctrlr_add_target {
+ char *ctrlr;
+ int32_t scsi_target_num;
+ char *bdev_name;
+};
+
+static void
+free_rpc_vhost_scsi_ctrlr_add_target(struct rpc_vhost_scsi_ctrlr_add_target *req)
+{
+ free(req->ctrlr);
+ free(req->bdev_name);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_scsi_ctrlr_add_target[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, ctrlr), spdk_json_decode_string },
+ {"scsi_target_num", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, scsi_target_num), spdk_json_decode_int32},
+ {"bdev_name", offsetof(struct rpc_vhost_scsi_ctrlr_add_target, bdev_name), spdk_json_decode_string },
+};
+
+static void
+rpc_vhost_scsi_controller_add_target(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_scsi_ctrlr_add_target req = {0};
+ struct spdk_json_write_ctx *w;
+ struct spdk_vhost_dev *vdev;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_vhost_scsi_ctrlr_add_target,
+ SPDK_COUNTOF(rpc_vhost_scsi_ctrlr_add_target),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_lock();
+ vdev = spdk_vhost_dev_find(req.ctrlr);
+ if (vdev == NULL) {
+ spdk_vhost_unlock();
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_scsi_dev_add_tgt(vdev, req.scsi_target_num, req.bdev_name);
+ spdk_vhost_unlock();
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_vhost_scsi_ctrlr_add_target(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_int32(w, rc);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_vhost_scsi_ctrlr_add_target(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("vhost_scsi_controller_add_target", rpc_vhost_scsi_controller_add_target,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_add_target, add_vhost_scsi_lun)
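+
+/*
+ * Illustrative JSON-RPC request for the method above (editor's example,
+ * parameter values are hypothetical):
+ *   {"jsonrpc": "2.0", "id": 1, "method": "vhost_scsi_controller_add_target",
+ *    "params": {"ctrlr": "vhost.0", "scsi_target_num": 0, "bdev_name": "Malloc0"}}
+ */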
+
+struct rpc_remove_vhost_scsi_ctrlr_target {
+ char *ctrlr;
+ uint32_t scsi_target_num;
+};
+
+static void
+free_rpc_remove_vhost_scsi_ctrlr_target(struct rpc_remove_vhost_scsi_ctrlr_target *req)
+{
+ free(req->ctrlr);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_remove_target[] = {
+ {"ctrlr", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, ctrlr), spdk_json_decode_string },
+ {"scsi_target_num", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, scsi_target_num), spdk_json_decode_uint32},
+};
+
+static int
+rpc_vhost_scsi_controller_remove_target_finish_cb(struct spdk_vhost_dev *vdev, void *arg)
+{
+ struct spdk_jsonrpc_request *request = arg;
+ struct spdk_json_write_ctx *w;
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return 0;
+}
+
+static void
+rpc_vhost_scsi_controller_remove_target(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_remove_vhost_scsi_ctrlr_target req = {0};
+ struct spdk_vhost_dev *vdev;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_vhost_remove_target,
+ SPDK_COUNTOF(rpc_vhost_remove_target),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_lock();
+ vdev = spdk_vhost_dev_find(req.ctrlr);
+ if (vdev == NULL) {
+ spdk_vhost_unlock();
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_scsi_dev_remove_tgt(vdev, req.scsi_target_num,
+ rpc_vhost_scsi_controller_remove_target_finish_cb,
+ request);
+ spdk_vhost_unlock();
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_remove_vhost_scsi_ctrlr_target(&req);
+ return;
+
+invalid:
+ free_rpc_remove_vhost_scsi_ctrlr_target(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+
+SPDK_RPC_REGISTER("vhost_scsi_controller_remove_target",
+ rpc_vhost_scsi_controller_remove_target, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_scsi_controller_remove_target, remove_vhost_scsi_target)
+
+struct rpc_vhost_blk_ctrlr {
+ char *ctrlr;
+ char *dev_name;
+ char *cpumask;
+ bool readonly;
+ bool packed_ring;
+};
+
+static const struct spdk_json_object_decoder rpc_construct_vhost_blk_ctrlr[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_blk_ctrlr, ctrlr), spdk_json_decode_string },
+ {"dev_name", offsetof(struct rpc_vhost_blk_ctrlr, dev_name), spdk_json_decode_string },
+ {"cpumask", offsetof(struct rpc_vhost_blk_ctrlr, cpumask), spdk_json_decode_string, true},
+ {"readonly", offsetof(struct rpc_vhost_blk_ctrlr, readonly), spdk_json_decode_bool, true},
+ {"packed_ring", offsetof(struct rpc_vhost_blk_ctrlr, packed_ring), spdk_json_decode_bool, true},
+};
+
+static void
+free_rpc_vhost_blk_ctrlr(struct rpc_vhost_blk_ctrlr *req)
+{
+ free(req->ctrlr);
+ free(req->dev_name);
+ free(req->cpumask);
+}
+
+static void
+rpc_vhost_create_blk_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_blk_ctrlr req = {0};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_vhost_blk_ctrlr,
+ SPDK_COUNTOF(rpc_construct_vhost_blk_ctrlr),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_blk_construct(req.ctrlr, req.cpumask, req.dev_name,
+ req.readonly, req.packed_ring);
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_vhost_blk_ctrlr(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_vhost_blk_ctrlr(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("vhost_create_blk_controller", rpc_vhost_create_blk_controller,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_blk_controller, construct_vhost_blk_controller)
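+
+/*
+ * Illustrative JSON-RPC request for the method above (editor's example,
+ * parameter values are hypothetical):
+ *   {"jsonrpc": "2.0", "id": 1, "method": "vhost_create_blk_controller",
+ *    "params": {"ctrlr": "vhost.1", "dev_name": "Malloc0", "readonly": false}}
+ */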
+
+struct rpc_delete_vhost_ctrlr {
+ char *ctrlr;
+};
+
+static const struct spdk_json_object_decoder rpc_delete_vhost_ctrlr_decoder[] = {
+ {"ctrlr", offsetof(struct rpc_delete_vhost_ctrlr, ctrlr), spdk_json_decode_string },
+};
+
+static void
+free_rpc_delete_vhost_ctrlr(struct rpc_delete_vhost_ctrlr *req)
+{
+ free(req->ctrlr);
+}
+
+static void
+rpc_vhost_delete_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_delete_vhost_ctrlr req = {0};
+ struct spdk_json_write_ctx *w;
+ struct spdk_vhost_dev *vdev;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_delete_vhost_ctrlr_decoder,
+ SPDK_COUNTOF(rpc_delete_vhost_ctrlr_decoder), &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_lock();
+ vdev = spdk_vhost_dev_find(req.ctrlr);
+ if (vdev == NULL) {
+ spdk_vhost_unlock();
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_dev_remove(vdev);
+ spdk_vhost_unlock();
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_delete_vhost_ctrlr(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+ return;
+
+invalid:
+ free_rpc_delete_vhost_ctrlr(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("vhost_delete_controller", rpc_vhost_delete_controller, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_delete_controller, remove_vhost_controller)
+
+struct rpc_get_vhost_ctrlrs {
+ char *name;
+};
+
+static void
+_rpc_get_vhost_controller(struct spdk_json_write_ctx *w, struct spdk_vhost_dev *vdev)
+{
+ uint32_t delay_base_us, iops_threshold;
+
+ spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold);
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_string(w, "ctrlr", spdk_vhost_dev_get_name(vdev));
+ spdk_json_write_named_string_fmt(w, "cpumask", "0x%s",
+ spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
+ spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us);
+ spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold);
+ spdk_json_write_named_string(w, "socket", vdev->path);
+
+ spdk_json_write_named_object_begin(w, "backend_specific");
+ vhost_dump_info_json(vdev, w);
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+}
+
+static const struct spdk_json_object_decoder rpc_get_vhost_ctrlrs_decoders[] = {
+ {"name", offsetof(struct rpc_get_vhost_ctrlrs, name), spdk_json_decode_string, true},
+};
+
+static void
+free_rpc_get_vhost_ctrlrs(struct rpc_get_vhost_ctrlrs *req)
+{
+ free(req->name);
+}
+
+static void
+rpc_vhost_get_controllers(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_get_vhost_ctrlrs req = {0};
+ struct spdk_json_write_ctx *w;
+ struct spdk_vhost_dev *vdev;
+ int rc;
+
+ if (params && spdk_json_decode_object(params, rpc_get_vhost_ctrlrs_decoders,
+ SPDK_COUNTOF(rpc_get_vhost_ctrlrs_decoders), &req)) {
+ SPDK_ERRLOG("spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_lock();
+ if (req.name != NULL) {
+ vdev = spdk_vhost_dev_find(req.name);
+ if (vdev == NULL) {
+ spdk_vhost_unlock();
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ free_rpc_get_vhost_ctrlrs(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ _rpc_get_vhost_controller(w, vdev);
+ spdk_vhost_unlock();
+
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+ }
+
+ free_rpc_get_vhost_ctrlrs(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_array_begin(w);
+
+ vdev = spdk_vhost_dev_next(NULL);
+ while (vdev != NULL) {
+ _rpc_get_vhost_controller(w, vdev);
+ vdev = spdk_vhost_dev_next(vdev);
+ }
+ spdk_vhost_unlock();
+
+ spdk_json_write_array_end(w);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_get_vhost_ctrlrs(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("vhost_get_controllers", rpc_vhost_get_controllers, SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_get_controllers, get_vhost_controllers)
+
+
+struct rpc_vhost_ctrlr_coalescing {
+ char *ctrlr;
+ uint32_t delay_base_us;
+ uint32_t iops_threshold;
+};
+
+static const struct spdk_json_object_decoder rpc_set_vhost_ctrlr_coalescing[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_ctrlr_coalescing, ctrlr), spdk_json_decode_string },
+ {"delay_base_us", offsetof(struct rpc_vhost_ctrlr_coalescing, delay_base_us), spdk_json_decode_uint32},
+ {"iops_threshold", offsetof(struct rpc_vhost_ctrlr_coalescing, iops_threshold), spdk_json_decode_uint32},
+};
+
+static void
+free_rpc_set_vhost_controllers_event_coalescing(struct rpc_vhost_ctrlr_coalescing *req)
+{
+ free(req->ctrlr);
+}
+
+static void
+rpc_vhost_controller_set_coalescing(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_ctrlr_coalescing req = {0};
+ struct spdk_json_write_ctx *w;
+ struct spdk_vhost_dev *vdev;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_set_vhost_ctrlr_coalescing,
+ SPDK_COUNTOF(rpc_set_vhost_ctrlr_coalescing), &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_lock();
+ vdev = spdk_vhost_dev_find(req.ctrlr);
+ if (vdev == NULL) {
+ spdk_vhost_unlock();
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = spdk_vhost_set_coalescing(vdev, req.delay_base_us, req.iops_threshold);
+ spdk_vhost_unlock();
+ if (rc) {
+ goto invalid;
+ }
+
+ free_rpc_set_vhost_controllers_event_coalescing(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+
+ return;
+
+invalid:
+ free_rpc_set_vhost_controllers_event_coalescing(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("vhost_controller_set_coalescing", rpc_vhost_controller_set_coalescing,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_controller_set_coalescing, set_vhost_controller_coalescing)
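+
+/*
+ * Illustrative JSON-RPC request for the method above (editor's example,
+ * parameter values are hypothetical):
+ *   {"jsonrpc": "2.0", "id": 1, "method": "vhost_controller_set_coalescing",
+ *    "params": {"ctrlr": "vhost.0", "delay_base_us": 80, "iops_threshold": 100000}}
+ */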
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+
+struct rpc_vhost_nvme_ctrlr {
+ char *ctrlr;
+ uint32_t io_queues;
+ char *cpumask;
+};
+
+static const struct spdk_json_object_decoder rpc_construct_vhost_nvme_ctrlr[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr, ctrlr), spdk_json_decode_string },
+ {"io_queues", offsetof(struct rpc_vhost_nvme_ctrlr, io_queues), spdk_json_decode_uint32},
+ {"cpumask", offsetof(struct rpc_vhost_nvme_ctrlr, cpumask), spdk_json_decode_string, true},
+};
+
+static void
+free_rpc_vhost_nvme_ctrlr(struct rpc_vhost_nvme_ctrlr *req)
+{
+ free(req->ctrlr);
+ free(req->cpumask);
+}
+
+static void
+rpc_vhost_create_nvme_controller(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_nvme_ctrlr req = {};
+ struct spdk_json_write_ctx *w;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_construct_vhost_nvme_ctrlr,
+ SPDK_COUNTOF(rpc_construct_vhost_nvme_ctrlr),
+ &req)) {
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ rc = vhost_nvme_dev_construct(req.ctrlr, req.cpumask, req.io_queues);
+ if (rc < 0) {
+ goto invalid;
+ }
+
+ free_rpc_vhost_nvme_ctrlr(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_vhost_nvme_ctrlr(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+
+}
+SPDK_RPC_REGISTER("vhost_create_nvme_controller", rpc_vhost_create_nvme_controller,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_create_nvme_controller, construct_vhost_nvme_controller)
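+
+/*
+ * Illustrative JSON-RPC request for the method above (editor's example,
+ * parameter values are hypothetical):
+ *   {"jsonrpc": "2.0", "id": 1, "method": "vhost_create_nvme_controller",
+ *    "params": {"ctrlr": "vhost.nvme.0", "io_queues": 4, "cpumask": "0x1"}}
+ */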
+
+struct rpc_vhost_nvme_ctrlr_add_ns {
+ char *ctrlr;
+ char *bdev_name;
+};
+
+static void
+free_rpc_vhost_nvme_ctrlr_add_ns(struct rpc_vhost_nvme_ctrlr_add_ns *req)
+{
+ free(req->ctrlr);
+ free(req->bdev_name);
+}
+
+static const struct spdk_json_object_decoder rpc_vhost_nvme_add_ns[] = {
+ {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, ctrlr), spdk_json_decode_string },
+ {"bdev_name", offsetof(struct rpc_vhost_nvme_ctrlr_add_ns, bdev_name), spdk_json_decode_string },
+};
+
+static void
+rpc_vhost_nvme_controller_add_ns(struct spdk_jsonrpc_request *request,
+ const struct spdk_json_val *params)
+{
+ struct rpc_vhost_nvme_ctrlr_add_ns req = {0};
+ struct spdk_json_write_ctx *w;
+ struct spdk_vhost_dev *vdev;
+ int rc;
+
+ if (spdk_json_decode_object(params, rpc_vhost_nvme_add_ns,
+ SPDK_COUNTOF(rpc_vhost_nvme_add_ns),
+ &req)) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n");
+ rc = -EINVAL;
+ goto invalid;
+ }
+
+ spdk_vhost_lock();
+ vdev = spdk_vhost_dev_find(req.ctrlr);
+ if (vdev == NULL) {
+ spdk_vhost_unlock();
+ rc = -ENODEV;
+ goto invalid;
+ }
+
+ rc = vhost_nvme_dev_add_ns(vdev, req.bdev_name);
+ spdk_vhost_unlock();
+ if (rc < 0) {
+ goto invalid;
+ }
+ free_rpc_vhost_nvme_ctrlr_add_ns(&req);
+
+ w = spdk_jsonrpc_begin_result(request);
+ spdk_json_write_bool(w, true);
+ spdk_jsonrpc_end_result(request, w);
+ return;
+
+invalid:
+ free_rpc_vhost_nvme_ctrlr_add_ns(&req);
+ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+ spdk_strerror(-rc));
+}
+SPDK_RPC_REGISTER("vhost_nvme_controller_add_ns", rpc_vhost_nvme_controller_add_ns,
+ SPDK_RPC_RUNTIME)
+SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vhost_nvme_controller_add_ns, add_vhost_nvme_ns)
+
+#endif /* SPDK_CONFIG_VHOST_INTERNAL_LIB */
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_rpc", SPDK_LOG_VHOST_RPC)
diff --git a/src/spdk/lib/vhost/vhost_scsi.c b/src/spdk/lib/vhost/vhost_scsi.c
new file mode 100644
index 000000000..49e49dc76
--- /dev/null
+++ b/src/spdk/lib/vhost/vhost_scsi.c
@@ -0,0 +1,1536 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include <linux/virtio_scsi.h>
+
+#include "spdk/env.h"
+#include "spdk/thread.h"
+#include "spdk/scsi.h"
+#include "spdk/scsi_spec.h"
+#include "spdk/conf.h"
+#include "spdk/util.h"
+#include "spdk/likely.h"
+
+#include "spdk/vhost.h"
+#include "vhost_internal.h"
+
+/* Features supported by SPDK VHOST lib. */
+#define SPDK_VHOST_SCSI_FEATURES (SPDK_VHOST_FEATURES | \
+				(1ULL << VIRTIO_SCSI_F_INOUT) | \
+				(1ULL << VIRTIO_SCSI_F_HOTPLUG) | \
+				(1ULL << VIRTIO_SCSI_F_CHANGE) | \
+				(1ULL << VIRTIO_SCSI_F_T10_PI))
+
+/* Features defined by the VIRTIO SCSI spec but not currently supported:
+ * - Live migration
+ * - T10 PI
+ */
+#define SPDK_VHOST_SCSI_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \
+				(1ULL << VIRTIO_SCSI_F_T10_PI))
+
+#define MGMT_POLL_PERIOD_US (1000 * 5)
+
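+/* Fixed virtqueue layout defined by the virtio-scsi spec: queue 0 is the
+ * control queue, queue 1 is the event queue, and queues 2 and up carry
+ * I/O requests.
+ */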
+#define VIRTIO_SCSI_CONTROLQ 0
+#define VIRTIO_SCSI_EVENTQ 1
+#define VIRTIO_SCSI_REQUESTQ 2
+
+enum spdk_scsi_dev_vhost_status {
+ /* Target ID is empty. */
+ VHOST_SCSI_DEV_EMPTY,
+
+ /* Target is still being added. */
+ VHOST_SCSI_DEV_ADDING,
+
+ /* Target ID occupied. */
+ VHOST_SCSI_DEV_PRESENT,
+
+ /* Target ID is occupied but removal is in progress. */
+ VHOST_SCSI_DEV_REMOVING,
+
+	/* In a session: the SCSI target was seen but has since been removed. */
+ VHOST_SCSI_DEV_REMOVED,
+};
+
+/** Context for a SCSI target in a vhost device */
+struct spdk_scsi_dev_vhost_state {
+ struct spdk_scsi_dev *dev;
+ enum spdk_scsi_dev_vhost_status status;
+ spdk_vhost_event_fn remove_cb;
+ void *remove_ctx;
+};
+
+struct spdk_vhost_scsi_dev {
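+	/* Number of SCSI targets currently attached. The device is freed only
+	 * after it has been unregistered and the last target is removed.
+	 */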
+ int ref;
+ bool registered;
+ struct spdk_vhost_dev vdev;
+ struct spdk_scsi_dev_vhost_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS];
+};
+
+/** Context for a SCSI target in a vhost session */
+struct spdk_scsi_dev_session_state {
+ struct spdk_scsi_dev *dev;
+ enum spdk_scsi_dev_vhost_status status;
+};
+
+struct spdk_vhost_scsi_session {
+ struct spdk_vhost_session vsession;
+
+ struct spdk_vhost_scsi_dev *svdev;
+ /** Local copy of the device state */
+ struct spdk_scsi_dev_session_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS];
+ struct spdk_poller *requestq_poller;
+ struct spdk_poller *mgmt_poller;
+ struct spdk_poller *stop_poller;
+};
+
+struct spdk_vhost_scsi_task {
+ struct spdk_scsi_task scsi;
+ struct iovec iovs[SPDK_VHOST_IOVS_MAX];
+
+ union {
+ struct virtio_scsi_cmd_resp *resp;
+ struct virtio_scsi_ctrl_tmf_resp *tmf_resp;
+ };
+
+ struct spdk_vhost_scsi_session *svsession;
+ struct spdk_scsi_dev *scsi_dev;
+
+ /** Number of bytes that were written. */
+ uint32_t used_len;
+
+ int req_idx;
+
+ /* If set, the task is currently used for I/O processing. */
+ bool used;
+
+ struct spdk_vhost_virtqueue *vq;
+};
+
+static int vhost_scsi_start(struct spdk_vhost_session *vsession);
+static int vhost_scsi_stop(struct spdk_vhost_session *vsession);
+static void vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev,
+ struct spdk_json_write_ctx *w);
+static void vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev,
+ struct spdk_json_write_ctx *w);
+static int vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev);
+
+static const struct spdk_vhost_dev_backend spdk_vhost_scsi_device_backend = {
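+	/* Size of the vhost-scsi-specific part of the session context, which
+	 * the generic vhost layer allocates on top of spdk_vhost_session.
+	 */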
+ .session_ctx_size = sizeof(struct spdk_vhost_scsi_session) - sizeof(struct spdk_vhost_session),
+ .start_session = vhost_scsi_start,
+ .stop_session = vhost_scsi_stop,
+ .dump_info_json = vhost_scsi_dump_info_json,
+ .write_config_json = vhost_scsi_write_config_json,
+ .remove_device = vhost_scsi_dev_remove,
+};
+
+static inline void
+scsi_task_init(struct spdk_vhost_scsi_task *task)
+{
+ memset(&task->scsi, 0, sizeof(task->scsi));
+	/* The resp and tmf_resp pointers share a union, so this also
+	 * sets task->tmf_resp to NULL.
+	 */
+ task->resp = NULL;
+ task->used = true;
+ task->used_len = 0;
+}
+
+static void
+vhost_scsi_task_put(struct spdk_vhost_scsi_task *task)
+{
+ spdk_scsi_task_put(&task->scsi);
+}
+
+static void
+vhost_scsi_task_free_cb(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi);
+ struct spdk_vhost_session *vsession = &task->svsession->vsession;
+
+ assert(vsession->task_cnt > 0);
+ vsession->task_cnt--;
+ task->used = false;
+}
+
+static void
+remove_scsi_tgt(struct spdk_vhost_scsi_dev *svdev,
+ unsigned scsi_tgt_num)
+{
+ struct spdk_scsi_dev_vhost_state *state;
+ struct spdk_scsi_dev *dev;
+
+ state = &svdev->scsi_dev_state[scsi_tgt_num];
+ dev = state->dev;
+ state->dev = NULL;
+ assert(state->status == VHOST_SCSI_DEV_REMOVING);
+ state->status = VHOST_SCSI_DEV_EMPTY;
+ spdk_scsi_dev_destruct(dev, NULL, NULL);
+ if (state->remove_cb) {
+ state->remove_cb(&svdev->vdev, state->remove_ctx);
+ state->remove_cb = NULL;
+ }
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: removed target 'Target %u'\n",
+ svdev->vdev.name, scsi_tgt_num);
+
+ if (--svdev->ref == 0 && svdev->registered == false) {
+ free(svdev);
+ }
+}
+
+static void
+vhost_scsi_dev_process_removed_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
+{
+ unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx;
+ struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev,
+ struct spdk_vhost_scsi_dev, vdev);
+
+ /* all sessions have already detached the device */
+ if (svdev->scsi_dev_state[scsi_tgt_num].status != VHOST_SCSI_DEV_REMOVING) {
+ /* device was already removed in the meantime */
+ return;
+ }
+
+ remove_scsi_tgt(svdev, scsi_tgt_num);
+}
+
+static int
+vhost_scsi_session_process_removed(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *ctx)
+{
+ unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx;
+ struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession;
+ struct spdk_scsi_dev_session_state *state = &svsession->scsi_dev_state[scsi_tgt_num];
+
+ if (state->dev != NULL) {
+		/* There's still a session that references this device, so
+		 * abort our foreach chain here. We'll be called again from
+		 * that session's management poller once the device has been
+		 * detached there.
+		 */
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+process_removed_devs(struct spdk_vhost_scsi_session *svsession)
+{
+ struct spdk_scsi_dev *dev;
+ struct spdk_scsi_dev_session_state *state;
+ int i;
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) {
+ state = &svsession->scsi_dev_state[i];
+ dev = state->dev;
+
+ if (dev && state->status == VHOST_SCSI_DEV_REMOVING &&
+ !spdk_scsi_dev_has_pending_tasks(dev, NULL)) {
+ /* detach the device from this session */
+ spdk_scsi_dev_free_io_channels(dev);
+ state->dev = NULL;
+ state->status = VHOST_SCSI_DEV_REMOVED;
+ /* try to detach it globally */
+ spdk_vhost_lock();
+ vhost_dev_foreach_session(&svsession->svdev->vdev,
+ vhost_scsi_session_process_removed,
+ vhost_scsi_dev_process_removed_cpl_cb,
+ (void *)(uintptr_t)i);
+ spdk_vhost_unlock();
+ }
+ }
+}
+
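+/* Post a virtio-scsi event (e.g. a transport reset on hotplug or hotremove)
+ * for the given target on this session's event queue.
+ */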
+static void
+eventq_enqueue(struct spdk_vhost_scsi_session *svsession, unsigned scsi_dev_num,
+ uint32_t event, uint32_t reason)
+{
+ struct spdk_vhost_session *vsession = &svsession->vsession;
+ struct spdk_vhost_virtqueue *vq;
+ struct vring_desc *desc, *desc_table;
+ struct virtio_scsi_event *desc_ev;
+ uint32_t desc_table_size, req_size = 0;
+ uint16_t req;
+ int rc;
+
+ assert(scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+ vq = &vsession->virtqueue[VIRTIO_SCSI_EVENTQ];
+
+ if (vq->vring.desc == NULL || vhost_vq_avail_ring_get(vq, &req, 1) != 1) {
+ SPDK_ERRLOG("%s: failed to send virtio event (no avail ring entries?).\n",
+ vsession->name);
+ return;
+ }
+
+ rc = vhost_vq_get_desc(vsession, vq, req, &desc, &desc_table, &desc_table_size);
+ if (rc != 0 || desc->len < sizeof(*desc_ev)) {
+ SPDK_ERRLOG("%s: invalid eventq descriptor at index %"PRIu16".\n",
+ vsession->name, req);
+ goto out;
+ }
+
+ desc_ev = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*desc_ev));
+ if (desc_ev == NULL) {
+ SPDK_ERRLOG("%s: eventq descriptor at index %"PRIu16" points "
+ "to unmapped guest memory address %p.\n",
+ vsession->name, req, (void *)(uintptr_t)desc->addr);
+ goto out;
+ }
+
+ desc_ev->event = event;
+ desc_ev->lun[0] = 1;
+ desc_ev->lun[1] = scsi_dev_num;
+	/* virtio LUN id 0 can refer either to the entire device
+	 * or to the actual LUN 0 (the only one vhost supports for now).
+	 */
+ desc_ev->lun[2] = 0 >> 8;
+ desc_ev->lun[3] = 0 & 0xFF;
+	/* virtio doesn't specify any strict format for the LUN id (bytes 2 and 3);
+	 * the current implementation follows the Linux kernel sources.
+	 */
+ memset(&desc_ev->lun[4], 0, 4);
+ desc_ev->reason = reason;
+ req_size = sizeof(*desc_ev);
+
+out:
+ vhost_vq_used_ring_enqueue(vsession, vq, req, req_size);
+}
+
+static void
+submit_completion(struct spdk_vhost_scsi_task *task)
+{
+ struct spdk_vhost_session *vsession = &task->svsession->vsession;
+
+ vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx,
+ task->used_len);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Finished task (%p) req_idx=%d\n", task, task->req_idx);
+
+ vhost_scsi_task_put(task);
+}
+
+static void
+vhost_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi);
+
+ submit_completion(task);
+}
+
+static void
+vhost_scsi_task_cpl(struct spdk_scsi_task *scsi_task)
+{
+ struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi);
+
+	/* The SCSI task has completed. Do final processing and then post
+	 * notification to the virtqueue's "used" ring.
+	 */
+ task->resp->status = task->scsi.status;
+
+ if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) {
+ memcpy(task->resp->sense, task->scsi.sense_data, task->scsi.sense_data_len);
+ task->resp->sense_len = task->scsi.sense_data_len;
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Task (%p) req_idx=%d failed - status=%u\n", task, task->req_idx,
+ task->scsi.status);
+ }
+ assert(task->scsi.transfer_len == task->scsi.length);
+ task->resp->resid = task->scsi.length - task->scsi.data_transferred;
+
+ submit_completion(task);
+}
+
+static void
+task_submit(struct spdk_vhost_scsi_task *task)
+{
+ task->resp->response = VIRTIO_SCSI_S_OK;
+ spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi);
+}
+
+static void
+mgmt_task_submit(struct spdk_vhost_scsi_task *task, enum spdk_scsi_task_func func)
+{
+ task->tmf_resp->response = VIRTIO_SCSI_S_OK;
+ task->scsi.function = func;
+ spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi);
+}
+
+static void
+invalid_request(struct spdk_vhost_scsi_task *task)
+{
+ struct spdk_vhost_session *vsession = &task->svsession->vsession;
+
+ vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx,
+ task->used_len);
+ vhost_scsi_task_put(task);
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Invalid request (status=%" PRIu8")\n",
+ task->resp ? task->resp->response : -1);
+}
+
+static int
+vhost_scsi_task_init_target(struct spdk_vhost_scsi_task *task, const __u8 *lun)
+{
+ struct spdk_vhost_scsi_session *svsession = task->svsession;
+ struct spdk_scsi_dev_session_state *state;
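+	/* Bytes 2-3 of the virtio LUN field carry the LUN number; the low
+	 * 14 bits match the flat addressing format used by the Linux
+	 * virtio-scsi driver.
+	 */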
+ uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF;
+
+ SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN", lun, 8);
+
+	/* The first byte must be 1 and the second is the target ID. */
+ if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+ return -1;
+ }
+
+ state = &svsession->scsi_dev_state[lun[1]];
+ task->scsi_dev = state->dev;
+ if (state->dev == NULL || state->status != VHOST_SCSI_DEV_PRESENT) {
+		/* If the device has been hot-detached, return 0 to allow sending
+		 * an additional hotremove event via sense codes.
+		 */
+ return state->status != VHOST_SCSI_DEV_EMPTY ? 0 : -1;
+ }
+
+ task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0);
+ task->scsi.lun = spdk_scsi_dev_get_lun(state->dev, lun_id);
+ return 0;
+}
+
+static void
+process_ctrl_request(struct spdk_vhost_scsi_task *task)
+{
+ struct spdk_vhost_session *vsession = &task->svsession->vsession;
+ struct vring_desc *desc, *desc_table;
+ struct virtio_scsi_ctrl_tmf_req *ctrl_req;
+ struct virtio_scsi_ctrl_an_resp *an_resp;
+ uint32_t desc_table_size, used_len = 0;
+ int rc;
+
+ spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_mgmt_cpl, vhost_scsi_task_free_cb);
+ rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table,
+ &desc_table_size);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_ERRLOG("%s: invalid controlq descriptor at index %d.\n",
+ vsession->name, task->req_idx);
+ goto out;
+ }
+
+ ctrl_req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*ctrl_req));
+ if (ctrl_req == NULL) {
+ SPDK_ERRLOG("%s: invalid task management request at index %d.\n",
+ vsession->name, task->req_idx);
+ goto out;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE,
+ "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; kickfd %d; size %d\n",
+ task->req_idx, desc, (void *)desc->addr, desc->len, desc->flags, task->vq->last_used_idx,
+ task->vq->vring.kickfd, task->vq->vring.size);
+ SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "Request descriptor", (uint8_t *)ctrl_req, desc->len);
+
+ vhost_scsi_task_init_target(task, ctrl_req->lun);
+
+ vhost_vring_desc_get_next(&desc, desc_table, desc_table_size);
+ if (spdk_unlikely(desc == NULL)) {
+ SPDK_ERRLOG("%s: no response descriptor for controlq request %d.\n",
+ vsession->name, task->req_idx);
+ goto out;
+ }
+
+ /* Process the TMF request */
+ switch (ctrl_req->type) {
+ case VIRTIO_SCSI_T_TMF:
+ task->tmf_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->tmf_resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_tmf_resp) || task->tmf_resp == NULL)) {
+ SPDK_ERRLOG("%s: TMF response descriptor at index %d points to invalid guest memory region\n",
+ vsession->name, task->req_idx);
+ goto out;
+ }
+
+ /* Check if we are processing a valid request */
+ if (task->scsi_dev == NULL) {
+ task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET;
+ break;
+ }
+
+ switch (ctrl_req->subtype) {
+ case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET:
+ /* Handle LUN reset */
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: LUN reset\n", vsession->name);
+
+ mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET);
+ return;
+ default:
+ task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED;
+ /* Unsupported command */
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: unsupported TMF command %x\n",
+ vsession->name, ctrl_req->subtype);
+ break;
+ }
+ break;
+ case VIRTIO_SCSI_T_AN_QUERY:
+ case VIRTIO_SCSI_T_AN_SUBSCRIBE: {
+ an_resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*an_resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_an_resp) || an_resp == NULL)) {
+ SPDK_WARNLOG("%s: asynchronous response descriptor points to invalid guest memory region\n",
+ vsession->name);
+ goto out;
+ }
+
+ an_resp->response = VIRTIO_SCSI_S_ABORTED;
+ break;
+ }
+ default:
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "%s: Unsupported control command %x\n",
+ vsession->name, ctrl_req->type);
+ break;
+ }
+
+ used_len = sizeof(struct virtio_scsi_ctrl_tmf_resp);
+out:
+ vhost_vq_used_ring_enqueue(vsession, task->vq, task->req_idx, used_len);
+ vhost_scsi_task_put(task);
+}
+
+/*
+ * Process the task's descriptor chain and set up the data-related fields.
+ * Return
+ *   -1 if the request is invalid and must be aborted,
+ *    0 if all data is set up.
+ */
+static int
+task_data_setup(struct spdk_vhost_scsi_task *task,
+ struct virtio_scsi_cmd_req **req)
+{
+ struct spdk_vhost_session *vsession = &task->svsession->vsession;
+ struct vring_desc *desc, *desc_table;
+ struct iovec *iovs = task->iovs;
+ uint16_t iovcnt = 0;
+ uint32_t desc_table_len, len = 0;
+ int rc;
+
+ spdk_scsi_task_construct(&task->scsi, vhost_scsi_task_cpl, vhost_scsi_task_free_cb);
+
+ rc = vhost_vq_get_desc(vsession, task->vq, task->req_idx, &desc, &desc_table, &desc_table_len);
+ /* First descriptor must be readable */
+ if (spdk_unlikely(rc != 0 || vhost_vring_desc_is_wr(desc) ||
+ desc->len < sizeof(struct virtio_scsi_cmd_req))) {
+ SPDK_WARNLOG("%s: invalid first request descriptor at index %"PRIu16".\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ *req = vhost_gpa_to_vva(vsession, desc->addr, sizeof(**req));
+ if (spdk_unlikely(*req == NULL)) {
+ SPDK_WARNLOG("%s: request descriptor at index %d points to invalid guest memory region\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+
+	/* Each request must have at least 2 descriptors (i.e. request and response) */
+ vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (desc == NULL) {
+ SPDK_WARNLOG("%s: descriptor chain at index %d contains neither payload nor response buffer.\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+ task->scsi.dxfer_dir = vhost_vring_desc_is_wr(desc) ? SPDK_SCSI_DIR_FROM_DEV :
+ SPDK_SCSI_DIR_TO_DEV;
+ task->scsi.iovs = iovs;
+
+ if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) {
+ /*
+ * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN]
+ */
+ task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) {
+ SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+ rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_WARNLOG("%s: invalid descriptor chain at request index %d (descriptor id overflow?).\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ if (desc == NULL) {
+ /*
+			 * TEST UNIT READY and some other commands might not carry any payload; this is not an error.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA,
+ "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx);
+ SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE);
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp);
+ task->scsi.iovcnt = 1;
+ task->scsi.iovs[0].iov_len = 0;
+ task->scsi.length = 0;
+ task->scsi.transfer_len = 0;
+ return 0;
+ }
+
+ /* All remaining descriptors are data. */
+ while (desc) {
+ if (spdk_unlikely(!vhost_vring_desc_is_wr(desc))) {
+ SPDK_WARNLOG("%s: FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n",
+ vsession->name, iovcnt);
+ goto invalid_task;
+ }
+
+ if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) {
+ goto invalid_task;
+ }
+ len += desc->len;
+
+ rc = vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(rc != 0)) {
+ SPDK_WARNLOG("%s: invalid payload in descriptor chain starting at index %d.\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+ }
+
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp) + len;
+ } else {
+		SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "TO DEV\n");
+		/*
+		 * TO_DEV (WRITE): [RD_req][RD_buf0]...[RD_bufN][WR_resp]
+		 * No need to check the descriptor WR flag as this is done while setting scsi.dxfer_dir.
+		 */
+
+ /* Process descriptors up to response. */
+ while (!vhost_vring_desc_is_wr(desc)) {
+ if (spdk_unlikely(vhost_vring_desc_to_iov(vsession, iovs, &iovcnt, desc))) {
+ goto invalid_task;
+ }
+ len += desc->len;
+
+ vhost_vring_desc_get_next(&desc, desc_table, desc_table_len);
+ if (spdk_unlikely(desc == NULL)) {
+ SPDK_WARNLOG("%s: TO_DEV cmd: no response descriptor.\n", vsession->name);
+ goto invalid_task;
+ }
+ }
+
+ task->resp = vhost_gpa_to_vva(vsession, desc->addr, sizeof(*task->resp));
+ if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) {
+ SPDK_WARNLOG("%s: response descriptor at index %d points to invalid guest memory region\n",
+ vsession->name, task->req_idx);
+ goto invalid_task;
+ }
+
+ task->used_len = sizeof(struct virtio_scsi_cmd_resp);
+ }
+
+ task->scsi.iovcnt = iovcnt;
+ task->scsi.length = len;
+ task->scsi.transfer_len = len;
+ return 0;
+
+invalid_task:
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "%s: Invalid task at index %"PRIu16".\n",
+ vsession->name, task->req_idx);
+ return -1;
+}
+
+static int
+process_request(struct spdk_vhost_scsi_task *task)
+{
+ struct virtio_scsi_cmd_req *req;
+ int result;
+
+ result = task_data_setup(task, &req);
+ if (result) {
+ return result;
+ }
+
+ result = vhost_scsi_task_init_target(task, req->lun);
+ if (spdk_unlikely(result != 0)) {
+ task->resp->response = VIRTIO_SCSI_S_BAD_TARGET;
+ return -1;
+ }
+
+ task->scsi.cdb = req->cdb;
+ SPDK_LOGDUMP(SPDK_LOG_VHOST_SCSI_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE);
+
+ if (spdk_unlikely(task->scsi.lun == NULL)) {
+ spdk_scsi_task_process_null_lun(&task->scsi);
+ task->resp->response = VIRTIO_SCSI_S_OK;
+ return 1;
+ }
+
+ return 0;
+}
+
+static void
+process_scsi_task(struct spdk_vhost_session *vsession,
+ struct spdk_vhost_virtqueue *vq,
+ uint16_t req_idx)
+{
+ struct spdk_vhost_scsi_task *task;
+ int result;
+
+ task = &((struct spdk_vhost_scsi_task *)vq->tasks)[req_idx];
+ if (spdk_unlikely(task->used)) {
+ SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n",
+ vsession->name, req_idx);
+ vhost_vq_used_ring_enqueue(vsession, vq, req_idx, 0);
+ return;
+ }
+
+ vsession->task_cnt++;
+ scsi_task_init(task);
+
+ if (spdk_unlikely(vq->vring_idx == VIRTIO_SCSI_CONTROLQ)) {
+ process_ctrl_request(task);
+ } else {
+ result = process_request(task);
+ if (likely(result == 0)) {
+ task_submit(task);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d submitted ======\n", task,
+ task->req_idx);
+ } else if (result > 0) {
+ vhost_scsi_task_cpl(&task->scsi);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d finished early ======\n", task,
+ task->req_idx);
+ } else {
+ invalid_request(task);
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d failed ======\n", task,
+ task->req_idx);
+ }
+ }
+}
+
+static void
+process_vq(struct spdk_vhost_scsi_session *svsession, struct spdk_vhost_virtqueue *vq)
+{
+ struct spdk_vhost_session *vsession = &svsession->vsession;
+ uint16_t reqs[32];
+ uint16_t reqs_cnt, i;
+
+ reqs_cnt = vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs));
+	assert(reqs_cnt <= SPDK_COUNTOF(reqs));
+
+ for (i = 0; i < reqs_cnt; i++) {
+ SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Starting processing request idx %"PRIu16"======\n",
+ reqs[i]);
+
+ if (spdk_unlikely(reqs[i] >= vq->vring.size)) {
+ SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n",
+ vsession->name, reqs[i], vq->vring.size);
+ vhost_vq_used_ring_enqueue(vsession, vq, reqs[i], 0);
+ continue;
+ }
+
+ process_scsi_task(vsession, vq, reqs[i]);
+ }
+}
+
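+/* Management poller, registered with a MGMT_POLL_PERIOD_US period. It
+ * finalizes pending target hot-removals and services the control and
+ * event queues.
+ */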
+static int
+vdev_mgmt_worker(void *arg)
+{
+ struct spdk_vhost_scsi_session *svsession = arg;
+ struct spdk_vhost_session *vsession = &svsession->vsession;
+
+ process_removed_devs(svsession);
+ vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_EVENTQ]);
+
+ process_vq(svsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]);
+ vhost_vq_used_signal(vsession, &vsession->virtqueue[VIRTIO_SCSI_CONTROLQ]);
+
+ return SPDK_POLLER_BUSY;
+}
+
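+/* Request-queue poller, registered with a zero period. It services every
+ * I/O queue, i.e. all virtqueues starting at VIRTIO_SCSI_REQUESTQ.
+ */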
+static int
+vdev_worker(void *arg)
+{
+ struct spdk_vhost_scsi_session *svsession = arg;
+ struct spdk_vhost_session *vsession = &svsession->vsession;
+ uint32_t q_idx;
+
+ for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < vsession->max_queues; q_idx++) {
+ process_vq(svsession, &vsession->virtqueue[q_idx]);
+ }
+
+ vhost_session_used_signal(vsession);
+
+ return SPDK_POLLER_BUSY;
+}
+
+static struct spdk_vhost_scsi_dev *
+to_scsi_dev(struct spdk_vhost_dev *ctrlr)
+{
+ if (ctrlr == NULL) {
+ return NULL;
+ }
+
+ if (ctrlr->backend != &spdk_vhost_scsi_device_backend) {
+ SPDK_ERRLOG("%s: not a vhost-scsi device.\n", ctrlr->name);
+ return NULL;
+ }
+
+ return SPDK_CONTAINEROF(ctrlr, struct spdk_vhost_scsi_dev, vdev);
+}
+
+static struct spdk_vhost_scsi_session *
+to_scsi_session(struct spdk_vhost_session *vsession)
+{
+ assert(vsession->vdev->backend == &spdk_vhost_scsi_device_backend);
+ return (struct spdk_vhost_scsi_session *)vsession;
+}
+
+int
+spdk_vhost_scsi_dev_construct(const char *name, const char *cpumask)
+{
+ struct spdk_vhost_scsi_dev *svdev = calloc(1, sizeof(*svdev));
+ int rc;
+
+ if (svdev == NULL) {
+ return -ENOMEM;
+ }
+
+ svdev->vdev.virtio_features = SPDK_VHOST_SCSI_FEATURES;
+ svdev->vdev.disabled_features = SPDK_VHOST_SCSI_DISABLED_FEATURES;
+
+ spdk_vhost_lock();
+ rc = vhost_dev_register(&svdev->vdev, name, cpumask,
+ &spdk_vhost_scsi_device_backend);
+
+ if (rc) {
+ free(svdev);
+ spdk_vhost_unlock();
+ return rc;
+ }
+
+ svdev->registered = true;
+
+ spdk_vhost_unlock();
+ return rc;
+}
+
+static int
+vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev)
+{
+ struct spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev);
+ int rc, i;
+
+ assert(svdev != NULL);
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) {
+ if (svdev->scsi_dev_state[i].dev) {
+ if (vdev->registered) {
+ SPDK_ERRLOG("%s: SCSI target %d is still present.\n", vdev->name, i);
+ return -EBUSY;
+ }
+
+ rc = spdk_vhost_scsi_dev_remove_tgt(vdev, i, NULL, NULL);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to force-remove target %d\n", vdev->name, i);
+ return rc;
+ }
+ }
+ }
+
+ rc = vhost_dev_unregister(vdev);
+ if (rc != 0) {
+ return rc;
+ }
+ svdev->registered = false;
+
+ if (svdev->ref == 0) {
+ free(svdev);
+ }
+
+ return 0;
+}
+
+struct spdk_scsi_dev *
+spdk_vhost_scsi_dev_get_tgt(struct spdk_vhost_dev *vdev, uint8_t num)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+
+ assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+ svdev = to_scsi_dev(vdev);
+ assert(svdev != NULL);
+ if (svdev->scsi_dev_state[num].status != VHOST_SCSI_DEV_PRESENT) {
+ return NULL;
+ }
+
+ assert(svdev->scsi_dev_state[num].dev != NULL);
+ return svdev->scsi_dev_state[num].dev;
+}
+
+static void
+vhost_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg)
+{
+ struct spdk_vhost_scsi_dev *svdev = arg;
+ const struct spdk_scsi_dev *scsi_dev;
+ unsigned scsi_dev_num;
+
+ assert(lun != NULL);
+ assert(svdev != NULL);
+ scsi_dev = spdk_scsi_lun_get_dev(lun);
+ for (scsi_dev_num = 0; scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_dev_num++) {
+ if (svdev->scsi_dev_state[scsi_dev_num].dev == scsi_dev) {
+ break;
+ }
+ }
+
+ if (scsi_dev_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+		/* The entire device has already been removed. */
+ return;
+ }
+
+ /* remove entire device */
+ spdk_vhost_scsi_dev_remove_tgt(&svdev->vdev, scsi_dev_num, NULL, NULL);
+}
+
+static void
+vhost_scsi_dev_add_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *ctx)
+{
+ unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx;
+ struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev,
+ struct spdk_vhost_scsi_dev, vdev);
+ struct spdk_scsi_dev_vhost_state *vhost_sdev;
+
+ vhost_sdev = &svdev->scsi_dev_state[scsi_tgt_num];
+
+ /* All sessions have added the target */
+ assert(vhost_sdev->status == VHOST_SCSI_DEV_ADDING);
+ vhost_sdev->status = VHOST_SCSI_DEV_PRESENT;
+ svdev->ref++;
+}
+
+static int
+vhost_scsi_session_add_tgt(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *ctx)
+{
+ unsigned scsi_tgt_num = (unsigned)(uintptr_t)ctx;
+ struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession;
+ struct spdk_scsi_dev_session_state *session_sdev = &svsession->scsi_dev_state[scsi_tgt_num];
+ struct spdk_scsi_dev_vhost_state *vhost_sdev;
+ int rc;
+
+ if (!vsession->started || session_sdev->dev != NULL) {
+ /* Nothing to do. */
+ return 0;
+ }
+
+ vhost_sdev = &svsession->svdev->scsi_dev_state[scsi_tgt_num];
+ session_sdev->dev = vhost_sdev->dev;
+ session_sdev->status = VHOST_SCSI_DEV_PRESENT;
+
+ rc = spdk_scsi_dev_allocate_io_channels(svsession->scsi_dev_state[scsi_tgt_num].dev);
+ if (rc != 0) {
+		SPDK_ERRLOG("%s: Couldn't allocate io channel for SCSI target %u.\n",
+ vsession->name, scsi_tgt_num);
+
+ /* unset the SCSI target so that all I/O to it will be rejected */
+ session_sdev->dev = NULL;
+ /* Set status to EMPTY so that we won't reply with SCSI hotremove
+ * sense codes - the device hasn't ever been added.
+ */
+ session_sdev->status = VHOST_SCSI_DEV_EMPTY;
+
+		/* Return with no error. We'll continue allocating io_channels for
+		 * other sessions on this device in the hope that they succeed. The
+		 * sessions that failed to allocate io_channels simply won't be able
+		 * to detect the SCSI target, nor issue any I/O to it.
+		 */
+ return 0;
+ }
+
+ if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) {
+ eventq_enqueue(svsession, scsi_tgt_num,
+ VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN);
+ } else {
+ SPDK_NOTICELOG("%s: driver does not support hotplug. "
+ "Please restart it or perform a rescan.\n",
+ vsession->name);
+ }
+
+ return 0;
+}
+
+int
+spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, int scsi_tgt_num,
+ const char *bdev_name)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+ struct spdk_scsi_dev_vhost_state *state;
+ char target_name[SPDK_SCSI_DEV_MAX_NAME];
+ int lun_id_list[1];
+ const char *bdev_names_list[1];
+
+ svdev = to_scsi_dev(vdev);
+ assert(svdev != NULL);
+ if (scsi_tgt_num < 0) {
+ for (scsi_tgt_num = 0; scsi_tgt_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_tgt_num++) {
+ if (svdev->scsi_dev_state[scsi_tgt_num].dev == NULL) {
+ break;
+ }
+ }
+
+ if (scsi_tgt_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+ SPDK_ERRLOG("%s: all SCSI target slots are already in use.\n", vdev->name);
+ return -ENOSPC;
+ }
+ } else {
+ if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+ SPDK_ERRLOG("%s: SCSI target number is too big (got %d, max %d)\n",
+ vdev->name, scsi_tgt_num, SPDK_VHOST_SCSI_CTRLR_MAX_DEVS);
+ return -EINVAL;
+ }
+ }
+
+ if (bdev_name == NULL) {
+ SPDK_ERRLOG("No lun name specified\n");
+ return -EINVAL;
+ }
+
+ state = &svdev->scsi_dev_state[scsi_tgt_num];
+ if (state->dev != NULL) {
+ SPDK_ERRLOG("%s: SCSI target %u already occupied\n", vdev->name, scsi_tgt_num);
+ return -EEXIST;
+ }
+
+	/*
+	 * At this stage only one LUN per target is supported.
+	 */
+ snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num);
+ lun_id_list[0] = 0;
+ bdev_names_list[0] = (char *)bdev_name;
+
+ state->status = VHOST_SCSI_DEV_ADDING;
+ state->dev = spdk_scsi_dev_construct(target_name, bdev_names_list, lun_id_list, 1,
+ SPDK_SPC_PROTOCOL_IDENTIFIER_SAS,
+ vhost_scsi_lun_hotremove, svdev);
+
+ if (state->dev == NULL) {
+ state->status = VHOST_SCSI_DEV_EMPTY;
+ SPDK_ERRLOG("%s: couldn't create SCSI target %u using bdev '%s'\n",
+ vdev->name, scsi_tgt_num, bdev_name);
+ return -EINVAL;
+ }
+ spdk_scsi_dev_add_port(state->dev, 0, "vhost");
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: added SCSI target %u using bdev '%s'\n",
+ vdev->name, scsi_tgt_num, bdev_name);
+
+ vhost_dev_foreach_session(vdev, vhost_scsi_session_add_tgt,
+ vhost_scsi_dev_add_tgt_cpl_cb,
+ (void *)(uintptr_t)scsi_tgt_num);
+ return scsi_tgt_num;
+}
+
+struct scsi_tgt_hotplug_ctx {
+ unsigned scsi_tgt_num;
+ bool async_fini;
+};
+
+static void
+vhost_scsi_dev_remove_tgt_cpl_cb(struct spdk_vhost_dev *vdev, void *_ctx)
+{
+ struct scsi_tgt_hotplug_ctx *ctx = _ctx;
+ struct spdk_vhost_scsi_dev *svdev = SPDK_CONTAINEROF(vdev,
+ struct spdk_vhost_scsi_dev, vdev);
+
+ if (!ctx->async_fini) {
+ /* there aren't any active sessions, so remove the dev and exit */
+ remove_scsi_tgt(svdev, ctx->scsi_tgt_num);
+ }
+
+ free(ctx);
+}
+
+static int
+vhost_scsi_session_remove_tgt(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *_ctx)
+{
+ struct scsi_tgt_hotplug_ctx *ctx = _ctx;
+ unsigned scsi_tgt_num = ctx->scsi_tgt_num;
+ struct spdk_vhost_scsi_session *svsession = (struct spdk_vhost_scsi_session *)vsession;
+ struct spdk_scsi_dev_session_state *state = &svsession->scsi_dev_state[scsi_tgt_num];
+
+ if (!vsession->started || state->dev == NULL) {
+ /* Nothing to do */
+ return 0;
+ }
+
+ /* Mark the target for removal */
+ assert(state->status == VHOST_SCSI_DEV_PRESENT);
+ state->status = VHOST_SCSI_DEV_REMOVING;
+
+ /* Send a hotremove Virtio event */
+ if (vhost_dev_has_feature(vsession, VIRTIO_SCSI_F_HOTPLUG)) {
+ eventq_enqueue(svsession, scsi_tgt_num,
+ VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED);
+ }
+
+ /* Wait for the session's management poller to remove the target after
+ * all its pending I/O has finished.
+ */
+ ctx->async_fini = true;
+ return 0;
+}
+
+int
+spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num,
+ spdk_vhost_event_fn cb_fn, void *cb_arg)
+{
+ struct spdk_vhost_scsi_dev *svdev;
+ struct spdk_scsi_dev_vhost_state *scsi_dev_state;
+ struct scsi_tgt_hotplug_ctx *ctx;
+
+ if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) {
+		SPDK_ERRLOG("%s: invalid SCSI target number %u\n", vdev->name, scsi_tgt_num);
+ return -EINVAL;
+ }
+
+ svdev = to_scsi_dev(vdev);
+ assert(svdev != NULL);
+ scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num];
+
+ if (scsi_dev_state->status != VHOST_SCSI_DEV_PRESENT) {
+ return -EBUSY;
+ }
+
+ if (scsi_dev_state->dev == NULL || scsi_dev_state->status == VHOST_SCSI_DEV_ADDING) {
+ SPDK_ERRLOG("%s: SCSI target %u is not occupied\n", vdev->name, scsi_tgt_num);
+ return -ENODEV;
+ }
+
+ assert(scsi_dev_state->status != VHOST_SCSI_DEV_EMPTY);
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx == NULL) {
+ SPDK_ERRLOG("calloc failed\n");
+ return -ENOMEM;
+ }
+
+ ctx->scsi_tgt_num = scsi_tgt_num;
+ ctx->async_fini = false;
+
+ scsi_dev_state->remove_cb = cb_fn;
+ scsi_dev_state->remove_ctx = cb_arg;
+ scsi_dev_state->status = VHOST_SCSI_DEV_REMOVING;
+
+ vhost_dev_foreach_session(vdev, vhost_scsi_session_remove_tgt,
+ vhost_scsi_dev_remove_tgt_cpl_cb, ctx);
+ return 0;
+}
+
+int
+vhost_scsi_controller_construct(void)
+{
+ struct spdk_conf_section *sp = spdk_conf_first_section(NULL);
+ struct spdk_vhost_dev *vdev;
+ int i, dev_num;
+ unsigned ctrlr_num = 0;
+ char *bdev_name, *tgt_num_str;
+ char *cpumask;
+ char *name;
+ char *tgt = NULL;
+
+ while (sp != NULL) {
+ if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) {
+ sp = spdk_conf_next_section(sp);
+ continue;
+ }
+
+ if (sscanf(spdk_conf_section_get_name(sp), "VhostScsi%u", &ctrlr_num) != 1) {
+ SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n",
+ spdk_conf_section_get_name(sp));
+ return -1;
+ }
+
+ name = spdk_conf_section_get_val(sp, "Name");
+ cpumask = spdk_conf_section_get_val(sp, "Cpumask");
+
+ if (spdk_vhost_scsi_dev_construct(name, cpumask) < 0) {
+ return -1;
+ }
+
+ vdev = spdk_vhost_dev_find(name);
+ assert(vdev);
+
+ for (i = 0; ; i++) {
+
+ tgt = spdk_conf_section_get_nval(sp, "Target", i);
+ if (tgt == NULL) {
+ break;
+ }
+
+ tgt_num_str = spdk_conf_section_get_nmval(sp, "Target", i, 0);
+ if (tgt_num_str == NULL) {
+ SPDK_ERRLOG("%s: invalid or missing SCSI target number\n", name);
+ return -1;
+ }
+
+ dev_num = (int)strtol(tgt_num_str, NULL, 10);
+ bdev_name = spdk_conf_section_get_nmval(sp, "Target", i, 1);
+ if (bdev_name == NULL) {
+ SPDK_ERRLOG("%s: invalid or missing bdev name for SCSI target %d\n", name, dev_num);
+ return -1;
+ } else if (spdk_conf_section_get_nmval(sp, "Target", i, 2)) {
+ SPDK_ERRLOG("%s: only one LUN per SCSI target is supported\n", name);
+ return -1;
+ }
+
+ if (spdk_vhost_scsi_dev_add_tgt(vdev, dev_num, bdev_name) < 0) {
+ return -1;
+ }
+ }
+
+ sp = spdk_conf_next_section(sp);
+ }
+
+ return 0;
+}
+
+static void
+free_task_pool(struct spdk_vhost_scsi_session *svsession)
+{
+ struct spdk_vhost_session *vsession = &svsession->vsession;
+ struct spdk_vhost_virtqueue *vq;
+ uint16_t i;
+
+ for (i = 0; i < vsession->max_queues; i++) {
+ vq = &vsession->virtqueue[i];
+ if (vq->tasks == NULL) {
+ continue;
+ }
+
+ spdk_free(vq->tasks);
+ vq->tasks = NULL;
+ }
+}
+
+static int
+alloc_task_pool(struct spdk_vhost_scsi_session *svsession)
+{
+ struct spdk_vhost_session *vsession = &svsession->vsession;
+ struct spdk_vhost_virtqueue *vq;
+ struct spdk_vhost_scsi_task *task;
+ uint32_t task_cnt;
+ uint16_t i;
+ uint32_t j;
+
+ for (i = 0; i < vsession->max_queues; i++) {
+ vq = &vsession->virtqueue[i];
+ if (vq->vring.desc == NULL) {
+ continue;
+ }
+
+ task_cnt = vq->vring.size;
+ if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) {
+ /* sanity check */
+			SPDK_ERRLOG("%s: virtqueue %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n",
+ vsession->name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE);
+ free_task_pool(svsession);
+ return -1;
+ }
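+		/* Preallocate one task per vring entry; the request index is
+		 * used directly as the task-pool index (see process_scsi_task()).
+		 */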
+ vq->tasks = spdk_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt,
+ SPDK_CACHE_LINE_SIZE, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (vq->tasks == NULL) {
+ SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
+ vsession->name, task_cnt, i);
+ free_task_pool(svsession);
+ return -1;
+ }
+
+ for (j = 0; j < task_cnt; j++) {
+ task = &((struct spdk_vhost_scsi_task *)vq->tasks)[j];
+ task->svsession = svsession;
+ task->vq = vq;
+ task->req_idx = j;
+ }
+ }
+
+ return 0;
+}
+
+static int
+vhost_scsi_start_cb(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *unused)
+{
+ struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession);
+ struct spdk_vhost_scsi_dev *svdev = svsession->svdev;
+ struct spdk_scsi_dev_vhost_state *state;
+ uint32_t i;
+ int rc;
+
+ /* validate all I/O queues are in a contiguous index range */
+ for (i = VIRTIO_SCSI_REQUESTQ; i < vsession->max_queues; i++) {
+ if (vsession->virtqueue[i].vring.desc == NULL) {
+ SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vsession->name, i);
+ rc = -1;
+ goto out;
+ }
+ }
+
+ rc = alloc_task_pool(svsession);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to alloc task pool.\n", vsession->name);
+ goto out;
+ }
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
+ state = &svdev->scsi_dev_state[i];
+ if (state->dev == NULL || state->status == VHOST_SCSI_DEV_REMOVING) {
+ continue;
+ }
+
+ assert(svsession->scsi_dev_state[i].status == VHOST_SCSI_DEV_EMPTY);
+ svsession->scsi_dev_state[i].dev = state->dev;
+ svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_PRESENT;
+ rc = spdk_scsi_dev_allocate_io_channels(state->dev);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to alloc io_channel for SCSI target %"PRIu32"\n",
+ vsession->name, i);
+ /* unset the SCSI target so that all I/O to it will be rejected */
+ svsession->scsi_dev_state[i].dev = NULL;
+ /* set EMPTY state so that we won't reply with SCSI hotremove
+ * sense codes - the device hasn't ever been added.
+ */
+ svsession->scsi_dev_state[i].status = VHOST_SCSI_DEV_EMPTY;
+ continue;
+ }
+ }
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: started poller on lcore %d\n",
+ vsession->name, spdk_env_get_current_core());
+
+ svsession->requestq_poller = SPDK_POLLER_REGISTER(vdev_worker, svsession, 0);
+ if (vsession->virtqueue[VIRTIO_SCSI_CONTROLQ].vring.desc &&
+ vsession->virtqueue[VIRTIO_SCSI_EVENTQ].vring.desc) {
+ svsession->mgmt_poller = SPDK_POLLER_REGISTER(vdev_mgmt_worker, svsession,
+ MGMT_POLL_PERIOD_US);
+ }
+out:
+ vhost_session_start_done(vsession, rc);
+ return rc;
+}
+
+static int
+vhost_scsi_start(struct spdk_vhost_session *vsession)
+{
+ struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession);
+ struct spdk_vhost_scsi_dev *svdev;
+
+ svdev = to_scsi_dev(vsession->vdev);
+ assert(svdev != NULL);
+ svsession->svdev = svdev;
+
+ return vhost_session_send_event(vsession, vhost_scsi_start_cb,
+ 3, "start session");
+}
+
+static int
+destroy_session_poller_cb(void *arg)
+{
+ struct spdk_vhost_scsi_session *svsession = arg;
+ struct spdk_vhost_session *vsession = &svsession->vsession;
+ struct spdk_scsi_dev_session_state *state;
+ uint32_t i;
+
+ if (vsession->task_cnt > 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ if (spdk_vhost_trylock() != 0) {
+ return SPDK_POLLER_BUSY;
+ }
+
+ for (i = 0; i < vsession->max_queues; i++) {
+ vhost_vq_used_signal(vsession, &vsession->virtqueue[i]);
+ }
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
+ enum spdk_scsi_dev_vhost_status prev_status;
+
+ state = &svsession->scsi_dev_state[i];
+ /* clear the REMOVED status so that we won't send hotremove events anymore */
+ prev_status = state->status;
+ state->status = VHOST_SCSI_DEV_EMPTY;
+ if (state->dev == NULL) {
+ continue;
+ }
+
+ spdk_scsi_dev_free_io_channels(state->dev);
+
+ state->dev = NULL;
+
+ if (prev_status == VHOST_SCSI_DEV_REMOVING) {
+ /* try to detach it globally */
+ vhost_dev_foreach_session(vsession->vdev,
+ vhost_scsi_session_process_removed,
+ vhost_scsi_dev_process_removed_cpl_cb,
+ (void *)(uintptr_t)i);
+ }
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: stopping poller on lcore %d\n",
+ vsession->name, spdk_env_get_current_core());
+
+ free_task_pool(svsession);
+
+ spdk_poller_unregister(&svsession->stop_poller);
+ vhost_session_stop_done(vsession, 0);
+
+ spdk_vhost_unlock();
+ return SPDK_POLLER_BUSY;
+}
+
+static int
+vhost_scsi_stop_cb(struct spdk_vhost_dev *vdev,
+ struct spdk_vhost_session *vsession, void *unused)
+{
+ struct spdk_vhost_scsi_session *svsession = to_scsi_session(vsession);
+
+ /* Stop receiving new I/O requests */
+ spdk_poller_unregister(&svsession->requestq_poller);
+
+ /* Stop receiving controlq requests, also stop processing the
+ * asynchronous hotremove events. All the remaining events
+ * will be finalized by the stop_poller below.
+ */
+ spdk_poller_unregister(&svsession->mgmt_poller);
+
+ /* Wait for all pending I/Os to complete, then process all the
+ * remaining hotremove events one last time.
+ */
+ svsession->stop_poller = SPDK_POLLER_REGISTER(destroy_session_poller_cb,
+ svsession, 1000);
+
+ return 0;
+}
+
+static int
+vhost_scsi_stop(struct spdk_vhost_session *vsession)
+{
+ return vhost_session_send_event(vsession, vhost_scsi_stop_cb,
+ 3, "stop session");
+}
+
+static void
+vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_scsi_dev *sdev;
+ struct spdk_scsi_lun *lun;
+ uint32_t dev_idx;
+ uint32_t lun_idx;
+
+ assert(vdev != NULL);
+ spdk_json_write_named_array_begin(w, "scsi");
+ for (dev_idx = 0; dev_idx < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; dev_idx++) {
+ sdev = spdk_vhost_scsi_dev_get_tgt(vdev, dev_idx);
+ if (!sdev) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_uint32(w, "scsi_dev_num", dev_idx);
+
+ spdk_json_write_named_uint32(w, "id", spdk_scsi_dev_get_id(sdev));
+
+ spdk_json_write_named_string(w, "target_name", spdk_scsi_dev_get_name(sdev));
+
+ spdk_json_write_named_array_begin(w, "luns");
+
+ for (lun_idx = 0; lun_idx < SPDK_SCSI_DEV_MAX_LUN; lun_idx++) {
+ lun = spdk_scsi_dev_get_lun(sdev, lun_idx);
+ if (!lun) {
+ continue;
+ }
+
+ spdk_json_write_object_begin(w);
+
+ spdk_json_write_named_int32(w, "id", spdk_scsi_lun_get_id(lun));
+
+ spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun));
+
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_array_end(w);
+ spdk_json_write_object_end(w);
+ }
+
+ spdk_json_write_array_end(w);
+}
+
+static void
+vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct spdk_scsi_dev *scsi_dev;
+ struct spdk_scsi_lun *lun;
+ uint32_t i;
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "vhost_create_scsi_controller");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", vdev->name);
+ spdk_json_write_named_string(w, "cpumask",
+ spdk_cpuset_fmt(spdk_thread_get_cpumask(vdev->thread)));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+
+ for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) {
+ scsi_dev = spdk_vhost_scsi_dev_get_tgt(vdev, i);
+ if (scsi_dev == NULL) {
+ continue;
+ }
+
+ lun = spdk_scsi_dev_get_lun(scsi_dev, 0);
+ assert(lun != NULL);
+
+ spdk_json_write_object_begin(w);
+ spdk_json_write_named_string(w, "method", "vhost_scsi_controller_add_target");
+
+ spdk_json_write_named_object_begin(w, "params");
+ spdk_json_write_named_string(w, "ctrlr", vdev->name);
+ spdk_json_write_named_uint32(w, "scsi_target_num", i);
+
+ spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun));
+ spdk_json_write_object_end(w);
+
+ spdk_json_write_object_end(w);
+ }
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi", SPDK_LOG_VHOST_SCSI)
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_queue", SPDK_LOG_VHOST_SCSI_QUEUE)
+SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_data", SPDK_LOG_VHOST_SCSI_DATA)
diff --git a/src/spdk/lib/virtio/Makefile b/src/spdk/lib/virtio/Makefile
new file mode 100644
index 000000000..8ea173c3b
--- /dev/null
+++ b/src/spdk/lib/virtio/Makefile
@@ -0,0 +1,46 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+CFLAGS += $(ENV_CFLAGS)
+C_SRCS = virtio.c virtio_user.c virtio_pci.c vhost_user.c
+LIBNAME = virtio
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_virtio.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/virtio/spdk_virtio.map b/src/spdk/lib/virtio/spdk_virtio.map
new file mode 100644
index 000000000..76e02cff8
--- /dev/null
+++ b/src/spdk/lib/virtio/spdk_virtio.map
@@ -0,0 +1,33 @@
+{
+ global:
+
+ # internal functions in spdk_internal/virtio.h
+ virtio_recv_pkts;
+ virtqueue_req_start;
+ virtqueue_req_flush;
+ virtqueue_req_abort;
+ virtqueue_req_add_iovs;
+ virtio_dev_construct;
+ virtio_dev_reset;
+ virtio_dev_start;
+ virtio_dev_stop;
+ virtio_dev_destruct;
+ virtio_dev_acquire_queue;
+ virtio_dev_find_and_acquire_queue;
+ virtio_dev_queue_get_thread;
+ virtio_dev_queue_is_acquired;
+ virtio_dev_release_queue;
+ virtio_dev_get_status;
+ virtio_dev_set_status;
+ virtio_dev_write_dev_config;
+ virtio_dev_read_dev_config;
+ virtio_dev_backend_ops;
+ virtio_dev_has_feature;
+ virtio_dev_dump_json_info;
+ virtio_pci_dev_enumerate;
+ virtio_pci_dev_attach;
+ virtio_user_dev_init;
+ virtio_pci_dev_init;
+
+ local: *;
+};
diff --git a/src/spdk/lib/virtio/vhost_user.c b/src/spdk/lib/virtio/vhost_user.c
new file mode 100644
index 000000000..b3da9d988
--- /dev/null
+++ b/src/spdk/lib/virtio/vhost_user.c
@@ -0,0 +1,489 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "vhost_user.h"
+
+#include "spdk/string.h"
+#include "spdk_internal/vhost_user.h"
+
+/* The version of the protocol we support */
+#define VHOST_USER_VERSION 0x1
+
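+/* Send a single vhost-user message over the Unix domain socket, passing any
+ * accompanying file descriptors as SCM_RIGHTS ancillary data.
+ */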
+static int
+vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num)
+{
+ int r;
+ struct msghdr msgh;
+ struct iovec iov;
+ size_t fd_size = fd_num * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+ struct cmsghdr *cmsg;
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = (uint8_t *)buf;
+ iov.iov_len = len;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+
+ if (fds && fd_num > 0) {
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fd_size);
+ } else {
+ msgh.msg_control = NULL;
+ msgh.msg_controllen = 0;
+ }
+
+ do {
+ r = sendmsg(fd, &msgh, 0);
+ } while (r < 0 && errno == EINTR);
+
+ if (r == -1) {
+ return -errno;
+ }
+
+ return 0;
+}
+
+static int
+vhost_user_read(int fd, struct vhost_user_msg *msg)
+{
+ uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
+ ssize_t ret;
+ size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload;
+
+ ret = recv(fd, (void *)msg, sz_hdr, 0);
+ if ((size_t)ret != sz_hdr) {
+ SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n",
+ ret, sz_hdr);
+ if (ret == -1) {
+ return -errno;
+ } else {
+ return -EBUSY;
+ }
+ }
+
+ /* validate msg flags */
+ if (msg->flags != (valid_flags)) {
+ SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n",
+ msg->flags, valid_flags);
+ return -EIO;
+ }
+
+ sz_payload = msg->size;
+
+ if (sz_payload > VHOST_USER_PAYLOAD_SIZE) {
+ SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n",
+ sz_payload, VHOST_USER_PAYLOAD_SIZE);
+ return -EIO;
+ }
+
+ if (sz_payload) {
+ ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0);
+ if ((size_t)ret != sz_payload) {
+ SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n",
+ ret, msg->size);
+ if (ret == -1) {
+ return -errno;
+ } else {
+ return -EBUSY;
+ }
+ }
+ }
+
+ return 0;
+}
+
+struct hugepage_file_info {
+ uint64_t addr; /**< virtual addr */
+ size_t size; /**< the file size */
+ char path[PATH_MAX]; /**< path to backing file */
+};
+
+/* Two possible options:
+ * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file
+ * array. This is simple but cannot be used in a secondary process because
+ * the secondary process will close and munmap that file.
+ * 2. Match HUGEFILE_FMT to find hugepage files directly.
+ *
+ * We choose option 2.
+ */
+static int
+get_hugepage_file_info(struct hugepage_file_info huges[], int max)
+{
+ int idx, rc;
+ FILE *f;
+ char buf[BUFSIZ], *tmp, *tail;
+ char *str_underline, *str_start;
+ int huge_index;
+ uint64_t v_start, v_end;
+
+ f = fopen("/proc/self/maps", "r");
+ if (!f) {
+ SPDK_ERRLOG("cannot open /proc/self/maps\n");
+ rc = -errno;
+ assert(rc < 0); /* scan-build hack */
+ return rc;
+ }
+
+ idx = 0;
+ while (fgets(buf, sizeof(buf), f) != NULL) {
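+		/* A maps line looks like
+		 * "<start>-<end> <perms> <offset> <dev> <inode>   <path>";
+		 * only the address range and the trailing path are used here.
+		 */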
+ if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
+ SPDK_ERRLOG("Failed to parse address\n");
+ rc = -EIO;
+ goto out;
+ }
+
+ tmp = strchr(buf, ' ') + 1; /** skip address */
+ tmp = strchr(tmp, ' ') + 1; /** skip perm */
+ tmp = strchr(tmp, ' ') + 1; /** skip offset */
+ tmp = strchr(tmp, ' ') + 1; /** skip dev */
+ tmp = strchr(tmp, ' ') + 1; /** skip inode */
+ while (*tmp == ' ') { /** skip spaces */
+ tmp++;
+ }
+ tail = strrchr(tmp, '\n'); /** remove newline if exists */
+ if (tail) {
+ *tail = '\0';
+ }
+
+ /* Match HUGEFILE_FMT, aka "%s/%smap_%d",
+ * which is defined in eal_filesystem.h
+ */
+ str_underline = strrchr(tmp, '_');
+ if (!str_underline) {
+ continue;
+ }
+
+ str_start = str_underline - strlen("map");
+ if (str_start < tmp) {
+ continue;
+ }
+
+ if (sscanf(str_start, "map_%d", &huge_index) != 1) {
+ continue;
+ }
+
+ if (idx >= max) {
+			SPDK_ERRLOG("Exceeded the maximum of %d hugepage regions\n", max);
+ rc = -ENOSPC;
+ goto out;
+ }
+
+ if (idx > 0 &&
+ strncmp(tmp, huges[idx - 1].path, PATH_MAX) == 0 &&
+ v_start == huges[idx - 1].addr + huges[idx - 1].size) {
+ huges[idx - 1].size += (v_end - v_start);
+ continue;
+ }
+
+ huges[idx].addr = v_start;
+ huges[idx].size = v_end - v_start;
+ snprintf(huges[idx].path, PATH_MAX, "%s", tmp);
+ idx++;
+ }
+
+ rc = idx;
+out:
+ fclose(f);
+ return rc;
+}
+
+static int
+prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
+{
+ int i, num;
+ struct hugepage_file_info huges[VHOST_USER_MEMORY_MAX_NREGIONS];
+
+ num = get_hugepage_file_info(huges, VHOST_USER_MEMORY_MAX_NREGIONS);
+ if (num < 0) {
+ SPDK_ERRLOG("Failed to prepare memory for vhost-user\n");
+ return num;
+ }
+
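+	/* There is no real guest here: virtio-user shares the local address
+	 * space, so the hugepage virtual addresses double as the "guest
+	 * physical" addresses advertised in the memory table.
+	 */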
+ for (i = 0; i < num; ++i) {
+ /* the memory regions are unaligned */
+ msg->payload.memory.regions[i].guest_phys_addr = huges[i].addr; /* use vaddr! */
+ msg->payload.memory.regions[i].userspace_addr = huges[i].addr;
+ msg->payload.memory.regions[i].memory_size = huges[i].size;
+ msg->payload.memory.regions[i].flags_padding = 0;
+ fds[i] = open(huges[i].path, O_RDWR);
+ if (fds[i] < 0) {
+ SPDK_ERRLOG("Failed to open hugepage file %s: %s\n",
+ huges[i].path, spdk_strerror(errno));
+ while (--i >= 0) {
+ close(fds[i]);
+ }
+ return -EIO;
+ }
+ }
+
+ msg->payload.memory.nregions = num;
+ msg->payload.memory.padding = 0;
+
+ return 0;
+}
+
+static const char *const vhost_msg_strings[VHOST_USER_MAX] = {
+ [VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER",
+ [VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER",
+ [VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES",
+ [VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES",
+ [VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL",
+ [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
+ [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
+ [VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM",
+ [VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE",
+ [VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE",
+ [VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR",
+ [VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK",
+ [VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE",
+ [VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE",
+ [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
+ [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
+ [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
+};
+
+static int
+vhost_user_sock(struct virtio_user_dev *dev,
+ enum vhost_user_request req,
+ void *arg)
+{
+ struct vhost_user_msg msg;
+ struct vhost_vring_file *file = 0;
+ int need_reply = 0;
+ int fds[VHOST_USER_MEMORY_MAX_NREGIONS];
+ int fd_num = 0;
+ int i, len, rc;
+ int vhostfd = dev->vhostfd;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_USER, "sent message %d = %s\n", req, vhost_msg_strings[req]);
+
+ msg.request = req;
+ msg.flags = VHOST_USER_VERSION;
+ msg.size = 0;
+
+ switch (req) {
+ case VHOST_USER_GET_FEATURES:
+ case VHOST_USER_GET_PROTOCOL_FEATURES:
+ case VHOST_USER_GET_QUEUE_NUM:
+ need_reply = 1;
+ break;
+
+ case VHOST_USER_SET_FEATURES:
+ case VHOST_USER_SET_LOG_BASE:
+ case VHOST_USER_SET_PROTOCOL_FEATURES:
+ msg.payload.u64 = *((__u64 *)arg);
+ msg.size = sizeof(msg.payload.u64);
+ break;
+
+ case VHOST_USER_SET_OWNER:
+ case VHOST_USER_RESET_OWNER:
+ break;
+
+ case VHOST_USER_SET_MEM_TABLE:
+ rc = prepare_vhost_memory_user(&msg, fds);
+ if (rc < 0) {
+ return rc;
+ }
+ fd_num = msg.payload.memory.nregions;
+ msg.size = sizeof(msg.payload.memory.nregions);
+ msg.size += sizeof(msg.payload.memory.padding);
+ msg.size += fd_num * sizeof(struct vhost_memory_region);
+ break;
+
+ case VHOST_USER_SET_LOG_FD:
+ fds[fd_num++] = *((int *)arg);
+ break;
+
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_ENABLE:
+ memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
+ msg.size = sizeof(msg.payload.state);
+ break;
+
+ case VHOST_USER_GET_VRING_BASE:
+ memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
+ msg.size = sizeof(msg.payload.state);
+ need_reply = 1;
+ break;
+
+ case VHOST_USER_SET_VRING_ADDR:
+ memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
+ msg.size = sizeof(msg.payload.addr);
+ break;
+
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_VRING_ERR:
+ file = arg;
+ msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
+ msg.size = sizeof(msg.payload.u64);
+ if (file->fd > 0) {
+ fds[fd_num++] = file->fd;
+ } else {
+ msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
+ }
+ break;
+
+ case VHOST_USER_GET_CONFIG:
+ memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
+ msg.size = sizeof(msg.payload.cfg);
+ need_reply = 1;
+ break;
+
+ case VHOST_USER_SET_CONFIG:
+ memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
+ msg.size = sizeof(msg.payload.cfg);
+ break;
+
+ default:
+ SPDK_ERRLOG("trying to send unknown msg\n");
+ return -EINVAL;
+ }
+
+ len = VHOST_USER_HDR_SIZE + msg.size;
+ rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num);
+ if (rc < 0) {
+ SPDK_ERRLOG("%s failed: %s\n",
+ vhost_msg_strings[req], spdk_strerror(-rc));
+ return rc;
+ }
+
+ if (req == VHOST_USER_SET_MEM_TABLE) {
+ for (i = 0; i < fd_num; ++i) {
+ close(fds[i]);
+ }
+ }
+
+ if (need_reply) {
+ rc = vhost_user_read(vhostfd, &msg);
+ if (rc < 0) {
+ SPDK_WARNLOG("Received msg failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ if (req != msg.request) {
+ SPDK_WARNLOG("Received unexpected msg type\n");
+ return -EIO;
+ }
+
+ switch (req) {
+ case VHOST_USER_GET_FEATURES:
+ case VHOST_USER_GET_PROTOCOL_FEATURES:
+ case VHOST_USER_GET_QUEUE_NUM:
+ if (msg.size != sizeof(msg.payload.u64)) {
+ SPDK_WARNLOG("Received bad msg size\n");
+ return -EIO;
+ }
+ *((__u64 *)arg) = msg.payload.u64;
+ break;
+ case VHOST_USER_GET_VRING_BASE:
+ if (msg.size != sizeof(msg.payload.state)) {
+ SPDK_WARNLOG("Received bad msg size\n");
+ return -EIO;
+ }
+ memcpy(arg, &msg.payload.state,
+ sizeof(struct vhost_vring_state));
+ break;
+ case VHOST_USER_GET_CONFIG:
+ if (msg.size != sizeof(msg.payload.cfg)) {
+ SPDK_WARNLOG("Received bad msg size\n");
+ return -EIO;
+ }
+ memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg));
+ break;
+ default:
+ SPDK_WARNLOG("Received unexpected msg type\n");
+ return -EBADMSG;
+ }
+ }
+
+ return 0;
+}
+
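+/* Illustrative sketch, not part of the driver logic: a typical request/reply
+ * round trip through vhost_user_sock(). VHOST_USER_GET_FEATURES carries no
+ * payload on the way out and expects a u64 reply, which is copied back into
+ * the caller-provided argument. The example_* function name is hypothetical;
+ * the device is assumed to be already connected via vhost_user_setup() below.
+ */
+static __attribute__((unused)) int
+example_query_backend_features(struct virtio_user_dev *dev, uint64_t *features)
+{
+ int rc;
+
+ rc = vhost_user_sock(dev, VHOST_USER_GET_FEATURES, features);
+ if (rc < 0) {
+ SPDK_ERRLOG("VHOST_USER_GET_FEATURES failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ SPDK_NOTICELOG("backend features: 0x%" PRIx64 "\n", *features);
+ return 0;
+}
+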
+/**
+ * Set up the environment to talk to a vhost-user backend.
+ *
+ * @return
+ * - (0) on success;
+ * - negative errno on failure.
+ */
+static int
+vhost_user_setup(struct virtio_user_dev *dev)
+{
+ int fd;
+ int flag;
+ struct sockaddr_un un;
+ ssize_t rc;
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0) {
+ SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno));
+ return -errno;
+ }
+
+ flag = fcntl(fd, F_GETFD);
+ if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) {
+ SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno));
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path);
+ if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) {
+ SPDK_ERRLOG("socket path too long\n");
+ close(fd);
+ if (rc < 0) {
+ return -errno;
+ } else {
+ return -EINVAL;
+ }
+ }
+ if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+ SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno));
+ close(fd);
+ return -errno;
+ }
+
+ dev->vhostfd = fd;
+ return 0;
+}
+
+struct virtio_user_backend_ops ops_user = {
+ .setup = vhost_user_setup,
+ .send_request = vhost_user_sock,
+};
+
+SPDK_LOG_REGISTER_COMPONENT("virtio_user", SPDK_LOG_VIRTIO_USER)
diff --git a/src/spdk/lib/virtio/vhost_user.h b/src/spdk/lib/virtio/vhost_user.h
new file mode 100644
index 000000000..0caf51ebc
--- /dev/null
+++ b/src/spdk/lib/virtio/vhost_user.h
@@ -0,0 +1,69 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHOST_H
+#define _VHOST_H
+
+#include "spdk/stdinc.h"
+
+#include "spdk_internal/log.h"
+#include "spdk_internal/virtio.h"
+#include "spdk_internal/vhost_user.h"
+
+struct virtio_user_backend_ops;
+
+struct virtio_user_dev {
+ int vhostfd;
+
+ int callfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
+ int kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
+ uint32_t queue_size;
+
+ uint8_t status;
+ char path[PATH_MAX];
+ uint64_t protocol_features;
+ struct vring vrings[SPDK_VIRTIO_MAX_VIRTQUEUES];
+ struct virtio_user_backend_ops *ops;
+ struct spdk_mem_map *mem_map;
+};
+
+struct virtio_user_backend_ops {
+ int (*setup)(struct virtio_user_dev *dev);
+ int (*send_request)(struct virtio_user_dev *dev,
+ enum vhost_user_request req,
+ void *arg);
+};
+
+extern struct virtio_user_backend_ops ops_user;
+
+#endif
diff --git a/src/spdk/lib/virtio/virtio.c b/src/spdk/lib/virtio/virtio.c
new file mode 100644
index 000000000..03866040a
--- /dev/null
+++ b/src/spdk/lib/virtio/virtio.c
@@ -0,0 +1,717 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/env.h"
+#include "spdk/util.h"
+#include "spdk/barrier.h"
+
+#include "spdk_internal/virtio.h"
+
+/* We use SMP memory barrier variants as all virtio_pci devices
+ * are purely virtual. All MMIO is executed on a CPU core, so
+ * there's no need to do full MMIO synchronization.
+ */
+#define virtio_mb() spdk_smp_mb()
+#define virtio_rmb() spdk_smp_rmb()
+#define virtio_wmb() spdk_smp_wmb()
+
+/* Chain all the descriptors in the ring together, terminating the chain with VQ_RING_DESC_CHAIN_END */
+static inline void
+vring_desc_init(struct vring_desc *dp, uint16_t n)
+{
+ uint16_t i;
+
+ for (i = 0; i < n - 1; i++) {
+ dp[i].next = (uint16_t)(i + 1);
+ }
+ dp[i].next = VQ_RING_DESC_CHAIN_END;
+}
+
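+/* Worked example, not part of the driver logic: for a 4-entry ring,
+ * vring_desc_init() above produces the free chain
+ * 0 -> 1 -> 2 -> 3 -> VQ_RING_DESC_CHAIN_END. The sketch below spells that
+ * layout out on a local stand-in array; the example_* name is hypothetical.
+ */
+static __attribute__((unused)) void
+example_desc_chain_layout(void)
+{
+ struct vring_desc desc[4];
+
+ memset(desc, 0, sizeof(desc));
+ vring_desc_init(desc, 4);
+
+ assert(desc[0].next == 1);
+ assert(desc[1].next == 2);
+ assert(desc[2].next == 3);
+ assert(desc[3].next == VQ_RING_DESC_CHAIN_END);
+}
+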
+static void
+virtio_init_vring(struct virtqueue *vq)
+{
+ int size = vq->vq_nentries;
+ struct vring *vr = &vq->vq_ring;
+ uint8_t *ring_mem = vq->vq_ring_virt_mem;
+
+ /*
+ * Reinitialize, since the virtio device might have been stopped and restarted
+ */
+ memset(ring_mem, 0, vq->vq_ring_size);
+ vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN);
+ vq->vq_used_cons_idx = 0;
+ vq->vq_desc_head_idx = 0;
+ vq->vq_avail_idx = 0;
+ vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1);
+ vq->vq_free_cnt = vq->vq_nentries;
+ vq->req_start = VQ_RING_DESC_CHAIN_END;
+ vq->req_end = VQ_RING_DESC_CHAIN_END;
+ vq->reqs_finished = 0;
+ memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries);
+
+ vring_desc_init(vr->desc, size);
+
+ /* Tell the backend not to interrupt us.
+ * If F_EVENT_IDX is negotiated, we always set an extremely high
+ * used event idx, so that we practically never receive an
+ * interrupt. See virtqueue_req_flush().
+ */
+ if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
+ vring_used_event(&vq->vq_ring) = UINT16_MAX;
+ } else {
+ vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+ }
+}
+
+static int
+virtio_init_queue(struct virtio_dev *dev, uint16_t vtpci_queue_idx)
+{
+ unsigned int vq_size, size;
+ struct virtqueue *vq;
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "setting up queue: %"PRIu16"\n", vtpci_queue_idx);
+
+ /*
+ * Read the virtqueue size from the Queue Size field.
+ * It is always a power of 2; a size of 0 means the virtqueue does not exist.
+ */
+ vq_size = virtio_dev_backend_ops(dev)->get_queue_size(dev, vtpci_queue_idx);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq_size: %u\n", vq_size);
+ if (vq_size == 0) {
+ SPDK_ERRLOG("virtqueue %"PRIu16" does not exist\n", vtpci_queue_idx);
+ return -EINVAL;
+ }
+
+ if (!spdk_u32_is_pow2(vq_size)) {
+ SPDK_ERRLOG("virtqueue %"PRIu16" size (%u) is not powerof 2\n",
+ vtpci_queue_idx, vq_size);
+ return -EINVAL;
+ }
+
+ size = sizeof(*vq) + vq_size * sizeof(struct vq_desc_extra);
+
+ if (posix_memalign((void **)&vq, SPDK_CACHE_LINE_SIZE, size)) {
+ SPDK_ERRLOG("can not allocate vq\n");
+ return -ENOMEM;
+ }
+ memset(vq, 0, size);
+ dev->vqs[vtpci_queue_idx] = vq;
+
+ vq->vdev = dev;
+ vq->vq_queue_index = vtpci_queue_idx;
+ vq->vq_nentries = vq_size;
+
+ /*
+ * Calculate the vring size and round it up to the required alignment;
+ * the backend-specific setup_queue() callback allocates the memory.
+ */
+ size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN);
+ vq->vq_ring_size = SPDK_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vring_size: %u, rounded_vring_size: %u\n",
+ size, vq->vq_ring_size);
+
+ vq->owner_thread = NULL;
+
+ rc = virtio_dev_backend_ops(dev)->setup_queue(dev, vq);
+ if (rc < 0) {
+ SPDK_ERRLOG("setup_queue failed\n");
+ free(vq);
+ dev->vqs[vtpci_queue_idx] = NULL;
+ return rc;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_mem: 0x%" PRIx64 "\n",
+ vq->vq_ring_mem);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_virt_mem: 0x%" PRIx64 "\n",
+ (uint64_t)(uintptr_t)vq->vq_ring_virt_mem);
+
+ virtio_init_vring(vq);
+ return 0;
+}
+
+static void
+virtio_free_queues(struct virtio_dev *dev)
+{
+ uint16_t nr_vq = dev->max_queues;
+ struct virtqueue *vq;
+ uint16_t i;
+
+ if (dev->vqs == NULL) {
+ return;
+ }
+
+ for (i = 0; i < nr_vq; i++) {
+ vq = dev->vqs[i];
+ if (!vq) {
+ continue;
+ }
+
+ virtio_dev_backend_ops(dev)->del_queue(dev, vq);
+
+ free(vq);
+ dev->vqs[i] = NULL;
+ }
+
+ free(dev->vqs);
+ dev->vqs = NULL;
+}
+
+static int
+virtio_alloc_queues(struct virtio_dev *dev, uint16_t request_vq_num, uint16_t fixed_vq_num)
+{
+ uint16_t nr_vq;
+ uint16_t i;
+ int ret;
+
+ nr_vq = request_vq_num + fixed_vq_num;
+ if (nr_vq == 0) {
+ /* perfectly fine to have a device with no virtqueues. */
+ return 0;
+ }
+
+ assert(dev->vqs == NULL);
+ dev->vqs = calloc(1, sizeof(struct virtqueue *) * nr_vq);
+ if (!dev->vqs) {
+ SPDK_ERRLOG("failed to allocate %"PRIu16" vqs\n", nr_vq);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_vq; i++) {
+ ret = virtio_init_queue(dev, i);
+ if (ret < 0) {
+ virtio_free_queues(dev);
+ return ret;
+ }
+ }
+
+ dev->max_queues = nr_vq;
+ dev->fixed_queues_num = fixed_vq_num;
+ return 0;
+}
+
+/**
+ * Negotiate virtio features. For virtio_user this will also set
+ * the dev->modern flag if the VIRTIO_F_VERSION_1 feature is negotiated.
+ */
+static int
+virtio_negotiate_features(struct virtio_dev *dev, uint64_t req_features)
+{
+ uint64_t host_features = virtio_dev_backend_ops(dev)->get_features(dev);
+ int rc;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "guest features = %" PRIx64 "\n", req_features);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "device features = %" PRIx64 "\n", host_features);
+
+ rc = virtio_dev_backend_ops(dev)->set_features(dev, req_features & host_features);
+ if (rc != 0) {
+ SPDK_ERRLOG("failed to negotiate device features.\n");
+ return rc;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "negotiated features = %" PRIx64 "\n",
+ dev->negotiated_features);
+
+ virtio_dev_set_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
+ if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_FEATURES_OK)) {
+ SPDK_ERRLOG("failed to set FEATURES_OK status!\n");
+ /* Either the device failed, or we offered some features that
+ * depend on other features we did not offer.
+ */
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int
+virtio_dev_construct(struct virtio_dev *vdev, const char *name,
+ const struct virtio_dev_ops *ops, void *ctx)
+{
+ int rc;
+
+ vdev->name = strdup(name);
+ if (vdev->name == NULL) {
+ return -ENOMEM;
+ }
+
+ rc = pthread_mutex_init(&vdev->mutex, NULL);
+ if (rc != 0) {
+ free(vdev->name);
+ return -rc;
+ }
+
+ vdev->backend_ops = ops;
+ vdev->ctx = ctx;
+
+ return 0;
+}
+
+int
+virtio_dev_reset(struct virtio_dev *dev, uint64_t req_features)
+{
+ req_features |= (1ULL << VIRTIO_F_VERSION_1);
+
+ virtio_dev_stop(dev);
+
+ virtio_dev_set_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+ if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_ACKNOWLEDGE)) {
+ SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_ACKNOWLEDGE status.\n");
+ return -EIO;
+ }
+
+ virtio_dev_set_status(dev, VIRTIO_CONFIG_S_DRIVER);
+ if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_DRIVER)) {
+ SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER status.\n");
+ return -EIO;
+ }
+
+ return virtio_negotiate_features(dev, req_features);
+}
+
+int
+virtio_dev_start(struct virtio_dev *vdev, uint16_t max_queues, uint16_t fixed_queue_num)
+{
+ int ret;
+
+ ret = virtio_alloc_queues(vdev, max_queues, fixed_queue_num);
+ if (ret < 0) {
+ return ret;
+ }
+
+ virtio_dev_set_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK);
+ if (!(virtio_dev_get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER_OK status.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
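+/* Illustrative sketch, not part of the driver logic: the bring-up sequence a
+ * backend driver typically follows with the API above - construct the device
+ * with its backend ops, reset it and negotiate features, then allocate the
+ * virtqueues and set DRIVER_OK. The example_* name, feature mask and queue
+ * counts are placeholder choices of the caller.
+ */
+static __attribute__((unused)) int
+example_virtio_dev_bringup(struct virtio_dev *vdev, const char *name,
+ const struct virtio_dev_ops *ops, void *ctx, uint64_t features)
+{
+ int rc;
+
+ rc = virtio_dev_construct(vdev, name, ops, ctx);
+ if (rc != 0) {
+ return rc;
+ }
+
+ rc = virtio_dev_reset(vdev, features);
+ if (rc != 0) {
+ return rc;
+ }
+
+ /* one request queue, no fixed (e.g. control) queues */
+ return virtio_dev_start(vdev, 1, 0);
+}
+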
+void
+virtio_dev_destruct(struct virtio_dev *dev)
+{
+ virtio_dev_backend_ops(dev)->destruct_dev(dev);
+ pthread_mutex_destroy(&dev->mutex);
+ free(dev->name);
+}
+
+static void
+vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
+{
+ struct vring_desc *dp, *dp_tail;
+ struct vq_desc_extra *dxp;
+ uint16_t desc_idx_last = desc_idx;
+
+ dp = &vq->vq_ring.desc[desc_idx];
+ dxp = &vq->vq_descx[desc_idx];
+ vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs);
+ if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
+ while (dp->flags & VRING_DESC_F_NEXT) {
+ desc_idx_last = dp->next;
+ dp = &vq->vq_ring.desc[dp->next];
+ }
+ }
+ dxp->ndescs = 0;
+
+ /*
+ * We must append the existing free chain, if any, to the end of
+ * the newly freed chain. If the virtqueue was completely used, then
+ * the head would be VQ_RING_DESC_CHAIN_END.
+ */
+ if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) {
+ vq->vq_desc_head_idx = desc_idx;
+ } else {
+ dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx];
+ dp_tail->next = desc_idx;
+ }
+
+ vq->vq_desc_tail_idx = desc_idx_last;
+ dp->next = VQ_RING_DESC_CHAIN_END;
+}
+
+static uint16_t
+virtqueue_dequeue_burst_rx(struct virtqueue *vq, void **rx_pkts,
+ uint32_t *len, uint16_t num)
+{
+ struct vring_used_elem *uep;
+ void *cookie;
+ uint16_t used_idx, desc_idx;
+ uint16_t i;
+
+ /* The caller guarantees that num does not exceed the number of used entries */
+ for (i = 0; i < num ; i++) {
+ used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
+ uep = &vq->vq_ring.used->ring[used_idx];
+ desc_idx = (uint16_t) uep->id;
+ len[i] = uep->len;
+ cookie = vq->vq_descx[desc_idx].cookie;
+
+ if (spdk_unlikely(cookie == NULL)) {
+ SPDK_WARNLOG("vring descriptor with no mbuf cookie at %"PRIu16"\n",
+ vq->vq_used_cons_idx);
+ break;
+ }
+
+ __builtin_prefetch(cookie);
+
+ rx_pkts[i] = cookie;
+ vq->vq_used_cons_idx++;
+ vq_ring_free_chain(vq, desc_idx);
+ vq->vq_descx[desc_idx].cookie = NULL;
+ }
+
+ return i;
+}
+
+static void
+finish_req(struct virtqueue *vq)
+{
+ struct vring_desc *desc;
+ uint16_t avail_idx;
+
+ desc = &vq->vq_ring.desc[vq->req_end];
+ desc->flags &= ~VRING_DESC_F_NEXT;
+
+ /*
+ * Place the head of the descriptor chain into the next avail slot and
+ * make it available to the host. The chain is made available now rather
+ * than deferring to virtqueue_req_flush() in the hope that if the host
+ * is currently running on another CPU, we can keep it processing the
+ * new descriptor.
+ */
+ avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+ vq->vq_ring.avail->ring[avail_idx] = vq->req_start;
+ vq->vq_avail_idx++;
+ vq->req_end = VQ_RING_DESC_CHAIN_END;
+ virtio_wmb();
+ vq->vq_ring.avail->idx = vq->vq_avail_idx;
+ vq->reqs_finished++;
+}
+
+int
+virtqueue_req_start(struct virtqueue *vq, void *cookie, int iovcnt)
+{
+ struct vq_desc_extra *dxp;
+
+ if (iovcnt > vq->vq_free_cnt) {
+ return iovcnt > vq->vq_nentries ? -EINVAL : -ENOMEM;
+ }
+
+ if (vq->req_end != VQ_RING_DESC_CHAIN_END) {
+ finish_req(vq);
+ }
+
+ vq->req_start = vq->vq_desc_head_idx;
+ dxp = &vq->vq_descx[vq->req_start];
+ dxp->cookie = cookie;
+ dxp->ndescs = 0;
+
+ return 0;
+}
+
+void
+virtqueue_req_flush(struct virtqueue *vq)
+{
+ uint16_t reqs_finished;
+
+ if (vq->req_end == VQ_RING_DESC_CHAIN_END) {
+ /* no non-empty requests have been started */
+ return;
+ }
+
+ finish_req(vq);
+ virtio_mb();
+
+ reqs_finished = vq->reqs_finished;
+ vq->reqs_finished = 0;
+
+ if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
+ /* Set used event idx to a value the device will never reach.
+ * This effectively disables interrupts.
+ */
+ vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx - vq->vq_nentries - 1;
+
+ if (!vring_need_event(vring_avail_event(&vq->vq_ring),
+ vq->vq_avail_idx,
+ vq->vq_avail_idx - reqs_finished)) {
+ return;
+ }
+ } else if (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) {
+ return;
+ }
+
+ virtio_dev_backend_ops(vq->vdev)->notify_queue(vq->vdev, vq);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "Notified backend after xmit\n");
+}
+
+void
+virtqueue_req_abort(struct virtqueue *vq)
+{
+ struct vring_desc *desc;
+
+ if (vq->req_start == VQ_RING_DESC_CHAIN_END) {
+ /* no requests have been started */
+ return;
+ }
+
+ desc = &vq->vq_ring.desc[vq->req_end];
+ desc->flags &= ~VRING_DESC_F_NEXT;
+
+ vq_ring_free_chain(vq, vq->req_start);
+ vq->req_start = VQ_RING_DESC_CHAIN_END;
+}
+
+void
+virtqueue_req_add_iovs(struct virtqueue *vq, struct iovec *iovs, uint16_t iovcnt,
+ enum spdk_virtio_desc_type desc_type)
+{
+ struct vring_desc *desc;
+ struct vq_desc_extra *dxp;
+ uint16_t i, prev_head, new_head;
+
+ assert(vq->req_start != VQ_RING_DESC_CHAIN_END);
+ assert(iovcnt <= vq->vq_free_cnt);
+
+ /* TODO use indirect descriptors if iovcnt is high enough
+ * or the caller specifies SPDK_VIRTIO_DESC_F_INDIRECT
+ */
+
+ prev_head = vq->req_end;
+ new_head = vq->vq_desc_head_idx;
+ for (i = 0; i < iovcnt; ++i) {
+ desc = &vq->vq_ring.desc[new_head];
+
+ if (!vq->vdev->is_hw) {
+ desc->addr = (uintptr_t)iovs[i].iov_base;
+ } else {
+ desc->addr = spdk_vtophys(iovs[i].iov_base, NULL);
+ }
+
+ desc->len = iovs[i].iov_len;
+ /* Always set the NEXT flag; it is cleared on the last descriptor
+ * of the request in finish_req().
+ */
+ desc->flags = desc_type | VRING_DESC_F_NEXT;
+
+ prev_head = new_head;
+ new_head = desc->next;
+ }
+
+ dxp = &vq->vq_descx[vq->req_start];
+ dxp->ndescs += iovcnt;
+
+ vq->req_end = prev_head;
+ vq->vq_desc_head_idx = new_head;
+ vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - iovcnt);
+ if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END) {
+ assert(vq->vq_free_cnt == 0);
+ vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END;
+ }
+}
+
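+/* Illustrative sketch, not part of the driver logic: the request lifecycle
+ * implemented above. A request is started with a caller-owned cookie, its
+ * buffers are appended as device-readable and device-writable descriptors,
+ * and the flush notifies the backend; completions are later reaped with
+ * virtio_recv_pkts(). SPDK_VIRTIO_DESC_RO/WR are assumed to be the
+ * enum spdk_virtio_desc_type values from spdk_internal/virtio.h, and the
+ * example_* name and iovecs are placeholders.
+ */
+static __attribute__((unused)) int
+example_submit_request(struct virtqueue *vq, void *cookie,
+ struct iovec *req_iov, struct iovec *resp_iov)
+{
+ int rc;
+
+ rc = virtqueue_req_start(vq, cookie, 2);
+ if (rc != 0) {
+ /* -ENOMEM: ring temporarily full, -EINVAL: request can never fit */
+ return rc;
+ }
+
+ virtqueue_req_add_iovs(vq, req_iov, 1, SPDK_VIRTIO_DESC_RO);
+ virtqueue_req_add_iovs(vq, resp_iov, 1, SPDK_VIRTIO_DESC_WR);
+ virtqueue_req_flush(vq);
+
+ return 0;
+}
+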
+#define DESC_PER_CACHELINE (SPDK_CACHE_LINE_SIZE / sizeof(struct vring_desc))
+uint16_t
+virtio_recv_pkts(struct virtqueue *vq, void **io, uint32_t *len, uint16_t nb_pkts)
+{
+ uint16_t nb_used, num;
+
+ nb_used = vq->vq_ring.used->idx - vq->vq_used_cons_idx;
+ virtio_rmb();
+
+ num = (uint16_t)(spdk_likely(nb_used <= nb_pkts) ? nb_used : nb_pkts);
+ if (spdk_likely(num > DESC_PER_CACHELINE)) {
+ num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE);
+ }
+
+ return virtqueue_dequeue_burst_rx(vq, io, len, num);
+}
+
+int
+virtio_dev_acquire_queue(struct virtio_dev *vdev, uint16_t index)
+{
+ struct virtqueue *vq = NULL;
+
+ if (index >= vdev->max_queues) {
+ SPDK_ERRLOG("requested vq index %"PRIu16" exceeds max queue count %"PRIu16".\n",
+ index, vdev->max_queues);
+ return -1;
+ }
+
+ pthread_mutex_lock(&vdev->mutex);
+ vq = vdev->vqs[index];
+ if (vq == NULL || vq->owner_thread != NULL) {
+ pthread_mutex_unlock(&vdev->mutex);
+ return -1;
+ }
+
+ vq->owner_thread = spdk_get_thread();
+ pthread_mutex_unlock(&vdev->mutex);
+ return 0;
+}
+
+int32_t
+virtio_dev_find_and_acquire_queue(struct virtio_dev *vdev, uint16_t start_index)
+{
+ struct virtqueue *vq = NULL;
+ uint16_t i;
+
+ pthread_mutex_lock(&vdev->mutex);
+ for (i = start_index; i < vdev->max_queues; ++i) {
+ vq = vdev->vqs[i];
+ if (vq != NULL && vq->owner_thread == NULL) {
+ break;
+ }
+ }
+
+ if (vq == NULL || i == vdev->max_queues) {
+ SPDK_ERRLOG("no more unused virtio queues with idx >= %"PRIu16".\n", start_index);
+ pthread_mutex_unlock(&vdev->mutex);
+ return -1;
+ }
+
+ vq->owner_thread = spdk_get_thread();
+ pthread_mutex_unlock(&vdev->mutex);
+ return i;
+}
+
+struct spdk_thread *
+virtio_dev_queue_get_thread(struct virtio_dev *vdev, uint16_t index)
+{
+ struct spdk_thread *thread = NULL;
+
+ if (index >= vdev->max_queues) {
+ SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16"\n",
+ index, vdev->max_queues);
+ abort(); /* This is not recoverable */
+ }
+
+ pthread_mutex_lock(&vdev->mutex);
+ thread = vdev->vqs[index]->owner_thread;
+ pthread_mutex_unlock(&vdev->mutex);
+
+ return thread;
+}
+
+bool
+virtio_dev_queue_is_acquired(struct virtio_dev *vdev, uint16_t index)
+{
+ return virtio_dev_queue_get_thread(vdev, index) != NULL;
+}
+
+void
+virtio_dev_release_queue(struct virtio_dev *vdev, uint16_t index)
+{
+ struct virtqueue *vq = NULL;
+
+ if (index >= vdev->max_queues) {
+ SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16".\n",
+ index, vdev->max_queues);
+ return;
+ }
+
+ pthread_mutex_lock(&vdev->mutex);
+ vq = vdev->vqs[index];
+ if (vq == NULL) {
+ SPDK_ERRLOG("virtqueue at index %"PRIu16" is not initialized.\n", index);
+ pthread_mutex_unlock(&vdev->mutex);
+ return;
+ }
+
+ assert(vq->owner_thread == spdk_get_thread());
+ vq->owner_thread = NULL;
+ pthread_mutex_unlock(&vdev->mutex);
+}
+
+int
+virtio_dev_read_dev_config(struct virtio_dev *dev, size_t offset,
+ void *dst, int length)
+{
+ return virtio_dev_backend_ops(dev)->read_dev_cfg(dev, offset, dst, length);
+}
+
+int
+virtio_dev_write_dev_config(struct virtio_dev *dev, size_t offset,
+ const void *src, int length)
+{
+ return virtio_dev_backend_ops(dev)->write_dev_cfg(dev, offset, src, length);
+}
+
+void
+virtio_dev_stop(struct virtio_dev *dev)
+{
+ virtio_dev_backend_ops(dev)->set_status(dev, VIRTIO_CONFIG_S_RESET);
+ /* flush status write */
+ virtio_dev_backend_ops(dev)->get_status(dev);
+ virtio_free_queues(dev);
+}
+
+void
+virtio_dev_set_status(struct virtio_dev *dev, uint8_t status)
+{
+ if (status != VIRTIO_CONFIG_S_RESET) {
+ status |= virtio_dev_backend_ops(dev)->get_status(dev);
+ }
+
+ virtio_dev_backend_ops(dev)->set_status(dev, status);
+}
+
+uint8_t
+virtio_dev_get_status(struct virtio_dev *dev)
+{
+ return virtio_dev_backend_ops(dev)->get_status(dev);
+}
+
+const struct virtio_dev_ops *
+virtio_dev_backend_ops(struct virtio_dev *dev)
+{
+ return dev->backend_ops;
+}
+
+void
+virtio_dev_dump_json_info(struct virtio_dev *hw, struct spdk_json_write_ctx *w)
+{
+ spdk_json_write_named_object_begin(w, "virtio");
+
+ spdk_json_write_named_uint32(w, "vq_count", hw->max_queues);
+
+ spdk_json_write_named_uint32(w, "vq_size",
+ virtio_dev_backend_ops(hw)->get_queue_size(hw, 0));
+
+ virtio_dev_backend_ops(hw)->dump_json_info(hw, w);
+
+ spdk_json_write_object_end(w);
+}
+
+SPDK_LOG_REGISTER_COMPONENT("virtio_dev", SPDK_LOG_VIRTIO_DEV)
diff --git a/src/spdk/lib/virtio/virtio_pci.c b/src/spdk/lib/virtio/virtio_pci.c
new file mode 100644
index 000000000..646f77c1a
--- /dev/null
+++ b/src/spdk/lib/virtio/virtio_pci.c
@@ -0,0 +1,599 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "spdk/memory.h"
+#include "spdk/mmio.h"
+#include "spdk/string.h"
+#include "spdk/env.h"
+
+#include "spdk_internal/virtio.h"
+
+struct virtio_hw {
+ uint8_t use_msix;
+ uint32_t notify_off_multiplier;
+ uint8_t *isr;
+ uint16_t *notify_base;
+
+ struct {
+ /** Mem-mapped resources from given PCI BAR */
+ void *vaddr;
+
+ /** Length of the address space */
+ uint32_t len;
+ } pci_bar[6];
+
+ struct virtio_pci_common_cfg *common_cfg;
+ struct spdk_pci_device *pci_dev;
+
+ /** Device-specific PCI config space */
+ void *dev_cfg;
+};
+
+struct virtio_pci_probe_ctx {
+ virtio_pci_create_cb enum_cb;
+ void *enum_ctx;
+ uint16_t device_id;
+};
+
+/*
+ * The following macros are derived from linux/pci_regs.h. However,
+ * we can't simply include that header here, as there is no such
+ * file on non-Linux platforms.
+ */
+#define PCI_CAPABILITY_LIST 0x34
+#define PCI_CAP_ID_VNDR 0x09
+#define PCI_CAP_ID_MSIX 0x11
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+ /* The virtio PCI device's VIRTIO_PCI_QUEUE_PFN register is 32-bit
+ * and only accepts a 32-bit page frame number.
+ * Check whether the allocated physical memory exceeds 16TB.
+ */
+ if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+ (VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+ SPDK_ERRLOG("vring address shouldn't be above 16TB!\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static void
+free_virtio_hw(struct virtio_hw *hw)
+{
+ unsigned i;
+
+ for (i = 0; i < 6; ++i) {
+ if (hw->pci_bar[i].vaddr == NULL) {
+ continue;
+ }
+
+ spdk_pci_device_unmap_bar(hw->pci_dev, i, hw->pci_bar[i].vaddr);
+ }
+
+ free(hw);
+}
+
+static void
+pci_dump_json_info(struct virtio_dev *dev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_hw *hw = dev->ctx;
+ struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr((struct spdk_pci_device *)hw->pci_dev);
+ char addr[32];
+
+ spdk_json_write_name(w, "type");
+ if (dev->modern) {
+ spdk_json_write_string(w, "pci-modern");
+ } else {
+ spdk_json_write_string(w, "pci-legacy");
+ }
+
+ spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr);
+ spdk_json_write_named_string(w, "pci_address", addr);
+}
+
+static void
+pci_write_json_config(struct virtio_dev *dev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_hw *hw = dev->ctx;
+ struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(hw->pci_dev);
+ char addr[32];
+
+ spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr);
+
+ spdk_json_write_named_string(w, "trtype", "pci");
+ spdk_json_write_named_string(w, "traddr", addr);
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+ spdk_mmio_write_4(lo, val & ((1ULL << 32) - 1));
+ spdk_mmio_write_4(hi, val >> 32);
+}
+
+static int
+modern_read_dev_config(struct virtio_dev *dev, size_t offset,
+ void *dst, int length)
+{
+ struct virtio_hw *hw = dev->ctx;
+ int i;
+ uint8_t *p;
+ uint8_t old_gen, new_gen;
+
+ do {
+ old_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation);
+
+ p = dst;
+ for (i = 0; i < length; i++) {
+ *p++ = spdk_mmio_read_1((uint8_t *)hw->dev_cfg + offset + i);
+ }
+
+ new_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation);
+ } while (old_gen != new_gen);
+
+ return 0;
+}
+
+static int
+modern_write_dev_config(struct virtio_dev *dev, size_t offset,
+ const void *src, int length)
+{
+ struct virtio_hw *hw = dev->ctx;
+ int i;
+ const uint8_t *p = src;
+
+ for (i = 0; i < length; i++) {
+ spdk_mmio_write_1(((uint8_t *)hw->dev_cfg) + offset + i, *p++);
+ }
+
+ return 0;
+}
+
+static uint64_t
+modern_get_features(struct virtio_dev *dev)
+{
+ struct virtio_hw *hw = dev->ctx;
+ uint32_t features_lo, features_hi;
+
+ spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 0);
+ features_lo = spdk_mmio_read_4(&hw->common_cfg->device_feature);
+
+ spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 1);
+ features_hi = spdk_mmio_read_4(&hw->common_cfg->device_feature);
+
+ return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static int
+modern_set_features(struct virtio_dev *dev, uint64_t features)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ if ((features & (1ULL << VIRTIO_F_VERSION_1)) == 0) {
+ SPDK_ERRLOG("VIRTIO_F_VERSION_1 feature is not enabled.\n");
+ return -EINVAL;
+ }
+
+ spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 0);
+ spdk_mmio_write_4(&hw->common_cfg->guest_feature, features & ((1ULL << 32) - 1));
+
+ spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 1);
+ spdk_mmio_write_4(&hw->common_cfg->guest_feature, features >> 32);
+
+ dev->negotiated_features = features;
+
+ return 0;
+}
+
+static void
+modern_destruct_dev(struct virtio_dev *vdev)
+{
+ struct virtio_hw *hw = vdev->ctx;
+ struct spdk_pci_device *pci_dev = hw->pci_dev;
+
+ free_virtio_hw(hw);
+ spdk_pci_device_detach(pci_dev);
+}
+
+static uint8_t
+modern_get_status(struct virtio_dev *dev)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ return spdk_mmio_read_1(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_dev *dev, uint8_t status)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ spdk_mmio_write_1(&hw->common_cfg->device_status, status);
+}
+
+static uint16_t
+modern_get_queue_size(struct virtio_dev *dev, uint16_t queue_id)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_select, queue_id);
+ return spdk_mmio_read_2(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_dev *dev, struct virtqueue *vq)
+{
+ struct virtio_hw *hw = dev->ctx;
+ uint64_t desc_addr, avail_addr, used_addr;
+ uint16_t notify_off;
+ void *queue_mem;
+ uint64_t queue_mem_phys_addr;
+
+ /* To ensure physical address contiguity we make the queue occupy
+ * only a single hugepage (2MB). As of Virtio 1.0, the queue size
+ * always falls within this limit.
+ */
+ if (vq->vq_ring_size > VALUE_2MB) {
+ return -ENOMEM;
+ }
+
+ queue_mem = spdk_zmalloc(vq->vq_ring_size, VALUE_2MB, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (queue_mem == NULL) {
+ return -ENOMEM;
+ }
+
+ queue_mem_phys_addr = spdk_vtophys(queue_mem, NULL);
+ if (queue_mem_phys_addr == SPDK_VTOPHYS_ERROR) {
+ spdk_free(queue_mem);
+ return -EFAULT;
+ }
+
+ vq->vq_ring_mem = queue_mem_phys_addr;
+ vq->vq_ring_virt_mem = queue_mem;
+
+ if (!check_vq_phys_addr_ok(vq)) {
+ spdk_free(queue_mem);
+ return -ENOMEM;
+ }
+
+ desc_addr = vq->vq_ring_mem;
+ avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+ used_addr = (avail_addr + offsetof(struct vring_avail, ring[vq->vq_nentries])
+ + VIRTIO_PCI_VRING_ALIGN - 1) & ~(VIRTIO_PCI_VRING_ALIGN - 1);
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index);
+
+ io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+ &hw->common_cfg->queue_desc_hi);
+ io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+ &hw->common_cfg->queue_avail_hi);
+ io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+ &hw->common_cfg->queue_used_hi);
+
+ notify_off = spdk_mmio_read_2(&hw->common_cfg->queue_notify_off);
+ vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+ notify_off * hw->notify_off_multiplier);
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_enable, 1);
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "queue %"PRIu16" addresses:\n", vq->vq_queue_index);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t desc_addr: %" PRIx64 "\n", desc_addr);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t aval_addr: %" PRIx64 "\n", avail_addr);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t used_addr: %" PRIx64 "\n", used_addr);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t notify addr: %p (notify offset: %"PRIu16")\n",
+ vq->notify_addr, notify_off);
+
+ return 0;
+}
+
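+/* Worked example, not part of the driver logic, of the layout computed above:
+ * for vq_nentries = 128 and VIRTIO_PCI_VRING_ALIGN = 4096, the descriptor
+ * table occupies 128 * 16 = 2048 bytes, so avail_addr = desc_addr + 2048.
+ * The avail ring header plus 128 entries is 4 + 2 * 128 = 260 bytes, and
+ * rounding 2048 + 260 up to the 4096-byte alignment places used_addr at
+ * desc_addr + 4096. All three regions fit in the single 2MB allocation.
+ */
+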
+static void
+modern_del_queue(struct virtio_dev *dev, struct virtqueue *vq)
+{
+ struct virtio_hw *hw = dev->ctx;
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index);
+
+ io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+ &hw->common_cfg->queue_desc_hi);
+ io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+ &hw->common_cfg->queue_avail_hi);
+ io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+ &hw->common_cfg->queue_used_hi);
+
+ spdk_mmio_write_2(&hw->common_cfg->queue_enable, 0);
+
+ spdk_free(vq->vq_ring_virt_mem);
+}
+
+static void
+modern_notify_queue(struct virtio_dev *dev, struct virtqueue *vq)
+{
+ spdk_mmio_write_2(vq->notify_addr, vq->vq_queue_index);
+}
+
+static const struct virtio_dev_ops modern_ops = {
+ .read_dev_cfg = modern_read_dev_config,
+ .write_dev_cfg = modern_write_dev_config,
+ .get_status = modern_get_status,
+ .set_status = modern_set_status,
+ .get_features = modern_get_features,
+ .set_features = modern_set_features,
+ .destruct_dev = modern_destruct_dev,
+ .get_queue_size = modern_get_queue_size,
+ .setup_queue = modern_setup_queue,
+ .del_queue = modern_del_queue,
+ .notify_queue = modern_notify_queue,
+ .dump_json_info = pci_dump_json_info,
+ .write_json_config = pci_write_json_config,
+};
+
+static void *
+get_cfg_addr(struct virtio_hw *hw, struct virtio_pci_cap *cap)
+{
+ uint8_t bar = cap->bar;
+ uint32_t length = cap->length;
+ uint32_t offset = cap->offset;
+
+ if (bar > 5) {
+ SPDK_ERRLOG("invalid bar: %"PRIu8"\n", bar);
+ return NULL;
+ }
+
+ if (offset + length < offset) {
+ SPDK_ERRLOG("offset(%"PRIu32") + length(%"PRIu32") overflows\n",
+ offset, length);
+ return NULL;
+ }
+
+ if (offset + length > hw->pci_bar[bar].len) {
+ SPDK_ERRLOG("invalid cap: overflows bar space: %"PRIu32" > %"PRIu32"\n",
+ offset + length, hw->pci_bar[bar].len);
+ return NULL;
+ }
+
+ if (hw->pci_bar[bar].vaddr == NULL) {
+ SPDK_ERRLOG("bar %"PRIu8" base addr is NULL\n", bar);
+ return NULL;
+ }
+
+ return hw->pci_bar[bar].vaddr + offset;
+}
+
+static int
+virtio_read_caps(struct virtio_hw *hw)
+{
+ uint8_t pos;
+ struct virtio_pci_cap cap;
+ int ret;
+
+ ret = spdk_pci_device_cfg_read(hw->pci_dev, &pos, 1, PCI_CAPABILITY_LIST);
+ if (ret < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "failed to read pci capability list\n");
+ return ret;
+ }
+
+ while (pos) {
+ ret = spdk_pci_device_cfg_read(hw->pci_dev, &cap, sizeof(cap), pos);
+ if (ret < 0) {
+ SPDK_ERRLOG("failed to read pci cap at pos: %"PRIx8"\n", pos);
+ break;
+ }
+
+ if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+ hw->use_msix = 1;
+ }
+
+ if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI,
+ "[%2"PRIx8"] skipping non VNDR cap id: %02"PRIx8"\n",
+ pos, cap.cap_vndr);
+ goto next;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI,
+ "[%2"PRIx8"] cfg type: %"PRIu8", bar: %"PRIu8", offset: %04"PRIx32", len: %"PRIu32"\n",
+ pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+ switch (cap.cfg_type) {
+ case VIRTIO_PCI_CAP_COMMON_CFG:
+ hw->common_cfg = get_cfg_addr(hw, &cap);
+ break;
+ case VIRTIO_PCI_CAP_NOTIFY_CFG:
+ spdk_pci_device_cfg_read(hw->pci_dev, &hw->notify_off_multiplier,
+ 4, pos + sizeof(cap));
+ hw->notify_base = get_cfg_addr(hw, &cap);
+ break;
+ case VIRTIO_PCI_CAP_DEVICE_CFG:
+ hw->dev_cfg = get_cfg_addr(hw, &cap);
+ break;
+ case VIRTIO_PCI_CAP_ISR_CFG:
+ hw->isr = get_cfg_addr(hw, &cap);
+ break;
+ }
+
+next:
+ pos = cap.cap_next;
+ }
+
+ if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+ hw->dev_cfg == NULL || hw->isr == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "no modern virtio pci device found.\n");
+ if (ret < 0) {
+ return ret;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "found modern virtio pci device.\n");
+
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "common cfg mapped at: %p\n", hw->common_cfg);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "device cfg mapped at: %p\n", hw->dev_cfg);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "isr cfg mapped at: %p\n", hw->isr);
+ SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "notify base: %p, notify off multiplier: %u\n",
+ hw->notify_base, hw->notify_off_multiplier);
+
+ return 0;
+}
+
+static int
+virtio_pci_dev_probe(struct spdk_pci_device *pci_dev, struct virtio_pci_probe_ctx *ctx)
+{
+ struct virtio_hw *hw;
+ uint8_t *bar_vaddr;
+ uint64_t bar_paddr, bar_len;
+ int rc;
+ unsigned i;
+ char bdf[32];
+ struct spdk_pci_addr addr;
+
+ addr = spdk_pci_device_get_addr(pci_dev);
+ rc = spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr);
+ if (rc != 0) {
+ SPDK_ERRLOG("Ignoring a device with non-parseable PCI address\n");
+ return -1;
+ }
+
+ hw = calloc(1, sizeof(*hw));
+ if (hw == NULL) {
+ SPDK_ERRLOG("%s: calloc failed\n", bdf);
+ return -1;
+ }
+
+ hw->pci_dev = pci_dev;
+
+ for (i = 0; i < 6; ++i) {
+ rc = spdk_pci_device_map_bar(pci_dev, i, (void *) &bar_vaddr, &bar_paddr,
+ &bar_len);
+ if (rc != 0) {
+ SPDK_ERRLOG("%s: failed to memmap PCI BAR %u\n", bdf, i);
+ free_virtio_hw(hw);
+ return -1;
+ }
+
+ hw->pci_bar[i].vaddr = bar_vaddr;
+ hw->pci_bar[i].len = bar_len;
+ }
+
+ /* Virtio PCI caps exist only on modern PCI devices.
+ * Legacy devices are not supported.
+ */
+ if (virtio_read_caps(hw) != 0) {
+ SPDK_NOTICELOG("Ignoring legacy PCI device at %s\n", bdf);
+ free_virtio_hw(hw);
+ return -1;
+ }
+
+ rc = ctx->enum_cb((struct virtio_pci_ctx *)hw, ctx->enum_ctx);
+ if (rc != 0) {
+ free_virtio_hw(hw);
+ }
+
+ return rc;
+}
+
+static int
+virtio_pci_dev_probe_cb(void *probe_ctx, struct spdk_pci_device *pci_dev)
+{
+ struct virtio_pci_probe_ctx *ctx = probe_ctx;
+ uint16_t pci_device_id = spdk_pci_device_get_device_id(pci_dev);
+
+ if (pci_device_id != ctx->device_id) {
+ return 1;
+ }
+
+ return virtio_pci_dev_probe(pci_dev, ctx);
+}
+
+int
+virtio_pci_dev_enumerate(virtio_pci_create_cb enum_cb, void *enum_ctx,
+ uint16_t pci_device_id)
+{
+ struct virtio_pci_probe_ctx ctx;
+
+ if (!spdk_process_is_primary()) {
+ SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n");
+ return 0;
+ }
+
+ ctx.enum_cb = enum_cb;
+ ctx.enum_ctx = enum_ctx;
+ ctx.device_id = pci_device_id;
+
+ return spdk_pci_enumerate(spdk_pci_virtio_get_driver(),
+ virtio_pci_dev_probe_cb, &ctx);
+}
+
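+/* Illustrative sketch, not part of the driver logic: a minimal create callback
+ * as it would be passed to virtio_pci_dev_enumerate() above. The callback
+ * receives the probed PCI context, binds it to a caller-allocated virtio_dev
+ * via virtio_pci_dev_init() and returns 0 to keep the device. The example_*
+ * name and the "VirtioExample" device name are placeholders; a real driver
+ * would store vdev in its own structure.
+ */
+static __attribute__((unused)) int
+example_virtio_pci_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx)
+{
+ struct virtio_dev *vdev;
+
+ (void)ctx; /* unused in this sketch */
+
+ vdev = calloc(1, sizeof(*vdev));
+ if (vdev == NULL) {
+ return -ENOMEM;
+ }
+
+ if (virtio_pci_dev_init(vdev, "VirtioExample", pci_ctx) != 0) {
+ free(vdev);
+ return -1;
+ }
+
+ /* vdev is now ready for virtio_dev_reset() / virtio_dev_start(). */
+ return 0;
+}
+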
+int
+virtio_pci_dev_attach(virtio_pci_create_cb enum_cb, void *enum_ctx,
+ uint16_t pci_device_id, struct spdk_pci_addr *pci_address)
+{
+ struct virtio_pci_probe_ctx ctx;
+
+ if (!spdk_process_is_primary()) {
+ SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n");
+ return 0;
+ }
+
+ ctx.enum_cb = enum_cb;
+ ctx.enum_ctx = enum_ctx;
+ ctx.device_id = pci_device_id;
+
+ return spdk_pci_device_attach(spdk_pci_virtio_get_driver(),
+ virtio_pci_dev_probe_cb, &ctx, pci_address);
+}
+
+int
+virtio_pci_dev_init(struct virtio_dev *vdev, const char *name,
+ struct virtio_pci_ctx *pci_ctx)
+{
+ int rc;
+
+ rc = virtio_dev_construct(vdev, name, &modern_ops, pci_ctx);
+ if (rc != 0) {
+ return rc;
+ }
+
+ vdev->is_hw = 1;
+ vdev->modern = 1;
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("virtio_pci", SPDK_LOG_VIRTIO_PCI)
diff --git a/src/spdk/lib/virtio/virtio_user.c b/src/spdk/lib/virtio/virtio_user.c
new file mode 100644
index 000000000..4f4932db9
--- /dev/null
+++ b/src/spdk/lib/virtio/virtio_user.c
@@ -0,0 +1,628 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include <sys/eventfd.h>
+
+#include "vhost_user.h"
+#include "spdk/string.h"
+#include "spdk/config.h"
+
+#include "spdk_internal/virtio.h"
+
+#define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \
+ ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_CONFIG))
+
+static int
+virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ /* Of all the per-virtqueue messages, VHOST_USER_SET_VRING_CALL must come
+ * first, because vhost depends on this message to allocate the virtqueue
+ * pair.
+ */
+ struct vhost_vring_file file;
+
+ file.index = queue_sel;
+ file.fd = dev->callfds[queue_sel];
+ return dev->ops->send_request(dev, VHOST_USER_SET_VRING_CALL, &file);
+}
+
+static int
+virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vring *vring = &dev->vrings[queue_sel];
+ struct vhost_vring_addr addr = {
+ .index = queue_sel,
+ .desc_user_addr = (uint64_t)(uintptr_t)vring->desc,
+ .avail_user_addr = (uint64_t)(uintptr_t)vring->avail,
+ .used_user_addr = (uint64_t)(uintptr_t)vring->used,
+ .log_guest_addr = 0,
+ .flags = 0, /* disable log */
+ };
+
+ return dev->ops->send_request(dev, VHOST_USER_SET_VRING_ADDR, &addr);
+}
+
+static int
+virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_vring_file file;
+ struct vhost_vring_state state;
+ struct vring *vring = &dev->vrings[queue_sel];
+ int rc;
+
+ state.index = queue_sel;
+ state.num = vring->num;
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_NUM, &state);
+ if (rc < 0) {
+ return rc;
+ }
+
+ state.index = queue_sel;
+ state.num = 0; /* no reservation */
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_BASE, &state);
+ if (rc < 0) {
+ return rc;
+ }
+
+ rc = virtio_user_set_vring_addr(vdev, queue_sel);
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* Of all the per-virtqueue messages, VHOST_USER_SET_VRING_KICK must come
+ * last, because vhost depends on this message to judge whether
+ * virtio is ready.
+ */
+ file.index = queue_sel;
+ file.fd = dev->kickfds[queue_sel];
+ return dev->ops->send_request(dev, VHOST_USER_SET_VRING_KICK, &file);
+}
+
+static int
+virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_vring_state state;
+
+ state.index = queue_sel;
+ state.num = 0;
+
+ return dev->ops->send_request(dev, VHOST_USER_GET_VRING_BASE, &state);
+}
+
+static int
+virtio_user_queue_setup(struct virtio_dev *vdev,
+ int (*fn)(struct virtio_dev *, uint32_t))
+{
+ uint32_t i;
+ int rc;
+
+ for (i = 0; i < vdev->max_queues; ++i) {
+ rc = fn(vdev, i);
+ if (rc < 0) {
+ SPDK_ERRLOG("setup tx vq fails: %"PRIu32".\n", i);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int
+virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t size)
+{
+ struct virtio_dev *vdev = cb_ctx;
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t features;
+ int ret;
+
+ /* We have to resend all mappings anyway, so don't bother with any
+ * page tracking.
+ */
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+
+#ifdef SPDK_CONFIG_VHOST_INTERNAL_LIB
+ /* Our internal rte_vhost lib requires SET_VRING_ADDR to flush a pending
+ * SET_MEM_TABLE. On the other hand, the upstream rte_vhost will invalidate
+ * the entire queue upon receiving the SET_VRING_ADDR message, so we mustn't
+ * send it here. Both behaviors are strictly implementation specific, but
+ * this message isn't required by the spec, so we send it only
+ * if vhost is compiled with our internal lib.
+ */
+ ret = virtio_user_queue_setup(vdev, virtio_user_set_vring_addr);
+ if (ret < 0) {
+ return ret;
+ }
+#endif
+
+ /* Since we might want to use that mapping straight away, we have to
+ * make sure the vhost backend has already processed our SET_MEM_TABLE
+ * message. F_REPLY_ACK is just a feature and the backend is not obliged
+ * to support it, so we send a simple message that always has a response
+ * and we wait for that response. Messages are always processed in order.
+ */
+ return dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features);
+}
+
+static int
+virtio_user_register_mem(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ const struct spdk_mem_map_ops virtio_user_map_ops = {
+ .notify_cb = virtio_user_map_notify,
+ .are_contiguous = NULL
+ };
+
+ dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev);
+ if (dev->mem_map == NULL) {
+ SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void
+virtio_user_unregister_mem(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_mem_map_free(&dev->mem_map);
+}
+
+static int
+virtio_user_start_device(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t host_max_queues;
+ int ret;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 &&
+ vdev->max_queues > 1 + vdev->fixed_queues_num) {
+ SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the "
+ "host doesn't support VHOST_USER_PROTOCOL_F_MQ. "
+ "Only one request queue will be used.\n",
+ vdev->name, vdev->max_queues - vdev->fixed_queues_num);
+ vdev->max_queues = 1 + vdev->fixed_queues_num;
+ }
+
+ /* negotiate the number of I/O queues. */
+ ret = dev->ops->send_request(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) {
+ SPDK_WARNLOG("%s: requested %"PRIu16" request queues"
+ "but only %"PRIu64" available\n",
+ vdev->name, vdev->max_queues - vdev->fixed_queues_num,
+ host_max_queues);
+ vdev->max_queues = host_max_queues;
+ }
+
+ /* tell vhost to create queues */
+ ret = virtio_user_queue_setup(vdev, virtio_user_create_queue);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = virtio_user_register_mem(vdev);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return virtio_user_queue_setup(vdev, virtio_user_kick_queue);
+}
+
+static int
+virtio_user_stop_device(struct virtio_dev *vdev)
+{
+ int ret;
+
+ ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue);
+ /* a queue might fail to stop for various reasons, e.g. socket
+ * connection going down, but this mustn't prevent us from freeing
+ * the mem map.
+ */
+ virtio_user_unregister_mem(vdev);
+ return ret;
+}
+
+static int
+virtio_user_dev_setup(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint16_t i;
+
+ dev->vhostfd = -1;
+
+ for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) {
+ dev->callfds[i] = -1;
+ dev->kickfds[i] = -1;
+ }
+
+ dev->ops = &ops_user;
+
+ return dev->ops->setup(dev);
+}
+
+static int
+virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset,
+ void *dst, int length)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_user_config cfg = {0};
+ int rc;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
+ return -ENOTSUP;
+ }
+
+ cfg.offset = 0;
+ cfg.size = VHOST_USER_MAX_CONFIG_SIZE;
+
+ rc = dev->ops->send_request(dev, VHOST_USER_GET_CONFIG, &cfg);
+ if (rc < 0) {
+ SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ memcpy(dst, cfg.region + offset, length);
+ return 0;
+}
+
+static int
+virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset,
+ const void *src, int length)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_user_config cfg = {0};
+ int rc;
+
+ if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
+ return -ENOTSUP;
+ }
+
+ cfg.offset = offset;
+ cfg.size = length;
+ memcpy(cfg.region, src, length);
+
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_CONFIG, &cfg);
+ if (rc < 0) {
+ SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc));
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+virtio_user_set_status(struct virtio_dev *vdev, uint8_t status)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ int rc = 0;
+
+ if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) &&
+ status != VIRTIO_CONFIG_S_RESET) {
+ rc = -1;
+ } else if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+ rc = virtio_user_start_device(vdev);
+ } else if (status == VIRTIO_CONFIG_S_RESET &&
+ (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+ rc = virtio_user_stop_device(vdev);
+ }
+
+ if (rc != 0) {
+ dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET;
+ } else {
+ dev->status = status;
+ }
+}
+
+static uint8_t
+virtio_user_get_status(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ return dev->status;
+}
+
+static uint64_t
+virtio_user_get_features(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t features;
+ int rc;
+
+ rc = dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features);
+ if (rc < 0) {
+ SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc));
+ return 0;
+ }
+
+ return features;
+}
+
+static int
+virtio_user_set_features(struct virtio_dev *vdev, uint64_t features)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ uint64_t protocol_features;
+ int ret;
+
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ vdev->negotiated_features = features;
+ vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1);
+
+ if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ /* nothing else to do */
+ return 0;
+ }
+
+ ret = dev->ops->send_request(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES;
+ ret = dev->ops->send_request(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features);
+ if (ret < 0) {
+ return ret;
+ }
+
+ dev->protocol_features = protocol_features;
+ return 0;
+}
+
+static uint16_t
+virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ /* Currently every queue has the same queue size */
+ return dev->queue_size;
+}
+
+static int
+virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+ struct vhost_vring_state state;
+ uint16_t queue_idx = vq->vq_queue_index;
+ void *queue_mem;
+ uint64_t desc_addr, avail_addr, used_addr;
+ int callfd, kickfd, rc;
+
+ if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) {
+ SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx);
+ return -EEXIST;
+ }
+
+	/* We could pass an invalid flag here, but some backends use kickfd and
+	 * callfd to judge whether the device is alive, so use real eventfds.
+	 */
+ callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (callfd < 0) {
+ SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno));
+ return -errno;
+ }
+
+ kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (kickfd < 0) {
+ SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno));
+ close(callfd);
+ return -errno;
+ }
+
+ queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL,
+ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
+ if (queue_mem == NULL) {
+ close(kickfd);
+ close(callfd);
+ return -ENOMEM;
+ }
+
+ vq->vq_ring_mem = SPDK_VTOPHYS_ERROR;
+ vq->vq_ring_virt_mem = queue_mem;
+
+ state.index = vq->vq_queue_index;
+ state.num = 0;
+
+ if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ENABLE, &state);
+ if (rc < 0) {
+ SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n",
+ spdk_strerror(-rc));
+ close(kickfd);
+ close(callfd);
+ spdk_free(queue_mem);
+			return rc;
+ }
+ }
+
+ dev->callfds[queue_idx] = callfd;
+ dev->kickfds[queue_idx] = kickfd;
+
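+	/* Lay out the split virtqueue inside the allocated buffer: descriptor
+	 * table first, then the available ring, then the used ring aligned to
+	 * VIRTIO_PCI_VRING_ALIGN.
+	 */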
+ desc_addr = (uintptr_t)vq->vq_ring_virt_mem;
+ avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+ used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+ ring[vq->vq_nentries]),
+ VIRTIO_PCI_VRING_ALIGN);
+
+ dev->vrings[queue_idx].num = vq->vq_nentries;
+ dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr;
+ dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr;
+ dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr;
+
+ return 0;
+}
+
+static void
+virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+	/* For legacy devices, writing 0 to the VIRTIO_PCI_QUEUE_PFN port makes
+	 * QEMU stop the ioeventfds and reset the device status.
+	 * For modern devices, the queue desc, avail and used addresses in the
+	 * PCI BAR are set to 0, with no further action observed in QEMU.
+	 *
+	 * Here we only care about what information to deliver to vhost-user,
+	 * so we just close the eventfds for now.
+	 */
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ close(dev->callfds[vq->vq_queue_index]);
+ close(dev->kickfds[vq->vq_queue_index]);
+ dev->callfds[vq->vq_queue_index] = -1;
+ dev->kickfds[vq->vq_queue_index] = -1;
+
+ spdk_free(vq->vq_ring_virt_mem);
+}
+
+static void
+virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq)
+{
+ uint64_t buf = 1;
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) {
+ SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno));
+ }
+}
+
+static void
+virtio_user_destroy(struct virtio_dev *vdev)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ close(dev->vhostfd);
+ free(dev);
+}
+
+static void
+virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_json_write_named_string(w, "type", "user");
+ spdk_json_write_named_string(w, "socket", dev->path);
+}
+
+static void
+virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
+{
+ struct virtio_user_dev *dev = vdev->ctx;
+
+ spdk_json_write_named_string(w, "trtype", "user");
+ spdk_json_write_named_string(w, "traddr", dev->path);
+ spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num);
+ spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0));
+}
+
+static const struct virtio_dev_ops virtio_user_ops = {
+ .read_dev_cfg = virtio_user_read_dev_config,
+ .write_dev_cfg = virtio_user_write_dev_config,
+ .get_status = virtio_user_get_status,
+ .set_status = virtio_user_set_status,
+ .get_features = virtio_user_get_features,
+ .set_features = virtio_user_set_features,
+ .destruct_dev = virtio_user_destroy,
+ .get_queue_size = virtio_user_get_queue_size,
+ .setup_queue = virtio_user_setup_queue,
+ .del_queue = virtio_user_del_queue,
+ .notify_queue = virtio_user_notify_queue,
+ .dump_json_info = virtio_user_dump_json_info,
+ .write_json_config = virtio_user_write_json_config,
+};
+
+int
+virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path,
+ uint32_t queue_size)
+{
+ struct virtio_user_dev *dev;
+ int rc;
+
+ if (name == NULL) {
+		SPDK_ERRLOG("No name given for controller: %s\n", path);
+ return -EINVAL;
+ }
+
+ dev = calloc(1, sizeof(*dev));
+ if (dev == NULL) {
+ return -ENOMEM;
+ }
+
+ rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to init device: %s\n", path);
+ free(dev);
+ return rc;
+ }
+
+ vdev->is_hw = 0;
+
+ snprintf(dev->path, PATH_MAX, "%s", path);
+ dev->queue_size = queue_size;
+
+ rc = virtio_user_dev_setup(vdev);
+ if (rc < 0) {
+		SPDK_ERRLOG("backend setup failed\n");
+ goto err;
+ }
+
+ rc = dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL);
+ if (rc < 0) {
+		SPDK_ERRLOG("set_owner failed: %s\n", spdk_strerror(-rc));
+ goto err;
+ }
+
+ return 0;
+
+err:
+ virtio_dev_destruct(vdev);
+ return rc;
+}
diff --git a/src/spdk/lib/vmd/Makefile b/src/spdk/lib/vmd/Makefile
new file mode 100644
index 000000000..13813c559
--- /dev/null
+++ b/src/spdk/lib/vmd/Makefile
@@ -0,0 +1,45 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 2
+SO_MINOR := 0
+
+C_SRCS = vmd.c led.c
+LIBNAME = vmd
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_vmd.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/vmd/led.c b/src/spdk/lib/vmd/led.c
new file mode 100644
index 000000000..878983aab
--- /dev/null
+++ b/src/spdk/lib/vmd/led.c
@@ -0,0 +1,166 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/likely.h"
+#include "spdk/log.h"
+#include "vmd.h"
+
+struct vmd_led_indicator_config {
+ uint8_t attention_indicator : 2;
+ uint8_t power_indicator : 2;
+ uint8_t reserved : 4;
+};
+
+/*
+ * VMD LED Attn Power LED Amber
+ * State Indicator Indicator
+ * Control Control
+ * ------------------------------------------------
+ * Off 11b 11b Off
+ * Ident 11b 01b Blink 4Hz
+ * Fault 01b 11b On
+ * Rebuild 01b 01b Blink 1Hz
+ */
+static const struct vmd_led_indicator_config g_led_config[] = {
+ [SPDK_VMD_LED_STATE_OFF] = { .attention_indicator = 3, .power_indicator = 3 },
+ [SPDK_VMD_LED_STATE_IDENTIFY] = { .attention_indicator = 3, .power_indicator = 1 },
+ [SPDK_VMD_LED_STATE_FAULT] = { .attention_indicator = 1, .power_indicator = 3 },
+ [SPDK_VMD_LED_STATE_REBUILD] = { .attention_indicator = 1, .power_indicator = 1 },
+};
+
+static void
+vmd_led_set_indicator_control(struct vmd_pci_device *vmd_device, enum spdk_vmd_led_state state)
+{
+ const struct vmd_led_indicator_config *config;
+ union express_slot_control_register slot_control;
+
+ assert(state >= SPDK_VMD_LED_STATE_OFF && state <= SPDK_VMD_LED_STATE_REBUILD);
+ config = &g_led_config[state];
+
+ slot_control = vmd_device->pcie_cap->slot_control;
+ slot_control.bit_field.attention_indicator_control = config->attention_indicator;
+ slot_control.bit_field.power_indicator_control = config->power_indicator;
+
+	/*
+	 * Because writes to the PCI config space are posted writes, we need to issue
+	 * a read to the register we've just written to ensure it reached its destination.
+	 * TODO: wrap all register writes with a function taking care of that.
+	 */
+ vmd_device->pcie_cap->slot_control = slot_control;
+ vmd_device->cached_slot_control = vmd_device->pcie_cap->slot_control;
+}
+
+static unsigned int
+vmd_led_get_state(struct vmd_pci_device *vmd_device)
+{
+ const struct vmd_led_indicator_config *config;
+ union express_slot_control_register slot_control;
+ unsigned int state;
+
+ slot_control = vmd_device->cached_slot_control;
+ for (state = SPDK_VMD_LED_STATE_OFF; state <= SPDK_VMD_LED_STATE_REBUILD; ++state) {
+ config = &g_led_config[state];
+
+ if (slot_control.bit_field.attention_indicator_control == config->attention_indicator &&
+ slot_control.bit_field.power_indicator_control == config->power_indicator) {
+ return state;
+ }
+ }
+
+ return SPDK_VMD_LED_STATE_UNKNOWN;
+}
+
+/*
+ * The device being identified under VMD is located in the global list of VMD
+ * controllers. If the BDF identifies an endpoint, then the LED is attached to the
+ * endpoint's parent. If the BDF identifies a type 1 header, then this device has the
+ * corresponding LED. This may arise when a user wants to identify a given empty slot
+ * under VMD.
+ */
+static struct vmd_pci_device *
+vmd_get_led_device(const struct spdk_pci_device *pci_device)
+{
+ struct vmd_pci_device *vmd_device;
+
+ assert(strcmp(spdk_pci_device_get_type(pci_device), "vmd") == 0);
+
+ vmd_device = vmd_find_device(&pci_device->addr);
+ if (spdk_unlikely(vmd_device == NULL)) {
+ return NULL;
+ }
+
+ if (vmd_device->header_type == PCI_HEADER_TYPE_NORMAL) {
+ if (spdk_unlikely(vmd_device->parent == NULL)) {
+ return NULL;
+ }
+
+ return vmd_device->parent->self;
+ }
+
+ return vmd_device;
+}
+
+int
+spdk_vmd_set_led_state(struct spdk_pci_device *pci_device, enum spdk_vmd_led_state state)
+{
+ struct vmd_pci_device *vmd_device;
+
+ if (state < SPDK_VMD_LED_STATE_OFF || state > SPDK_VMD_LED_STATE_REBUILD) {
+ SPDK_ERRLOG("Invalid LED state\n");
+ return -EINVAL;
+ }
+
+ vmd_device = vmd_get_led_device(pci_device);
+ if (spdk_unlikely(vmd_device == NULL)) {
+ SPDK_ERRLOG("The PCI device is not behind the VMD\n");
+ return -ENODEV;
+ }
+
+ vmd_led_set_indicator_control(vmd_device, state);
+ return 0;
+}
+
+int
+spdk_vmd_get_led_state(struct spdk_pci_device *pci_device, enum spdk_vmd_led_state *state)
+{
+ struct vmd_pci_device *vmd_device;
+
+ vmd_device = vmd_get_led_device(pci_device);
+ if (spdk_unlikely(vmd_device == NULL)) {
+ SPDK_ERRLOG("The PCI device is not behind the VMD\n");
+ return -ENODEV;
+ }
+
+ *state = (enum spdk_vmd_led_state)vmd_led_get_state(vmd_device);
+ return 0;
+}
diff --git a/src/spdk/lib/vmd/spdk_vmd.map b/src/spdk/lib/vmd/spdk_vmd.map
new file mode 100644
index 000000000..036d079b5
--- /dev/null
+++ b/src/spdk/lib/vmd/spdk_vmd.map
@@ -0,0 +1,13 @@
+{
+ global:
+
+ # public functions
+ spdk_vmd_init;
+ spdk_vmd_fini;
+ spdk_vmd_pci_device_list;
+ spdk_vmd_set_led_state;
+ spdk_vmd_get_led_state;
+ spdk_vmd_hotplug_monitor;
+
+ local: *;
+};
diff --git a/src/spdk/lib/vmd/vmd.c b/src/spdk/lib/vmd/vmd.c
new file mode 100644
index 000000000..14d9558c2
--- /dev/null
+++ b/src/spdk/lib/vmd/vmd.c
@@ -0,0 +1,1376 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "vmd.h"
+
+#include "spdk/stdinc.h"
+#include "spdk/likely.h"
+
+static const char *device_type[] = {
+ "PCI Express Endpoint",
+ "Legacy PCI Express Endpoint",
+ "Reserved 1",
+ "Reserved 2",
+ "Root Port of PCI Express Root Complex",
+ "Upstream Port of PCI Express Switch",
+ "Downstream Port of PCI Express Switch",
+ "PCI Express to PCI/PCI-X Bridge",
+ "PCI/PCI-X to PCI Express Bridge",
+ "Root Complex Integrated Endpoint",
+ "Root Complex Event Collector",
+ "Reserved Capability"
+};
+
+/*
+ * Container for all VMD adapters probed in the system.
+ */
+struct vmd_container {
+ uint32_t count;
+ struct vmd_adapter vmd[MAX_VMD_SUPPORTED];
+};
+
+static struct vmd_container g_vmd_container;
+static uint8_t g_end_device_count;
+
+static bool
+vmd_is_valid_cfg_addr(struct vmd_pci_bus *bus, uint64_t addr)
+{
+ return addr >= (uint64_t)bus->vmd->cfg_vaddr &&
+ addr < bus->vmd->cfgbar_size + (uint64_t)bus->vmd->cfg_vaddr;
+}
+
+static void
+vmd_align_base_addrs(struct vmd_adapter *vmd, uint32_t alignment)
+{
+ uint32_t pad;
+
+	/*
+	 * The device is not in the hot plug path; align the base address remaining from membar 1.
+	 */
+ if (vmd->physical_addr & (alignment - 1)) {
+ pad = alignment - (vmd->physical_addr & (alignment - 1));
+ vmd->physical_addr += pad;
+ vmd->current_addr_size -= pad;
+ }
+}
+
+static bool
+vmd_device_is_enumerated(const struct vmd_pci_device *vmd_device)
+{
+ return vmd_device->header->one.prefetch_base_upper == VMD_UPPER_BASE_SIGNATURE &&
+ vmd_device->header->one.prefetch_limit_upper == VMD_UPPER_LIMIT_SIGNATURE;
+}
+
+static bool
+vmd_device_is_root_port(const struct vmd_pci_device *vmd_device)
+{
+ return vmd_device->header->common.vendor_id == 0x8086 &&
+ (vmd_device->header->common.device_id == 0x2030 ||
+ vmd_device->header->common.device_id == 0x2031 ||
+ vmd_device->header->common.device_id == 0x2032 ||
+ vmd_device->header->common.device_id == 0x2033);
+}
+
+static void
+vmd_hotplug_coalesce_regions(struct vmd_hot_plug *hp)
+{
+ struct pci_mem_mgr *region, *prev;
+
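+	/* Repeatedly merge physically adjacent free regions until no adjacent
+	 * pair remains; merged descriptors are returned to the unused queue.
+	 */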
+ do {
+ prev = NULL;
+ TAILQ_FOREACH(region, &hp->free_mem_queue, tailq) {
+ if (prev != NULL && (prev->addr + prev->size == region->addr)) {
+ break;
+ }
+
+ prev = region;
+ }
+
+ if (region != NULL) {
+ prev->size += region->size;
+ TAILQ_REMOVE(&hp->free_mem_queue, region, tailq);
+ TAILQ_INSERT_TAIL(&hp->unused_mem_queue, region, tailq);
+ }
+ } while (region != NULL);
+}
+
+static void
+vmd_hotplug_free_region(struct vmd_hot_plug *hp, struct pci_mem_mgr *region)
+{
+ struct pci_mem_mgr *current, *prev = NULL;
+
+ assert(region->addr >= hp->bar.start && region->addr < hp->bar.start + hp->bar.size);
+
+ TAILQ_FOREACH(current, &hp->free_mem_queue, tailq) {
+ if (current->addr > region->addr) {
+ break;
+ }
+
+ prev = current;
+ }
+
+ if (prev != NULL) {
+ assert(prev->addr + prev->size <= region->addr);
+ assert(current == NULL || (region->addr + region->size <= current->addr));
+ TAILQ_INSERT_AFTER(&hp->free_mem_queue, prev, region, tailq);
+ } else {
+ TAILQ_INSERT_HEAD(&hp->free_mem_queue, region, tailq);
+ }
+
+ vmd_hotplug_coalesce_regions(hp);
+}
+
+static void
+vmd_hotplug_free_addr(struct vmd_hot_plug *hp, uint64_t addr)
+{
+ struct pci_mem_mgr *region;
+
+ TAILQ_FOREACH(region, &hp->alloc_mem_queue, tailq) {
+ if (region->addr == addr) {
+ break;
+ }
+ }
+
+ assert(region != NULL);
+ TAILQ_REMOVE(&hp->alloc_mem_queue, region, tailq);
+
+ vmd_hotplug_free_region(hp, region);
+}
+
+static uint64_t
+vmd_hotplug_allocate_base_addr(struct vmd_hot_plug *hp, uint32_t size)
+{
+ struct pci_mem_mgr *region = NULL, *free_region;
+
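+	/* First-fit search of the free list; if the chosen region is larger than
+	 * requested, split off the remainder (using a descriptor from the unused
+	 * queue) and return it to the free list.
+	 */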
+ TAILQ_FOREACH(region, &hp->free_mem_queue, tailq) {
+ if (region->size >= size) {
+ break;
+ }
+ }
+
+ if (region == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "Unable to find free hotplug memory region of size:"
+ "%"PRIx32"\n", size);
+ return 0;
+ }
+
+ TAILQ_REMOVE(&hp->free_mem_queue, region, tailq);
+ if (size < region->size) {
+ free_region = TAILQ_FIRST(&hp->unused_mem_queue);
+ if (free_region == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "Unable to find unused descriptor to store the "
+ "free region of size: %"PRIu32"\n", region->size - size);
+ } else {
+ TAILQ_REMOVE(&hp->unused_mem_queue, free_region, tailq);
+ free_region->size = region->size - size;
+ free_region->addr = region->addr + size;
+ region->size = size;
+ vmd_hotplug_free_region(hp, free_region);
+ }
+ }
+
+ TAILQ_INSERT_TAIL(&hp->alloc_mem_queue, region, tailq);
+
+ return region->addr;
+}
+
+/*
+ * Allocates an address from the vmd membar for the input memory size
+ * vmd - vmd adapter object
+ * dev - vmd_pci_device to allocate a base address for.
+ * size - size of the memory window requested.
+ * Size must be a power of 2. Addresses are returned on the size boundary.
+ * Returns the physical address within the VMD membar window, or 0x0 if the window cannot be allocated.
+ * Consider increasing the size of the vmd membar if 0x0 is returned.
+ */
+static uint64_t
+vmd_allocate_base_addr(struct vmd_adapter *vmd, struct vmd_pci_device *dev, uint32_t size)
+{
+ uint64_t base_address = 0, padding = 0;
+ struct vmd_pci_bus *hp_bus;
+
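+	/* (size & (~size + 1)) isolates the lowest set bit, so any size that is
+	 * not a power of two is rejected here.
+	 */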
+ if (size && ((size & (~size + 1)) != size)) {
+ return base_address;
+ }
+
+	/*
+	 * If the device is downstream of a hot plug port, allocate the address from the
+	 * range dedicated to the hot plug slot. Search the list of allocated addresses to
+	 * determine if a free range exists that satisfies the request. If a free range
+	 * cannot be found, get a buffer from the unused chunk. A first-fit algorithm is used.
+	 */
+ if (dev) {
+ hp_bus = dev->parent;
+ if (hp_bus && hp_bus->self && hp_bus->self->hotplug_capable) {
+ return vmd_hotplug_allocate_base_addr(&hp_bus->self->hp, size);
+ }
+ }
+
+ /* Ensure physical membar allocated is size aligned */
+ if (vmd->physical_addr & (size - 1)) {
+ padding = size - (vmd->physical_addr & (size - 1));
+ }
+
+ /* Allocate from membar if enough memory is left */
+ if (vmd->current_addr_size >= size + padding) {
+ base_address = vmd->physical_addr + padding;
+ vmd->physical_addr += size + padding;
+ vmd->current_addr_size -= size + padding;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "allocated(size) %lx (%x)\n", base_address, size);
+
+ return base_address;
+}
+
+static bool
+vmd_is_end_device(struct vmd_pci_device *dev)
+{
+ return (dev && dev->header) &&
+ ((dev->header->common.header_type & ~PCI_MULTI_FUNCTION) == PCI_HEADER_TYPE_NORMAL);
+}
+
+static void
+vmd_update_base_limit_register(struct vmd_pci_device *dev, uint16_t base, uint16_t limit)
+{
+ struct vmd_pci_bus *bus;
+ struct vmd_pci_device *bridge;
+
+ if (base == 0 || limit == 0) {
+ return;
+ }
+
+ if (dev->header->common.header_type == PCI_HEADER_TYPE_BRIDGE) {
+ bus = dev->bus_object;
+ } else {
+ bus = dev->parent;
+ }
+
+ bridge = bus->self;
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "base:limit = %x:%x\n", bridge->header->one.mem_base,
+ bridge->header->one.mem_limit);
+
+ if (dev->bus->vmd->scan_completed) {
+ return;
+ }
+
+ while (bus && bus->self != NULL) {
+ bridge = bus->self;
+
+ /* This is only for 32-bit memory space, need to revisit to support 64-bit */
+ if (bridge->header->one.mem_base > base) {
+ bridge->header->one.mem_base = base;
+ base = bridge->header->one.mem_base;
+ }
+
+ if (bridge->header->one.mem_limit < limit) {
+ bridge->header->one.mem_limit = limit;
+ limit = bridge->header->one.mem_limit;
+ }
+
+ bus = bus->parent;
+ }
+}
+
+static uint64_t
+vmd_get_base_addr(struct vmd_pci_device *dev, uint32_t index, uint32_t size)
+{
+ struct vmd_pci_bus *bus = dev->parent;
+
+ if (dev->header_type == PCI_HEADER_TYPE_BRIDGE) {
+ return dev->header->zero.BAR[index] & ~0xf;
+ } else {
+ if (bus->self->hotplug_capable) {
+ return vmd_hotplug_allocate_base_addr(&bus->self->hp, size);
+ } else {
+ return (uint64_t)bus->self->header->one.mem_base << 16;
+ }
+ }
+}
+
+static bool
+vmd_assign_base_addrs(struct vmd_pci_device *dev)
+{
+ uint16_t mem_base = 0, mem_limit = 0;
+ unsigned char mem_attr = 0;
+ int last;
+ struct vmd_adapter *vmd = NULL;
+ bool ret_val = false;
+ uint32_t bar_value;
+ uint32_t table_offset;
+
+ if (dev && dev->bus) {
+ vmd = dev->bus->vmd;
+ }
+
+ if (!vmd) {
+ return 0;
+ }
+
+ vmd_align_base_addrs(vmd, ONE_MB);
+
+ last = dev->header_type ? 2 : 6;
+ for (int i = 0; i < last; i++) {
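+		/* Standard PCI BAR sizing: write all ones, read back the size mask,
+		 * then restore the original BAR value.
+		 */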
+ bar_value = dev->header->zero.BAR[i];
+ dev->header->zero.BAR[i] = ~(0U);
+ dev->bar[i].size = dev->header->zero.BAR[i];
+ dev->header->zero.BAR[i] = bar_value;
+
+ if (dev->bar[i].size == ~(0U) || dev->bar[i].size == 0 ||
+ dev->header->zero.BAR[i] & 1) {
+ dev->bar[i].size = 0;
+ continue;
+ }
+ mem_attr = dev->bar[i].size & PCI_BASE_ADDR_MASK;
+ dev->bar[i].size = TWOS_COMPLEMENT(dev->bar[i].size & PCI_BASE_ADDR_MASK);
+
+ if (vmd->scan_completed) {
+ dev->bar[i].start = vmd_get_base_addr(dev, i, dev->bar[i].size);
+ } else {
+ dev->bar[i].start = vmd_allocate_base_addr(vmd, dev, dev->bar[i].size);
+ }
+
+ dev->header->zero.BAR[i] = (uint32_t)dev->bar[i].start;
+
+ if (!dev->bar[i].start) {
+ if (mem_attr == (PCI_BAR_MEMORY_PREFETCH | PCI_BAR_MEMORY_TYPE_64)) {
+ i++;
+ }
+ continue;
+ }
+
+ dev->bar[i].vaddr = ((uint64_t)vmd->mem_vaddr + (dev->bar[i].start - vmd->membar));
+ mem_limit = BRIDGE_BASEREG(dev->header->zero.BAR[i]) +
+ BRIDGE_BASEREG(dev->bar[i].size - 1);
+ if (!mem_base) {
+ mem_base = BRIDGE_BASEREG(dev->header->zero.BAR[i]);
+ }
+
+ ret_val = true;
+
+ if (mem_attr == (PCI_BAR_MEMORY_PREFETCH | PCI_BAR_MEMORY_TYPE_64)) {
+ i++;
+ if (i < last) {
+ dev->header->zero.BAR[i] = (uint32_t)(dev->bar[i].start >> PCI_DWORD_SHIFT);
+ }
+ }
+ }
+
+	/* Enable device MEM and bus mastering; the read-back flushes the posted write. */
+	dev->header->zero.command |= (PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+	{ uint16_t cmd = dev->header->zero.command; (void)cmd; }
+
+ if (dev->msix_cap && ret_val) {
+ table_offset = ((volatile struct pci_msix_cap *)dev->msix_cap)->msix_table_offset;
+ if (dev->bar[table_offset & 0x3].vaddr) {
+ dev->msix_table = (volatile struct pci_msix_table_entry *)
+ (dev->bar[table_offset & 0x3].vaddr + (table_offset & 0xfff8));
+ }
+ }
+
+ if (ret_val && vmd_is_end_device(dev)) {
+ vmd_update_base_limit_register(dev, mem_base, mem_limit);
+ }
+
+ return ret_val;
+}
+
+static void
+vmd_get_device_capabilities(struct vmd_pci_device *dev)
+
+{
+ volatile uint8_t *config_space;
+ uint8_t capabilities_offset;
+ struct pci_capabilities_header *capabilities_hdr;
+
+ config_space = (volatile uint8_t *)dev->header;
+ if ((dev->header->common.status & PCI_CAPABILITIES_LIST) == 0) {
+ return;
+ }
+
+ capabilities_offset = dev->header->zero.cap_pointer;
+ if (dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) {
+ capabilities_offset = dev->header->one.cap_pointer;
+ }
+
+ while (capabilities_offset > 0) {
+ capabilities_hdr = (struct pci_capabilities_header *)
+ &config_space[capabilities_offset];
+ switch (capabilities_hdr->capability_id) {
+ case CAPABILITY_ID_PCI_EXPRESS:
+ dev->pcie_cap = (volatile struct pci_express_cap *)(capabilities_hdr);
+ break;
+
+ case CAPABILITY_ID_MSI:
+ dev->msi_cap = (volatile struct pci_msi_cap *)capabilities_hdr;
+ break;
+
+ case CAPABILITY_ID_MSIX:
+ dev->msix_cap = (volatile struct pci_msix_capability *)capabilities_hdr;
+ dev->msix_table_size = dev->msix_cap->message_control.bit.table_size + 1;
+ break;
+
+ default:
+ break;
+ }
+ capabilities_offset = capabilities_hdr->next;
+ }
+}
+
+static volatile struct pci_enhanced_capability_header *
+vmd_get_enhanced_capabilities(struct vmd_pci_device *dev, uint16_t capability_id)
+{
+ uint8_t *data;
+ uint16_t cap_offset = EXTENDED_CAPABILITY_OFFSET;
+ volatile struct pci_enhanced_capability_header *cap_hdr = NULL;
+
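+	/* Walk the PCIe extended capability list, which starts at the fixed
+	 * config space offset EXTENDED_CAPABILITY_OFFSET, until the requested
+	 * capability is found or the chain terminates.
+	 */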
+ data = (uint8_t *)dev->header;
+ while (cap_offset >= EXTENDED_CAPABILITY_OFFSET) {
+ cap_hdr = (volatile struct pci_enhanced_capability_header *) &data[cap_offset];
+ if (cap_hdr->capability_id == capability_id) {
+ return cap_hdr;
+ }
+ cap_offset = cap_hdr->next;
+ if (cap_offset == 0 || cap_offset < EXTENDED_CAPABILITY_OFFSET) {
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+vmd_read_config_space(struct vmd_pci_device *dev)
+{
+	/*
+	 * Writes to the PCI config space are posted writes. To ensure a transaction reaches
+	 * its destination before another write is posted, an immediate read of the written
+	 * value should be performed.
+	 */
+ dev->header->common.command |= (BUS_MASTER_ENABLE | MEMORY_SPACE_ENABLE);
+ { uint16_t cmd = dev->header->common.command; (void)cmd; }
+
+ vmd_get_device_capabilities(dev);
+ dev->sn_cap = (struct serial_number_capability *)vmd_get_enhanced_capabilities(dev,
+ DEVICE_SERIAL_NUMBER_CAP_ID);
+}
+
+static void
+vmd_update_scan_info(struct vmd_pci_device *dev)
+{
+ struct vmd_adapter *vmd_adapter = dev->bus->vmd;
+
+ if (vmd_adapter->root_port_updated) {
+ return;
+ }
+
+ if (dev->header_type == PCI_HEADER_TYPE_NORMAL) {
+ return;
+ }
+
+ if (vmd_device_is_root_port(dev)) {
+ vmd_adapter->root_port_updated = 1;
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "root_port_updated = %d\n",
+ vmd_adapter->root_port_updated);
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "upper:limit = %x : %x\n",
+ dev->header->one.prefetch_base_upper,
+ dev->header->one.prefetch_limit_upper);
+ if (vmd_device_is_enumerated(dev)) {
+ vmd_adapter->scan_completed = 1;
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "scan_completed = %d\n",
+ vmd_adapter->scan_completed);
+ }
+ }
+}
+
+static void
+vmd_reset_base_limit_registers(struct vmd_pci_device *dev)
+{
+ uint32_t reg __attribute__((unused));
+
+ assert(dev->header_type != PCI_HEADER_TYPE_NORMAL);
+ /*
+ * Writes to the pci config space are posted writes.
+ * To ensure transaction reaches its destination
+ * before another write is posted, an immediate read
+ * of the written value should be performed.
+ */
+ dev->header->one.mem_base = 0xfff0;
+ reg = dev->header->one.mem_base;
+ dev->header->one.mem_limit = 0x0;
+ reg = dev->header->one.mem_limit;
+ dev->header->one.prefetch_base = 0x0;
+ reg = dev->header->one.prefetch_base;
+ dev->header->one.prefetch_limit = 0x0;
+ reg = dev->header->one.prefetch_limit;
+ dev->header->one.prefetch_base_upper = 0x0;
+ reg = dev->header->one.prefetch_base_upper;
+ dev->header->one.prefetch_limit_upper = 0x0;
+ reg = dev->header->one.prefetch_limit_upper;
+ dev->header->one.io_base_upper = 0x0;
+ reg = dev->header->one.io_base_upper;
+ dev->header->one.io_limit_upper = 0x0;
+ reg = dev->header->one.io_limit_upper;
+ dev->header->one.primary = 0;
+ reg = dev->header->one.primary;
+ dev->header->one.secondary = 0;
+ reg = dev->header->one.secondary;
+ dev->header->one.subordinate = 0;
+ reg = dev->header->one.subordinate;
+}
+
+static void
+vmd_init_hotplug(struct vmd_pci_device *dev, struct vmd_pci_bus *bus)
+{
+ struct vmd_adapter *vmd = bus->vmd;
+ struct vmd_hot_plug *hp = &dev->hp;
+ size_t mem_id;
+
+ dev->hotplug_capable = true;
+ hp->bar.size = 1 << 20;
+
+ if (!vmd->scan_completed) {
+ hp->bar.start = vmd_allocate_base_addr(vmd, NULL, hp->bar.size);
+ bus->self->header->one.mem_base = BRIDGE_BASEREG(hp->bar.start);
+ bus->self->header->one.mem_limit =
+ bus->self->header->one.mem_base + BRIDGE_BASEREG(hp->bar.size - 1);
+ } else {
+ hp->bar.start = (uint64_t)bus->self->header->one.mem_base << 16;
+ }
+
+ hp->bar.vaddr = (uint64_t)vmd->mem_vaddr + (hp->bar.start - vmd->membar);
+
+ TAILQ_INIT(&hp->free_mem_queue);
+ TAILQ_INIT(&hp->unused_mem_queue);
+ TAILQ_INIT(&hp->alloc_mem_queue);
+
+ hp->mem[0].size = hp->bar.size;
+ hp->mem[0].addr = hp->bar.start;
+
+ TAILQ_INSERT_TAIL(&hp->free_mem_queue, &hp->mem[0], tailq);
+
+ for (mem_id = 1; mem_id < ADDR_ELEM_COUNT; ++mem_id) {
+ TAILQ_INSERT_TAIL(&hp->unused_mem_queue, &hp->mem[mem_id], tailq);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "%s: mem_base:mem_limit = %x : %x\n", __func__,
+ bus->self->header->one.mem_base, bus->self->header->one.mem_limit);
+}
+
+static bool
+vmd_bus_device_present(struct vmd_pci_bus *bus, uint32_t devfn)
+{
+ volatile struct pci_header *header;
+
+ header = (volatile struct pci_header *)(bus->vmd->cfg_vaddr +
+ CONFIG_OFFSET_ADDR(bus->bus_number, devfn, 0, 0));
+ if (!vmd_is_valid_cfg_addr(bus, (uint64_t)header)) {
+ return false;
+ }
+
+ if (header->common.vendor_id == PCI_INVALID_VENDORID || header->common.vendor_id == 0x0) {
+ return false;
+ }
+
+ return true;
+}
+
+static struct vmd_pci_device *
+vmd_alloc_dev(struct vmd_pci_bus *bus, uint32_t devfn)
+{
+ struct vmd_pci_device *dev = NULL;
+ struct pci_header volatile *header;
+ uint8_t header_type;
+ uint32_t rev_class;
+
+ /* Make sure we're not creating two devices on the same dev/fn */
+ TAILQ_FOREACH(dev, &bus->dev_list, tailq) {
+ if (dev->devfn == devfn) {
+ return NULL;
+ }
+ }
+
+ if (!vmd_bus_device_present(bus, devfn)) {
+ return NULL;
+ }
+
+ header = (struct pci_header * volatile)(bus->vmd->cfg_vaddr +
+ CONFIG_OFFSET_ADDR(bus->bus_number, devfn, 0, 0));
+
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "PCI device found: %04x:%04x ***\n",
+ header->common.vendor_id, header->common.device_id);
+
+ dev = calloc(1, sizeof(*dev));
+ if (!dev) {
+ return NULL;
+ }
+
+ dev->header = header;
+ dev->vid = dev->header->common.vendor_id;
+ dev->did = dev->header->common.device_id;
+ dev->bus = bus;
+ dev->parent = bus;
+ dev->devfn = devfn;
+ header_type = dev->header->common.header_type;
+ rev_class = dev->header->common.rev_class;
+ dev->class = rev_class >> 8;
+ dev->header_type = header_type & 0x7;
+
+ if (header_type == PCI_HEADER_TYPE_BRIDGE) {
+ vmd_update_scan_info(dev);
+ if (!dev->bus->vmd->scan_completed) {
+ vmd_reset_base_limit_registers(dev);
+ }
+ }
+
+ vmd_read_config_space(dev);
+
+ return dev;
+}
+
+static struct vmd_pci_bus *
+vmd_create_new_bus(struct vmd_pci_bus *parent, struct vmd_pci_device *bridge, uint8_t bus_number)
+{
+ struct vmd_pci_bus *new_bus;
+
+ new_bus = calloc(1, sizeof(*new_bus));
+ if (!new_bus) {
+ return NULL;
+ }
+
+ new_bus->parent = parent;
+ new_bus->domain = parent->domain;
+ new_bus->bus_number = bus_number;
+ new_bus->secondary_bus = new_bus->subordinate_bus = bus_number;
+ new_bus->self = bridge;
+ new_bus->vmd = parent->vmd;
+ TAILQ_INIT(&new_bus->dev_list);
+
+ bridge->subordinate = new_bus;
+
+ bridge->pci.addr.bus = new_bus->bus_number;
+ bridge->pci.addr.dev = bridge->devfn;
+ bridge->pci.addr.func = 0;
+ bridge->pci.addr.domain = parent->vmd->pci->addr.domain;
+
+ return new_bus;
+}
+
+/*
+ * Assigns a bus number from the list of available
+ * bus numbers. If the device is downstream of a hot plug port,
+ * assign the bus number from those assigned to the HP port. Otherwise,
+ * assign the next bus number from the vmd bus number list.
+ */
+static uint8_t
+vmd_get_next_bus_number(struct vmd_pci_device *dev, struct vmd_adapter *vmd)
+{
+ uint8_t bus = 0xff;
+ struct vmd_pci_bus *hp_bus;
+
+ if (dev) {
+ hp_bus = vmd_is_dev_in_hotplug_path(dev);
+ if (hp_bus && hp_bus->self && hp_bus->self->hotplug_capable) {
+ return vmd_hp_get_next_bus_number(&hp_bus->self->hp);
+ }
+ }
+
+ /* Device is not under a hot plug path. Return next global bus number */
+ if ((vmd->next_bus_number + 1) < vmd->max_pci_bus) {
+ bus = vmd->next_bus_number;
+ vmd->next_bus_number++;
+ }
+ return bus;
+}
+
+static uint8_t
+vmd_get_hotplug_bus_numbers(struct vmd_pci_device *dev)
+{
+ uint8_t bus_number = 0xff;
+
+ if (dev && dev->bus && dev->bus->vmd &&
+ ((dev->bus->vmd->next_bus_number + RESERVED_HOTPLUG_BUSES) < dev->bus->vmd->max_pci_bus)) {
+ bus_number = RESERVED_HOTPLUG_BUSES;
+ dev->bus->vmd->next_bus_number += RESERVED_HOTPLUG_BUSES;
+ }
+
+ return bus_number;
+}
+
+static void
+vmd_enable_msix(struct vmd_pci_device *dev)
+{
+ volatile uint16_t control;
+
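+	/* In the MSI-X message control register, bit 14 is the function mask and
+	 * bit 15 is the MSI-X enable bit: mask the function, enable MSI-X, then
+	 * clear the mask, reading the register back after each write.
+	 */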
+ control = dev->msix_cap->message_control.as_uint16_t | (1 << 14);
+ dev->msix_cap->message_control.as_uint16_t = control;
+ control = dev->msix_cap->message_control.as_uint16_t;
+ dev->msix_cap->message_control.as_uint16_t = (control | (1 << 15));
+ control = dev->msix_cap->message_control.as_uint16_t;
+ control = control & ~(1 << 14);
+ dev->msix_cap->message_control.as_uint16_t = control;
+ control = dev->msix_cap->message_control.as_uint16_t;
+}
+
+static void
+vmd_disable_msix(struct vmd_pci_device *dev)
+{
+ volatile uint16_t control;
+
+ control = dev->msix_cap->message_control.as_uint16_t | (1 << 14);
+ dev->msix_cap->message_control.as_uint16_t = control;
+ control = dev->msix_cap->message_control.as_uint16_t & ~(1 << 15);
+ dev->msix_cap->message_control.as_uint16_t = control;
+ control = dev->msix_cap->message_control.as_uint16_t;
+}
+
+/*
+ * Set up MSI-X table entries for the port. VMD MSI-X vector 0 is used for the
+ * port interrupt, so vector 0 is mapped to all MSI-X entries for the port.
+ */
+static void
+vmd_setup_msix(struct vmd_pci_device *dev, volatile struct pci_msix_table_entry *vmdEntry)
+{
+ int entry;
+
+ if (!dev || !vmdEntry || !dev->msix_cap) {
+ return;
+ }
+
+ vmd_disable_msix(dev);
+ if (dev->msix_table == NULL || dev->msix_table_size > MAX_MSIX_TABLE_SIZE) {
+ return;
+ }
+
+ for (entry = 0; entry < dev->msix_table_size; ++entry) {
+ dev->msix_table[entry].vector_control = 1;
+ }
+ vmd_enable_msix(dev);
+}
+
+static void
+vmd_bus_update_bridge_info(struct vmd_pci_device *bridge)
+{
+ /* Update the subordinate bus of all bridges above this bridge */
+ volatile struct vmd_pci_device *dev = bridge;
+ uint8_t subordinate_bus;
+
+ if (!dev) {
+ return;
+ }
+ subordinate_bus = bridge->header->one.subordinate;
+ while (dev->parent_bridge != NULL) {
+ dev = dev->parent_bridge;
+ if (dev->header->one.subordinate < subordinate_bus) {
+ dev->header->one.subordinate = subordinate_bus;
+ subordinate_bus = dev->header->one.subordinate;
+ }
+ }
+}
+
+static bool
+vmd_is_supported_device(struct vmd_pci_device *dev)
+{
+ return dev->class == PCI_CLASS_STORAGE_EXPRESS;
+}
+
+static int
+vmd_dev_map_bar(struct spdk_pci_device *pci_dev, uint32_t bar,
+ void **mapped_addr, uint64_t *phys_addr, uint64_t *size)
+{
+ struct vmd_pci_device *dev = SPDK_CONTAINEROF(pci_dev, struct vmd_pci_device, pci);
+
+ *size = dev->bar[bar].size;
+ *phys_addr = dev->bar[bar].start;
+ *mapped_addr = (void *)dev->bar[bar].vaddr;
+
+ return 0;
+}
+
+static int
+vmd_dev_unmap_bar(struct spdk_pci_device *_dev, uint32_t bar, void *addr)
+{
+ return 0;
+}
+
+static int
+vmd_dev_cfg_read(struct spdk_pci_device *_dev, void *value, uint32_t len,
+ uint32_t offset)
+{
+ struct vmd_pci_device *dev = SPDK_CONTAINEROF(_dev, struct vmd_pci_device, pci);
+ volatile uint8_t *src = (volatile uint8_t *)dev->header;
+ uint8_t *dst = value;
+ size_t i;
+
+ if (len + offset > PCI_MAX_CFG_SIZE) {
+ return -1;
+ }
+
+ for (i = 0; i < len; ++i) {
+ dst[i] = src[offset + i];
+ }
+
+ return 0;
+}
+
+static int
+vmd_dev_cfg_write(struct spdk_pci_device *_dev, void *value,
+ uint32_t len, uint32_t offset)
+{
+ struct vmd_pci_device *dev = SPDK_CONTAINEROF(_dev, struct vmd_pci_device, pci);
+ volatile uint8_t *dst = (volatile uint8_t *)dev->header;
+ uint8_t *src = value;
+ size_t i;
+
+ if ((len + offset) > PCI_MAX_CFG_SIZE) {
+ return -1;
+ }
+
+ for (i = 0; i < len; ++i) {
+ dst[offset + i] = src[i];
+ }
+
+ return 0;
+}
+
+static void
+vmd_dev_detach(struct spdk_pci_device *dev)
+{
+ struct vmd_pci_device *vmd_device = (struct vmd_pci_device *)dev;
+ struct vmd_pci_device *bus_device = vmd_device->bus->self;
+ struct vmd_pci_bus *bus = vmd_device->bus;
+ size_t i, num_bars = vmd_device->header_type ? 2 : 6;
+
+ spdk_pci_unhook_device(dev);
+ TAILQ_REMOVE(&bus->dev_list, vmd_device, tailq);
+
+ /* Release the hotplug region if the device is under hotplug-capable bus */
+ if (bus_device && bus_device->hotplug_capable) {
+ for (i = 0; i < num_bars; ++i) {
+ if (vmd_device->bar[i].start != 0) {
+ vmd_hotplug_free_addr(&bus_device->hp, vmd_device->bar[i].start);
+ }
+ }
+ }
+
+ free(dev);
+}
+
+static void
+vmd_dev_init(struct vmd_pci_device *dev)
+{
+ uint8_t bdf[32];
+
+ dev->pci.addr.domain = dev->bus->vmd->domain;
+ dev->pci.addr.bus = dev->bus->bus_number;
+ dev->pci.addr.dev = dev->devfn;
+ dev->pci.addr.func = 0;
+ dev->pci.id.vendor_id = dev->header->common.vendor_id;
+ dev->pci.id.device_id = dev->header->common.device_id;
+ dev->pci.type = "vmd";
+ dev->pci.map_bar = vmd_dev_map_bar;
+ dev->pci.unmap_bar = vmd_dev_unmap_bar;
+ dev->pci.cfg_read = vmd_dev_cfg_read;
+ dev->pci.cfg_write = vmd_dev_cfg_write;
+ dev->hotplug_capable = false;
+ if (dev->pcie_cap != NULL) {
+ dev->cached_slot_control = dev->pcie_cap->slot_control;
+ }
+
+ if (vmd_is_supported_device(dev)) {
+ spdk_pci_addr_fmt(bdf, sizeof(bdf), &dev->pci.addr);
+		SPDK_DEBUGLOG(SPDK_LOG_VMD, "Initializing NVMe device at %s\n", bdf);
+ dev->pci.parent = dev->bus->vmd->pci;
+ spdk_pci_hook_device(spdk_pci_nvme_get_driver(), &dev->pci);
+ }
+}
+
+/*
+ * Scans a single bus for all attached devices and returns a count of how many
+ * devices were found. In the VMD topology, it is assumed there are no multi-
+ * function devices, so a bus (bridge) will not have multiple functions with both
+ * type 0 and type 1 headers.
+ *
+ * The other option for implementing this function is to make the bus an int and
+ * create a new device type, PciBridge. PciBridge would inherit from PciDevice with
+ * extra fields: sub/pri/sec bus. The input becomes PciPort, bus number and parent_bridge.
+ *
+ * The bus number is scanned and, if a device is found, based on the header_type either
+ * a PciBridge (type 1) or a PciDevice (type 0) is created.
+ *
+ * For a PciBridge, bus numbers are assigned and the new bus is rescanned. The current
+ * PciBridge being scanned becomes the passed-in parent_bridge with the new bus number.
+ *
+ * The linked list becomes a list of PciBridges with PciDevices attached.
+ *
+ * Returns the count of devices found (type 1 + type 0 header devices).
+ */
+static uint8_t
+vmd_scan_single_bus(struct vmd_pci_bus *bus, struct vmd_pci_device *parent_bridge)
+{
+ /* assuming only single function devices are on the bus */
+ struct vmd_pci_device *new_dev;
+ struct vmd_adapter *vmd;
+ union express_slot_capabilities_register slot_cap;
+ struct vmd_pci_bus *new_bus;
+ uint8_t device_number, dev_cnt = 0;
+ uint8_t new_bus_num;
+
+ for (device_number = 0; device_number < 32; device_number++) {
+ new_dev = vmd_alloc_dev(bus, device_number);
+ if (new_dev == NULL) {
+ continue;
+ }
+
+ dev_cnt++;
+ if (new_dev->header->common.header_type & PCI_HEADER_TYPE_BRIDGE) {
+ slot_cap.as_uint32_t = 0;
+ if (new_dev->pcie_cap != NULL) {
+ slot_cap.as_uint32_t = new_dev->pcie_cap->slot_cap.as_uint32_t;
+ }
+
+ new_bus_num = vmd_get_next_bus_number(bus->vmd->is_hotplug_scan ? new_dev : NULL, bus->vmd);
+ if (new_bus_num == 0xff) {
+ free(new_dev);
+ return dev_cnt;
+ }
+ new_bus = vmd_create_new_bus(bus, new_dev, new_bus_num);
+ if (!new_bus) {
+ free(new_dev);
+ return dev_cnt;
+ }
+ new_bus->primary_bus = bus->secondary_bus;
+ new_bus->self = new_dev;
+ new_dev->bus_object = new_bus;
+
+ if (slot_cap.bit_field.hotplug_capable && new_dev->pcie_cap != NULL &&
+ new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented) {
+ new_bus->hotplug_buses = vmd_get_hotplug_bus_numbers(new_dev);
+ new_bus->subordinate_bus += new_bus->hotplug_buses;
+
+ /* Attach hot plug instance if HP is supported */
+ /* Hot inserted SSDs can be assigned port bus of sub-ordinate + 1 */
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "hotplug_capable/slot_implemented = "
+ "%x:%x\n", slot_cap.bit_field.hotplug_capable,
+ new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented);
+ }
+
+ new_dev->parent_bridge = parent_bridge;
+ new_dev->header->one.primary = new_bus->primary_bus;
+ new_dev->header->one.secondary = new_bus->secondary_bus;
+ new_dev->header->one.subordinate = new_bus->subordinate_bus;
+
+ vmd_bus_update_bridge_info(new_dev);
+ TAILQ_INSERT_TAIL(&bus->vmd->bus_list, new_bus, tailq);
+
+ vmd_dev_init(new_dev);
+
+ if (slot_cap.bit_field.hotplug_capable && new_dev->pcie_cap != NULL &&
+ new_dev->pcie_cap->express_cap_register.bit_field.slot_implemented) {
+ vmd_init_hotplug(new_dev, new_bus);
+ }
+
+ dev_cnt += vmd_scan_single_bus(new_bus, new_dev);
+ if (new_dev->pcie_cap != NULL) {
+ if (new_dev->pcie_cap->express_cap_register.bit_field.device_type == SwitchUpstreamPort) {
+ return dev_cnt;
+ }
+ }
+ } else {
+ /* Attach the device to the current bus and assign base addresses */
+ TAILQ_INSERT_TAIL(&bus->dev_list, new_dev, tailq);
+ g_end_device_count++;
+ if (vmd_assign_base_addrs(new_dev)) {
+ vmd_setup_msix(new_dev, &bus->vmd->msix_table[0]);
+ vmd_dev_init(new_dev);
+ if (vmd_is_supported_device(new_dev)) {
+ vmd = bus->vmd;
+ vmd->target[vmd->nvme_count] = new_dev;
+ vmd->nvme_count++;
+ }
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "Removing failed device:%p\n", new_dev);
+ TAILQ_REMOVE(&bus->dev_list, new_dev, tailq);
+ free(new_dev);
+ if (dev_cnt) {
+ dev_cnt--;
+ }
+ }
+ }
+ }
+
+ return dev_cnt;
+}
+
+static void
+vmd_print_pci_info(struct vmd_pci_device *dev)
+{
+ if (!dev) {
+ return;
+ }
+
+ if (dev->pcie_cap != NULL) {
+ SPDK_INFOLOG(SPDK_LOG_VMD, "PCI DEVICE: [%04X:%04X] type(%x) : %s\n",
+ dev->header->common.vendor_id, dev->header->common.device_id,
+ dev->pcie_cap->express_cap_register.bit_field.device_type,
+ device_type[dev->pcie_cap->express_cap_register.bit_field.device_type]);
+ } else {
+ SPDK_INFOLOG(SPDK_LOG_VMD, "PCI DEVICE: [%04X:%04X]\n",
+ dev->header->common.vendor_id, dev->header->common.device_id);
+ }
+
+ SPDK_INFOLOG(SPDK_LOG_VMD, "\tDOMAIN:BDF: %04x:%02x:%02x:%x\n", dev->pci.addr.domain,
+ dev->pci.addr.bus, dev->pci.addr.dev, dev->pci.addr.func);
+
+ if (!(dev->header_type & PCI_HEADER_TYPE_BRIDGE) && dev->bus) {
+ SPDK_INFOLOG(SPDK_LOG_VMD, "\tbase addr: %x : %p\n",
+ dev->header->zero.BAR[0], (void *)dev->bar[0].vaddr);
+ }
+
+ if ((dev->header_type & PCI_HEADER_TYPE_BRIDGE)) {
+ SPDK_INFOLOG(SPDK_LOG_VMD, "\tPrimary = %d, Secondary = %d, Subordinate = %d\n",
+ dev->header->one.primary, dev->header->one.secondary, dev->header->one.subordinate);
+ if (dev->pcie_cap && dev->pcie_cap->express_cap_register.bit_field.slot_implemented) {
+ SPDK_INFOLOG(SPDK_LOG_VMD, "\tSlot implemented on this device.\n");
+ if (dev->pcie_cap->slot_cap.bit_field.hotplug_capable) {
+ SPDK_INFOLOG(SPDK_LOG_VMD, "Device has HOT-PLUG capable slot.\n");
+ }
+ }
+ }
+
+ if (dev->sn_cap != NULL) {
+ uint8_t *snLow = (uint8_t *)&dev->sn_cap->sn_low;
+ uint8_t *snHi = (uint8_t *)&dev->sn_cap->sn_hi;
+
+ SPDK_INFOLOG(SPDK_LOG_VMD, "\tSN: %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x\n",
+ snHi[3], snHi[2], snHi[1], snHi[0], snLow[3], snLow[2], snLow[1], snLow[0]);
+ }
+}
+
+static void
+vmd_cache_scan_info(struct vmd_pci_device *dev)
+{
+ uint32_t reg __attribute__((unused));
+
+ if (dev->header_type == PCI_HEADER_TYPE_NORMAL) {
+ return;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "vendor/device id:%x:%x\n", dev->header->common.vendor_id,
+ dev->header->common.device_id);
+
+ if (vmd_device_is_root_port(dev)) {
+ dev->header->one.prefetch_base_upper = VMD_UPPER_BASE_SIGNATURE;
+ reg = dev->header->one.prefetch_base_upper;
+ dev->header->one.prefetch_limit_upper = VMD_UPPER_LIMIT_SIGNATURE;
+ reg = dev->header->one.prefetch_limit_upper;
+
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "prefetch: %x:%x\n",
+ dev->header->one.prefetch_base_upper,
+ dev->header->one.prefetch_limit_upper);
+ }
+}
+
+static uint8_t
+vmd_scan_pcibus(struct vmd_pci_bus *bus)
+{
+ struct vmd_pci_bus *bus_entry;
+ struct vmd_pci_device *dev;
+ uint8_t dev_cnt;
+
+ g_end_device_count = 0;
+ TAILQ_INSERT_TAIL(&bus->vmd->bus_list, bus, tailq);
+ bus->vmd->next_bus_number = bus->bus_number + 1;
+ dev_cnt = vmd_scan_single_bus(bus, NULL);
+
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "VMD scan found %u devices\n", dev_cnt);
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "VMD scan found %u END DEVICES\n", g_end_device_count);
+
+ SPDK_INFOLOG(SPDK_LOG_VMD, "PCIe devices attached to VMD %04x:%02x:%02x:%x...\n",
+ bus->vmd->pci->addr.domain, bus->vmd->pci->addr.bus,
+ bus->vmd->pci->addr.dev, bus->vmd->pci->addr.func);
+
+ TAILQ_FOREACH(bus_entry, &bus->vmd->bus_list, tailq) {
+ if (bus_entry->self != NULL) {
+ vmd_print_pci_info(bus_entry->self);
+ vmd_cache_scan_info(bus_entry->self);
+ }
+
+ TAILQ_FOREACH(dev, &bus_entry->dev_list, tailq) {
+ vmd_print_pci_info(dev);
+ }
+ }
+
+ return dev_cnt;
+}
+
+static int
+vmd_map_bars(struct vmd_adapter *vmd, struct spdk_pci_device *dev)
+{
+ int rc;
+
+ rc = spdk_pci_device_map_bar(dev, 0, (void **)&vmd->cfg_vaddr,
+ &vmd->cfgbar, &vmd->cfgbar_size);
+ if (rc == 0) {
+ rc = spdk_pci_device_map_bar(dev, 2, (void **)&vmd->mem_vaddr,
+ &vmd->membar, &vmd->membar_size);
+ }
+
+ if (rc == 0) {
+ rc = spdk_pci_device_map_bar(dev, 4, (void **)&vmd->msix_vaddr,
+ &vmd->msixbar, &vmd->msixbar_size);
+ }
+
+ if (rc == 0) {
+ vmd->physical_addr = vmd->membar;
+ vmd->current_addr_size = vmd->membar_size;
+ }
+ return rc;
+}
+
+static int
+vmd_enumerate_devices(struct vmd_adapter *vmd)
+{
+ vmd->vmd_bus.vmd = vmd;
+ vmd->vmd_bus.secondary_bus = vmd->vmd_bus.subordinate_bus = 0;
+ vmd->vmd_bus.primary_bus = vmd->vmd_bus.bus_number = 0;
+ vmd->vmd_bus.domain = vmd->pci->addr.domain;
+
+ return vmd_scan_pcibus(&vmd->vmd_bus);
+}
+
+struct vmd_pci_device *
+vmd_find_device(const struct spdk_pci_addr *addr)
+{
+ struct vmd_pci_bus *bus;
+ struct vmd_pci_device *dev;
+ int i;
+
+ for (i = 0; i < MAX_VMD_TARGET; ++i) {
+ TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) {
+ if (bus->self) {
+ if (spdk_pci_addr_compare(&bus->self->pci.addr, addr) == 0) {
+ return bus->self;
+ }
+ }
+
+ TAILQ_FOREACH(dev, &bus->dev_list, tailq) {
+ if (spdk_pci_addr_compare(&dev->pci.addr, addr) == 0) {
+ return dev;
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static int
+vmd_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
+{
+ uint32_t cmd_reg = 0;
+ char bdf[32] = {0};
+ struct vmd_container *vmd_c = ctx;
+ size_t i;
+
+ spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
+ cmd_reg |= 0x6; /* PCI bus master/memory enable. */
+ spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);
+
+ spdk_pci_addr_fmt(bdf, sizeof(bdf), &pci_dev->addr);
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "Found a VMD[ %d ] at %s\n", vmd_c->count, bdf);
+
+ /* map vmd bars */
+ i = vmd_c->count;
+ vmd_c->vmd[i].pci = pci_dev;
+ vmd_c->vmd[i].vmd_index = i;
+ vmd_c->vmd[i].domain =
+ (pci_dev->addr.bus << 16) | (pci_dev->addr.dev << 8) | pci_dev->addr.func;
+ vmd_c->vmd[i].max_pci_bus = PCI_MAX_BUS_NUMBER;
+ TAILQ_INIT(&vmd_c->vmd[i].bus_list);
+
+ if (vmd_map_bars(&vmd_c->vmd[i], pci_dev) == -1) {
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd config bar(%p) vaddr(%p) size(%x)\n",
+ (void *)vmd_c->vmd[i].cfgbar, (void *)vmd_c->vmd[i].cfg_vaddr,
+ (uint32_t)vmd_c->vmd[i].cfgbar_size);
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd mem bar(%p) vaddr(%p) size(%x)\n",
+ (void *)vmd_c->vmd[i].membar, (void *)vmd_c->vmd[i].mem_vaddr,
+ (uint32_t)vmd_c->vmd[i].membar_size);
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "vmd msix bar(%p) vaddr(%p) size(%x)\n\n",
+ (void *)vmd_c->vmd[i].msixbar, (void *)vmd_c->vmd[i].msix_vaddr,
+ (uint32_t)vmd_c->vmd[i].msixbar_size);
+
+ vmd_c->count = i + 1;
+
+ vmd_enumerate_devices(&vmd_c->vmd[i]);
+
+ return 0;
+}
+
+int
+spdk_vmd_pci_device_list(struct spdk_pci_addr vmd_addr, struct spdk_pci_device *nvme_list)
+{
+ int cnt = 0;
+ struct vmd_pci_bus *bus;
+ struct vmd_pci_device *dev;
+
+ if (!nvme_list) {
+ return -1;
+ }
+
+ for (int i = 0; i < MAX_VMD_TARGET; ++i) {
+ if (spdk_pci_addr_compare(&vmd_addr, &g_vmd_container.vmd[i].pci->addr) == 0) {
+ TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) {
+ TAILQ_FOREACH(dev, &bus->dev_list, tailq) {
+ nvme_list[cnt++] = dev->pci;
+ if (!dev->is_hooked) {
+ vmd_dev_init(dev);
+ dev->is_hooked = 1;
+ }
+ }
+ }
+ }
+ }
+
+ return cnt;
+}
+
+static void
+vmd_clear_hotplug_status(struct vmd_pci_bus *bus)
+{
+ struct vmd_pci_device *device = bus->self;
+ uint16_t status __attribute__((unused));
+
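+	/* The latched event bits in the slot and link status registers are
+	 * write-1-to-clear, so writing back the value just read clears any
+	 * pending events; the trailing read flushes the posted write.
+	 */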
+ status = device->pcie_cap->slot_status.as_uint16_t;
+ device->pcie_cap->slot_status.as_uint16_t = status;
+ status = device->pcie_cap->slot_status.as_uint16_t;
+
+ status = device->pcie_cap->link_status.as_uint16_t;
+ device->pcie_cap->link_status.as_uint16_t = status;
+ status = device->pcie_cap->link_status.as_uint16_t;
+}
+
+static void
+vmd_bus_handle_hotplug(struct vmd_pci_bus *bus)
+{
+ uint8_t num_devices, sleep_count;
+
+ for (sleep_count = 0; sleep_count < 20; ++sleep_count) {
+ /* Scan until a new device is found */
+ num_devices = vmd_scan_single_bus(bus, bus->self);
+ if (num_devices > 0) {
+ break;
+ }
+
+ spdk_delay_us(200000);
+ }
+
+ if (num_devices == 0) {
+ SPDK_ERRLOG("Timed out while scanning for hotplugged devices\n");
+ }
+}
+
+static void
+vmd_bus_handle_hotremove(struct vmd_pci_bus *bus)
+{
+ struct vmd_pci_device *device, *tmpdev;
+
+ TAILQ_FOREACH_SAFE(device, &bus->dev_list, tailq, tmpdev) {
+ if (!vmd_bus_device_present(bus, device->devfn)) {
+ device->pci.internal.pending_removal = true;
+
+ /* If the device isn't attached, remove it immediately */
+ if (!device->pci.internal.attached) {
+ vmd_dev_detach(&device->pci);
+ }
+ }
+ }
+}
+
+int
+spdk_vmd_hotplug_monitor(void)
+{
+ struct vmd_pci_bus *bus;
+ struct vmd_pci_device *device;
+ int num_hotplugs = 0;
+ uint32_t i;
+
+ for (i = 0; i < g_vmd_container.count; ++i) {
+ TAILQ_FOREACH(bus, &g_vmd_container.vmd[i].bus_list, tailq) {
+ device = bus->self;
+ if (device == NULL || !device->hotplug_capable) {
+ continue;
+ }
+
+ if (device->pcie_cap->slot_status.bit_field.datalink_state_changed != 1) {
+ continue;
+ }
+
+ if (device->pcie_cap->link_status.bit_field.datalink_layer_active == 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "Device hotplug detected on bus "
+ "%"PRIu32"\n", bus->bus_number);
+ vmd_bus_handle_hotplug(bus);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_VMD, "Device hotremove detected on bus "
+ "%"PRIu32"\n", bus->bus_number);
+ vmd_bus_handle_hotremove(bus);
+ }
+
+ vmd_clear_hotplug_status(bus);
+ num_hotplugs++;
+ }
+ }
+
+ return num_hotplugs;
+}
+
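+/*
+ * Typical usage sketch (an assumption about the caller, not part of this file):
+ * after the SPDK environment is initialized, call spdk_vmd_init() once to
+ * enumerate the VMD controllers and hook the NVMe devices behind them,
+ * periodically poll spdk_vmd_hotplug_monitor() to process hot-insert/hot-remove
+ * events, and call spdk_vmd_fini() on shutdown to detach the VMD controllers.
+ */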
+int
+spdk_vmd_init(void)
+{
+ return spdk_pci_enumerate(spdk_pci_vmd_get_driver(), vmd_enum_cb, &g_vmd_container);
+}
+
+void
+spdk_vmd_fini(void)
+{
+ uint32_t i;
+
+ for (i = 0; i < g_vmd_container.count; ++i) {
+ spdk_pci_device_detach(g_vmd_container.vmd[i].pci);
+ }
+}
+
+SPDK_LOG_REGISTER_COMPONENT("vmd", SPDK_LOG_VMD)
diff --git a/src/spdk/lib/vmd/vmd.h b/src/spdk/lib/vmd/vmd.h
new file mode 100644
index 000000000..46490a6f7
--- /dev/null
+++ b/src/spdk/lib/vmd/vmd.h
@@ -0,0 +1,201 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VMD_H
+#define VMD_H
+
+#include "spdk/stdinc.h"
+#include "spdk/vmd.h"
+#include "spdk/env.h"
+#include "spdk/util.h"
+#include "spdk_internal/log.h"
+#include "vmd_spec.h"
+
+struct vmd_hot_plug;
+struct vmd_adapter;
+struct vmd_pci_device;
+
+struct pci_bars {
+ uint64_t vaddr;
+ uint64_t start;
+ uint32_t size;
+};
+
+struct vmd_pci_bus {
+ struct vmd_adapter *vmd;
+	struct vmd_pci_bus *parent;	/* parent bus that this bus is attached to (primary bus) */
+	struct vmd_pci_device *self;	/* PCI device that describes this bus (BAR, bus numbers, etc.) */
+
+ uint32_t domain : 8;
+ uint32_t hotplug_buses : 10;
+ uint32_t is_added : 1;
+ uint32_t hp_event_queued : 1;
+ uint32_t rsv : 12;
+
+ uint32_t bus_number : 8;
+ uint32_t primary_bus : 8;
+ uint32_t secondary_bus : 8;
+ uint32_t subordinate_bus : 8;
+
+ TAILQ_HEAD(, vmd_pci_device) dev_list; /* list of pci end device attached to this bus */
+ TAILQ_ENTRY(vmd_pci_bus) tailq; /* link for all buses found during scan */
+};
+
+/*
+ * memory element for base address assignment and reuse
+ */
+struct pci_mem_mgr {
+ uint32_t size : 30; /* size of memory element */
+ uint32_t in_use : 1;
+ uint32_t rsv : 1;
+ uint64_t addr;
+ TAILQ_ENTRY(pci_mem_mgr) tailq;
+};
+
+struct vmd_hot_plug {
+ uint32_t count : 12;
+ uint32_t reserved_bus_count : 4;
+ uint32_t max_hotplug_bus_number : 8;
+ uint32_t next_bus_number : 8;
+ struct pci_bars bar;
+ union express_slot_status_register slot_status;
+ struct pci_mem_mgr mem[ADDR_ELEM_COUNT];
+ uint8_t bus_numbers[RESERVED_HOTPLUG_BUSES];
+ struct vmd_pci_bus *bus;
+ TAILQ_HEAD(, pci_mem_mgr) free_mem_queue;
+ TAILQ_HEAD(, pci_mem_mgr) alloc_mem_queue;
+ TAILQ_HEAD(, pci_mem_mgr) unused_mem_queue;
+};
+
+struct vmd_pci_device {
+ struct spdk_pci_device pci;
+ struct pci_bars bar[6];
+
+ struct vmd_pci_device *parent_bridge;
+ struct vmd_pci_bus *bus, *parent;
+	struct vmd_pci_bus *bus_object;	/* PCI bus associated with this device when it is a type 1 (bridge) device */
+ struct vmd_pci_bus *subordinate;
+ volatile struct pci_header *header;
+ volatile struct pci_express_cap *pcie_cap;
+ volatile struct pci_msix_capability *msix_cap;
+ volatile struct pci_msi_cap *msi_cap;
+ volatile struct serial_number_capability *sn_cap;
+ volatile struct pci_msix_table_entry *msix_table;
+
+ TAILQ_ENTRY(vmd_pci_device) tailq;
+
+ uint32_t class;
+ uint16_t vid;
+ uint16_t did;
+ uint16_t pcie_flags, msix_table_size;
+ uint32_t devfn;
+ bool hotplug_capable;
+
+ uint32_t header_type : 1;
+ uint32_t multifunction : 1;
+ uint32_t hotplug_bridge : 1;
+ uint32_t is_added : 1;
+ uint32_t is_hooked : 1;
+ uint32_t rsv1 : 12;
+ uint32_t target : 16;
+
+ struct vmd_hot_plug hp;
+ /* Cached version of the slot_control register */
+ union express_slot_control_register cached_slot_control;
+};
+
+/*
+ * The VMD adapter
+ */
+struct vmd_adapter {
+ struct spdk_pci_device *pci;
+ uint32_t domain;
+ /* physical and virtual VMD bars */
+ uint64_t cfgbar, cfgbar_size;
+ uint64_t membar, membar_size;
+ uint64_t msixbar, msixbar_size;
+ volatile uint8_t *cfg_vaddr;
+ volatile uint8_t *mem_vaddr;
+ volatile uint8_t *msix_vaddr;
+ volatile struct pci_msix_table_entry *msix_table;
+ uint32_t bar_sizes[6];
+
+ uint64_t physical_addr;
+ uint32_t current_addr_size;
+
+ uint32_t next_bus_number : 10;
+ uint32_t max_pci_bus : 10;
+ uint32_t is_hotplug_scan : 1;
+ uint32_t is_ready : 1;
+ uint32_t processing_hp : 1;
+ uint32_t max_payload_size: 3;
+ uint32_t root_port_updated : 1;
+ uint32_t scan_completed : 1;
+ uint32_t rsv : 4;
+
+	/* end devices attached to this VMD adapter */
+ struct vmd_pci_device *target[MAX_VMD_TARGET];
+ uint32_t dev_count : 16;
+ uint32_t nvme_count : 8;
+ uint32_t vmd_index : 8;
+
+ struct vmd_pci_bus vmd_bus;
+
+ TAILQ_HEAD(, vmd_pci_bus) bus_list;
+
+ struct event_fifo *hp_queue;
+};
+
+/* TODO: Temporary stubs for Hot Plug interface */
+static inline struct vmd_pci_bus *
+vmd_is_dev_in_hotplug_path(struct vmd_pci_device *dev)
+{
+ return NULL;
+}
+
+static inline void
+vmd_hp_enable_hotplug(struct vmd_hot_plug *hp)
+{
+
+}
+
+static inline uint8_t
+vmd_hp_get_next_bus_number(struct vmd_hot_plug *hp)
+{
+ assert(false);
+ return 0;
+}
+
+struct vmd_pci_device *vmd_find_device(const struct spdk_pci_addr *addr);
+
+#endif /* VMD_H */
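The structures above describe the adapter as a tree: each vmd_adapter keeps its buses on bus_list and each vmd_pci_bus keeps its endpoint devices on dev_list, both as TAILQs. As a rough illustration of how that tree can be walked, the hypothetical helper below iterates both lists and prints bus numbers and vendor/device IDs; it uses only fields declared in this header and is not part of the library.

#include <sys/queue.h>
#include <stdio.h>
#include "vmd.h"

/* Hypothetical debug helper: walk the bus and device lists of one adapter. */
static void
vmd_dump_hierarchy(struct vmd_adapter *vmd)
{
	struct vmd_pci_bus *bus;
	struct vmd_pci_device *dev;

	TAILQ_FOREACH(bus, &vmd->bus_list, tailq) {
		printf("bus %u (primary %u, secondary %u)\n",
		       (unsigned)bus->bus_number,
		       (unsigned)bus->primary_bus,
		       (unsigned)bus->secondary_bus);
		TAILQ_FOREACH(dev, &bus->dev_list, tailq) {
			printf("  device %04x:%04x devfn %u\n",
			       (unsigned)dev->vid, (unsigned)dev->did,
			       (unsigned)dev->devfn);
		}
	}
}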
diff --git a/src/spdk/lib/vmd/vmd_spec.h b/src/spdk/lib/vmd/vmd_spec.h
new file mode 100644
index 000000000..07a4a113d
--- /dev/null
+++ b/src/spdk/lib/vmd/vmd_spec.h
@@ -0,0 +1,473 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef VMD_SPEC_H
+#define VMD_SPEC_H
+
+#define MAX_VMD_SUPPORTED 48 /* max number of VMD controllers in a system */
+#define VMD_DOMAIN_START 0x201D
+
+#define PCI_INVALID_VENDORID 0xFFFF
+#define ONE_MB (1<<20)
+#define PCI_OFFSET_OF(object, member) ((uint32_t)&((object*)0)->member)
+#define TWOS_COMPLEMENT(value) (~(value) + 1)
+
+#define VMD_UPPER_BASE_SIGNATURE 0xFFFFFFEF
+#define VMD_UPPER_LIMIT_SIGNATURE 0xFFFFFFED
+
+/*
+ * BAR assignment constants
+ */
+#define PCI_DWORD_SHIFT 32
+#define PCI_BASE_ADDR_MASK 0xFFFFFFF0
+#define PCI_BAR_MEMORY_MASK 0x0000000F
+#define PCI_BAR_MEMORY_MEM_IND 0x1
+#define PCI_BAR_MEMORY_TYPE 0x6
+#define PCI_BAR_MEMORY_PREFETCH 0x8
+#define PCI_BAR_MEMORY_TYPE_32 0x0
+#define PCI_BAR_MEMORY_TYPE_64 0x4
+#define PCI_BAR_MB_MASK 0xFFFFF
+#define PCI_PCI_BRIDGE_ADDR_DEF 0xFFF0
+#define PCI_BRIDGE_MEMORY_MASK 0xFFF0
+#define PCI_BRIDGE_PREFETCH_64 0x0001
+#define PCI_BRIDGE_MEMORY_SHIFT 16
+#define PCI_CONFIG_ACCESS_DELAY 500
+
+#define PCI_MAX_CFG_SIZE 0x1000
+
+#define PCI_HEADER_TYPE 0x0e
+#define PCI_HEADER_TYPE_NORMAL 0
+#define PCI_HEADER_TYPE_BRIDGE 1
+#define PCI_MULTI_FUNCTION 0x80
+
+#define PCI_COMMAND_MEMORY 0x2
+#define PCI_COMMAND_MASTER 0x4
+
+#define PCIE_TYPE_FLAGS 0xf0
+#define PCIE_TYPE_SHIFT 4
+#define PCIE_TYPE_ROOT_PORT 0x4
+#define PCIE_TYPE_DOWNSTREAM 0x6
+
+#define PCI_CLASS_STORAGE_EXPRESS 0x010802
+#define ADDR_ELEM_COUNT 32
+#define PCI_MAX_BUS_NUMBER 0x7F
+#define RESERVED_HOTPLUG_BUSES 1
+#define isHotPlugCapable(slotCap) ((slotCap) & (1<<6))
+#define CONFIG_OFFSET_ADDR(bus, device, function, reg) (((bus)<<20) | (device)<<15 | (function<<12) | (reg))
+#define BRIDGE_BASEREG(reg) (0xFFF0 & ((reg)>>16))
+
+#define MISCCTRLSTS_0_OFFSET 0x188
+#define ENABLE_ACPI_MODE_FOR_HOTPLUG (1 << 3)
+
+/* Bit encodings for Command Register */
+#define IO_SPACE_ENABLE 0x0001
+#define MEMORY_SPACE_ENABLE 0x0002
+#define BUS_MASTER_ENABLE 0x0004
+
+/* Bit encodings for Status Register */
+#define PCI_CAPABILITIES_LIST 0x0010
+#define PCI_RECEIVED_TARGET_ABORT 0x1000
+#define PCI_RECEIVED_MASTER_ABORT 0x2000
+#define PCI_SIGNALED_SYSTEM_ERROR 0x4000
+#define PCI_DETECTED_PARITY_ERROR 0x8000
+
+/* Capability IDs */
+#define CAPABILITY_ID_POWER_MANAGEMENT 0x01
+#define CAPABILITY_ID_MSI 0x05
+#define CAPABILITY_ID_PCI_EXPRESS 0x10
+#define CAPABILITY_ID_MSIX 0x11
+
+#define PCI_MSIX_ENABLE (1 << 15) /* bit 15 of MSIX Message Control */
+#define PCI_MSIX_FUNCTION_MASK (1 << 14) /* bit 14 of MSIX Message Control */
+
+/* extended capability */
+#define EXTENDED_CAPABILITY_OFFSET 0x100
+#define DEVICE_SERIAL_NUMBER_CAP_ID 0x3
+
+#define BAR_SIZE (1 << 20)
+
+struct pci_enhanced_capability_header {
+ uint16_t capability_id;
+ uint16_t version: 4;
+ uint16_t next: 12;
+};
+
+struct serial_number_capability {
+ struct pci_enhanced_capability_header hdr;
+ uint32_t sn_low;
+ uint32_t sn_hi;
+};
+
+struct pci_header_common {
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t command;
+ uint16_t status;
+ uint32_t rev_class;
+ uint8_t cache_line_size;
+ uint8_t master_lat_timer;
+ uint8_t header_type;
+ uint8_t BIST;
+ uint8_t rsvd12[36];
+ uint8_t cap_pointer;
+ uint8_t rsvd53[7];
+ uint8_t int_line;
+ uint8_t int_pin;
+ uint8_t rsvd62[2];
+};
+
+struct pci_header_zero {
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t command;
+ uint16_t status;
+ uint32_t rev_class;
+ uint8_t cache_line_size;
+ uint8_t master_lat_timer;
+ uint8_t header_type;
+ uint8_t BIST;
+ uint32_t BAR[6];
+ uint32_t carbus_cis_pointer;
+ uint16_t ssvid;
+ uint16_t ssid;
+ uint32_t exp_rom_base_addr;
+ uint8_t cap_pointer;
+ uint8_t rsvd53[7];
+ uint8_t intLine;
+ uint8_t int_pin;
+ uint8_t min_gnt;
+ uint8_t max_lat;
+};
+
+struct pci_header_one {
+ uint16_t vendor_id;
+ uint16_t device_id;
+ uint16_t command;
+ uint16_t status;
+ uint32_t rev_class;
+ uint8_t cache_line_size;
+ uint8_t master_lat_timer;
+ uint8_t header_type;
+ uint8_t BIST;
+ uint32_t BAR[2];
+ uint8_t primary;
+ uint8_t secondary;
+ uint8_t subordinate;
+ uint8_t secondary_lat_timer;
+ uint8_t io_base;
+ uint8_t io_limit;
+ uint16_t secondary_status;
+ uint16_t mem_base;
+ uint16_t mem_limit;
+ uint16_t prefetch_base;
+ uint16_t prefetch_limit;
+ uint32_t prefetch_base_upper;
+ uint32_t prefetch_limit_upper;
+ uint16_t io_base_upper;
+ uint16_t io_limit_upper;
+ uint8_t cap_pointer;
+ uint8_t rsvd53[3];
+ uint32_t exp_romBase_addr;
+ uint8_t int_line;
+ uint8_t int_pin;
+ uint16_t bridge_control;
+};
+
+struct pci_capabilities_header {
+ uint8_t capability_id;
+ uint8_t next;
+};
+
+/*
+ * MSI capability structure for msi interrupt vectors
+ */
+#define MAX_MSIX_TABLE_SIZE 0x800
+#define MSIX_ENTRY_VECTOR_CTRL_MASKBIT 1
+#define PORT_INT_VECTOR 0
+#define CLEAR_MSIX_DESTINATION_ID 0xfff00fff
+struct pci_msi_cap {
+ struct pci_capabilities_header header;
+ union _MsiControl {
+ uint16_t as_uint16_t;
+ struct _PCI_MSI_MESSAGE_CONTROL {
+ uint16_t msi_enable : 1;
+ uint16_t multiple_message_capable : 3;
+ uint16_t multiple_message_enable : 3;
+ uint16_t capable_of_64bits : 1;
+ uint16_t per_vector_mask_capable : 1;
+ uint16_t reserved : 7;
+ } bit;
+ } message_control;
+ union {
+ struct _PCI_MSI_MESSAGE_ADDRESS {
+ uint32_t reserved : 2;
+ uint32_t address : 30;
+ } reg;
+ uint32_t raw;
+ } message_address_lower;
+ union {
+ struct _Option32_bit {
+ uint16_t message_data;
+ } option32_bit;
+ struct _Option64_bit {
+ uint32_t message_address_upper;
+ uint16_t message_data;
+ uint16_t reserved;
+ uint32_t mask_bits;
+ uint32_t pending_bits;
+ } option64_bit;
+ };
+};
+
+struct pcix_table_pointer {
+ union {
+ struct {
+ uint32_t BaseIndexRegister : 3;
+ uint32_t Reserved : 29;
+ } TableBIR;
+ uint32_t TableOffset;
+ };
+};
+
+struct pci_msix_capability {
+ struct pci_capabilities_header header;
+ union _MsixControl {
+ uint16_t as_uint16_t;
+ struct msg_ctrl {
+ uint16_t table_size : 11;
+ uint16_t reserved : 3;
+ uint16_t function_mask : 1;
+ uint16_t msix_enable : 1;
+ } bit;
+ } message_control;
+
+ struct pcix_table_pointer message_table;
+ struct pcix_table_pointer pba_table;
+};
+
+struct pci_msix_table_entry {
+ volatile uint32_t message_addr_lo;
+ volatile uint32_t message_addr_hi;
+ volatile uint32_t message_data;
+ volatile uint32_t vector_control;
+};
+
+/*
+ * PCI Express capability
+ */
+enum PciExpressCapabilities {
+ /* 0001b Legacy PCI Express Endpoint */
+ LegacyEndpoint = 0x1,
+ /* 0000b PCI Express Endpoint */
+ ExpressEndpoint = 0x0,
+ /* 0100b Root Port of PCI Express Root Complex* */
+ RootComplexRootPort = 0x4,
+ /* 0101b Upstream Port of PCI Express Switch* */
+ SwitchUpstreamPort = 0x5,
+ /* 0110b Downstream Port of PCI Express Switch* */
+ SwitchDownStreamPort = 0x6,
+ /* 0111b PCI Express to PCI/PCI-X Bridge* */
+ ExpressToPciBridge = 0x7,
+ /* 1000b PCI/PCI-X to PCI Express Bridge* */
+ PciToExpressBridge = 0x8,
+ /* 1001b Root Complex Integrated Endpoint */
+ RCIntegratedEndpoint = 0x9,
+ /* 1010b Root Complex Event Collector */
+ RootComplexEventCollector = 0xa,
+ InvalidCapability = 0xff
+};
+
+union express_capability_register {
+ struct {
+ uint16_t capability_version : 4;
+ uint16_t device_type : 4;
+ uint16_t slot_implemented : 1;
+ uint16_t interrupt_message_number : 5;
+ uint16_t rsv : 2;
+ } bit_field;
+ uint16_t as_uint16_t;
+};
+
+union express_slot_capabilities_register {
+ struct {
+ uint32_t attention_button_present : 1;
+ uint32_t power_controller_present : 1;
+ uint32_t MRL_sensor_present : 1;
+ uint32_t attention_indicator_present : 1;
+ uint32_t power_indicator_present : 1;
+ uint32_t hotplug_surprise : 1;
+ uint32_t hotplug_capable : 1;
+ uint32_t slot_power_limit : 8;
+ uint32_t slotPower_limit_scale : 2;
+ uint32_t electromechanical_lock_present : 1;
+ uint32_t no_command_completed_support : 1;
+ uint32_t physical_slot_number : 13;
+ } bit_field;
+ uint32_t as_uint32_t;
+};
+
+union express_slot_control_register {
+ struct {
+ uint16_t attention_button_enable : 1;
+ uint16_t power_fault_detect_enable : 1;
+ uint16_t MRLsensor_enable : 1;
+ uint16_t presence_detect_enable : 1;
+ uint16_t command_completed_enable : 1;
+ uint16_t hotplug_interrupt_enable : 1;
+ uint16_t attention_indicator_control : 2;
+ uint16_t power_indicator_control : 2;
+ uint16_t power_controller_control : 1;
+ uint16_t electromechanical_lockcontrol : 1;
+ uint16_t datalink_state_change_enable : 1;
+ uint16_t Rsvd : 3;
+ } bit_field;
+ uint16_t as_uint16_t;
+};
+
+union express_slot_status_register {
+ struct {
+ uint16_t attention_button_pressed : 1;
+ uint16_t power_fault_detected : 1;
+ uint16_t MRL_sensor_changed : 1;
+ uint16_t presence_detect_changed : 1;
+ uint16_t command_completed : 1;
+ uint16_t MRL_sensor_state : 1;
+ uint16_t presence_detect_state : 1;
+ uint16_t electromechanical_lock_engaged : 1;
+ uint16_t datalink_state_changed : 1;
+ uint16_t rsvd : 7;
+ } bit_field;
+ uint16_t as_uint16_t;
+};
+
+union express_root_control_register {
+ struct {
+ uint16_t CorrectableSerrEnable : 1;
+ uint16_t NonFatalSerrEnable : 1;
+ uint16_t FatalSerrEnable : 1;
+ uint16_t PMEInterruptEnable : 1;
+ uint16_t CRSSoftwareVisibilityEnable : 1;
+ uint16_t Rsvd : 11;
+ } bit_field;
+ uint16_t as_uint16_t;
+};
+
+union express_link_capability_register {
+ struct {
+ uint32_t maximum_link_speed : 4;
+ uint32_t maximum_link_width : 6;
+ uint32_t active_state_pms_support : 2;
+ uint32_t l0_exit_latency : 3;
+ uint32_t l1_exit_latency : 3;
+ uint32_t clock_power_management : 1;
+ uint32_t surprise_down_error_reporting_capable : 1;
+ uint32_t datalink_layer_active_reporting_capable : 1;
+ uint32_t link_bandwidth_notification_capability : 1;
+ uint32_t aspm_optionality_compliance : 1;
+ uint32_t rsvd : 1;
+ uint32_t port_number : 8;
+ } bit_field;
+ uint32_t as_uint32_t;
+};
+
+union express_link_control_register {
+ struct {
+ uint16_t active_state_pm_control : 2;
+ uint16_t rsvd1 : 1;
+ uint16_t read_completion_boundary : 1;
+ uint16_t link_disable : 1;
+ uint16_t retrain_link : 1;
+ uint16_t common_clock_config : 1;
+ uint16_t extended_synch : 1;
+ uint16_t enable_clock_power_management : 1;
+ uint16_t rsvd2 : 7;
+ } bit_field;
+ uint16_t as_uint16_t;
+};
+
+union express_link_status_register {
+ struct {
+ uint16_t link_speed : 4;
+ uint16_t link_width : 6;
+ uint16_t undefined : 1;
+ uint16_t link_training : 1;
+ uint16_t slot_clock_config : 1;
+ uint16_t datalink_layer_active : 1;
+ uint16_t asvd : 2;
+ } bit_field;
+ uint16_t as_uint16_t;
+};
+
+struct pci_express_cap {
+ uint8_t capid;
+ uint8_t next_cap;
+ union express_capability_register express_cap_register;
+ uint32_t device_cap;
+ uint16_t device_control;
+ uint16_t device_status;
+ union express_link_capability_register link_cap;
+ union express_link_control_register link_control;
+ union express_link_status_register link_status;
+ union express_slot_capabilities_register slot_cap;
+ union express_slot_control_register slot_control;
+ union express_slot_status_register slot_status;
+ uint32_t root_status;
+ uint32_t deviceCap2;
+ uint16_t deviceControl2;
+ uint16_t deviceStatus2;
+ uint32_t linkCap2;
+ uint16_t linkControl2;
+ uint16_t linkStatus2;
+ uint32_t slotCap2;
+ uint16_t slotControl2;
+ uint16_t slotStatus2;
+};
+
+struct pci_msix_cap {
+ uint8_t cap_idd;
+ uint8_t next_cap;
+ uint16_t msg_control_reg;
+ uint32_t msix_table_offset;
+ uint32_t pba_offset;
+};
+
+struct pci_header {
+ union {
+ struct pci_header_common common;
+ struct pci_header_zero zero;
+ struct pci_header_one one;
+ };
+};
+
+#endif /* VMD_SPEC_H */
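vmd_spec.h captures the PCI/PCIe register layouts the driver manipulates directly: config-space headers, capability structures, bit-field unions for the Express slot and link registers, and helper macros such as CONFIG_OFFSET_ADDR() for addressing a function's config space inside the VMD CFGBAR. The standalone sketch below shows how those definitions decode raw values; the bus/device/function/register arguments and the 0x0048 status word are made-up inputs, not values read from hardware.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include "vmd_spec.h"

int
main(void)
{
	/* Offset of config register 0x10 for bus 1, device 0, function 0
	 * inside the VMD config BAR, as computed by the driver's macro. */
	uint32_t offset = CONFIG_OFFSET_ADDR(1, 0, 0, 0x10);

	/* Decode a (made-up) slot status word through the bit-field union. */
	union express_slot_status_register slot_status;

	slot_status.as_uint16_t = 0x0048;	/* presence detect changed + device present */

	printf("config offset: 0x%" PRIx32 "\n", offset);
	if (slot_status.bit_field.presence_detect_changed) {
		printf("hot-plug event: device %s\n",
		       slot_status.bit_field.presence_detect_state ?
		       "inserted" : "removed");
	}

	return 0;
}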